In [None]:
import torch
import torchtext
from torchtext import data
import torch.optim as optim
import torch.nn as nn
import numpy as np
import argparse
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Dataset of token-index sequences and labels read from ``data/<split>.tsv``.

    The TSV file must contain a ``text`` column (whitespace-separated tokens)
    and a ``label`` column.
    """

    def __init__(self, vocab, split="train"):
        """Read one split and turn every sentence into vocabulary indices.

        Args:
            vocab: object exposing ``vectors`` (its length is taken as the
                vocabulary size) and ``stoi`` (a token -> index mapping
                supporting ``.get``).
            split: stem of the TSV file to read from the ``data`` directory.
        """
        frame = pd.read_csv(os.path.join("data", f"{split}.tsv"), sep="\t")

        # Words missing from the vocabulary fall back to its last entry.
        oov_index = len(vocab.vectors) - 1

        sequences, labels = [], []
        for _, record in frame.iterrows():
            token_ids = [vocab.stoi.get(token, oov_index) for token in record["text"].split()]
            sequences.append(torch.tensor(token_ids))
            labels.append(record.label)

        # Sequences stay a plain list because their lengths differ.
        self.X = sequences
        self.Y = torch.tensor(labels)

    def __len__(self):
        """Number of examples in the split."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(token_indices, label)`` pair stored at ``idx``."""
        return self.X[idx], self.Y[idx]

In [None]:
def my_collate_function(batch, device):
    """Pad a list of ``(token_ids, label)`` pairs into batch tensors.

    Args:
        batch: sequence of ``(x, y)`` pairs where ``x`` is a 1-D tensor of
            token indices and ``y`` is a scalar label.
        device: device the returned tensors are moved to.

    Returns:
        A tuple ``(tokens, labels)``. ``tokens`` is an ``int32`` tensor of
        shape ``(max_len, len(batch))`` in which shorter sequences are
        right-padded with index 0; ``labels`` is a 1-D tensor of the labels.
    """
    max_len = max((len(x) for x, _ in batch), default=0)

    # Allocate the padded matrix as integers up front. Concatenating integer
    # indices with float zeros promotes them to float32, which cannot represent
    # indices above 2**24 exactly (and is rejected by older PyTorch releases).
    padded = torch.zeros((len(batch), max_len), dtype=torch.long)
    for row, (x, _) in enumerate(batch):
        padded[row, :len(x)] = x

    labels = torch.tensor([y for _, y in batch])
    # Transpose so the sequence axis comes first, as the model expects.
    return padded.t().int().to(device), labels.to(device)

In [None]:
class CNNmodel(torch.nn.Module):
    """Two-branch CNN sentence classifier on top of pretrained word vectors.

    Each branch convolves over the embedded sentence with kernels spanning
    ``k_size[i]`` consecutive tokens and the full embedding width, applies
    ReLU and max-over-time pooling; the pooled features of both branches are
    concatenated and mapped to a single logit.
    """

    def __init__(self,vocab,embedding_dim,num_filters,k_size, freeze):
        """Build the embedding, the two convolution branches and the output layer.

        Args:
            vocab: object whose ``vectors`` tensor initialises the embedding.
            embedding_dim: width of the convolution kernels; must equal the
                width of ``vocab.vectors``.
            num_filters: number of filters in each convolution branch.
            k_size: pair of kernel heights (tokens covered by each branch).
            freeze: when True the embedding weights are not trained.
        """
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(vocab.vectors, freeze=freeze)
        self.conv1 = nn.Conv2d(1, num_filters, kernel_size=(k_size[0], embedding_dim), bias=False)
        self.conv2 = nn.Conv2d(1, num_filters, kernel_size=(k_size[1], embedding_dim), bias=False)
        self.linear1 = nn.Linear(num_filters * 2, 1)

    @staticmethod
    def _max_over_time(conv_out):
        """ReLU followed by a max over every sequence position of one branch."""
        activated = F.relu(conv_out.squeeze(3))  # (bsz, num_filters, positions)
        # Pool with a window covering all positions, then drop only that axis so
        # a batch of one sample keeps its batch dimension.
        return F.max_pool1d(activated, activated.size(2)).squeeze(2)

    def forward(self, x, length=None, k_size=None):
        """Compute one logit per sentence.

        Args:
            x: integer tensor of token indices with shape ``(seq_len, bsz)``.
            length: accepted for backward compatibility and ignored; the pooling
                window is derived from the convolution output itself.
            k_size: accepted for backward compatibility and ignored.

        Returns:
            Tensor of shape ``(bsz,)`` holding the raw (pre-sigmoid) logits.
        """
        embedded = self.embedding(x)                       # (seq_len, bsz, emb)
        embedded = embedded.permute(1, 0, 2).unsqueeze(1)  # (bsz, 1, seq_len, emb)
        pooled = [self._max_over_time(conv(embedded)) for conv in (self.conv1, self.conv2)]
        features = torch.cat(pooled, dim=1)                # (bsz, 2 * num_filters)
        return self.linear1(features).squeeze(1)

In [None]:
def evaluate(model,batch,criterion, batch_size, k_size):
    """Compute the mean batch loss and the accuracy of ``model`` over a loader.

    Args:
        model: callable invoked as ``model(text, length, k_size)``; it is put
            into eval mode before the pass.
        batch: iterable yielding ``(text, label)`` pairs, e.g. a DataLoader.
        criterion: loss applied to ``(logits, float_labels)``.
        batch_size: kept for backward compatibility; accuracy is normalised by
            the number of labels actually seen, so a shorter final batch no
            longer lowers the reported value.
        k_size: forwarded unchanged to the model.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is the float average of the
        per-batch losses and ``accuracy`` is a 0-d CPU tensor holding the
        fraction of correctly classified examples.

    Raises:
        ValueError: if the loader yields no examples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0.0
    seen = 0
    num_batches = 0
    with torch.no_grad():
        for text, label in batch:
            logits = model(text, len(text), k_size)
            label = label.type(torch.float)
            loss_sum += float(criterion(logits, label))
            # A positive logit means predicted class 1.
            correct += float(((logits > 0) == label).float().sum())
            seen += label.numel()
            num_batches += 1
    if seen == 0:
        raise ValueError("evaluate() received a data loader with no examples")
    return loss_sum / num_batches, torch.tensor(correct / seen)

In [None]:
def CNN_train(batch_size, epochs, lr, num_filters, embedding_dim, dataset, k_size, freeze):
    """Train a CNNmodel, validate after every epoch, test once, and plot curves.

    Args:
        batch_size: number of examples per batch for all three loaders.
        epochs: number of passes over the training split.
        lr: learning rate for Adam.
        num_filters: filters per convolution branch.
        embedding_dim: width of the GloVe vectors to load and of the conv kernels.
        dataset: name of the training split (file stem under ``data``).
        k_size: pair of kernel heights for the two convolution branches.
        freeze: when True the pretrained embeddings are kept fixed.

    Returns:
        The trained model.
    """
    torch.manual_seed(2)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print ("Using device:", device)
    # Load vectors whose width matches the convolution kernels instead of a
    # hard-coded 100, so other values of embedding_dim work as well.
    glove = torchtext.vocab.GloVe(name="6B", dim=embedding_dim)

    def make_loader(split):
        """Build an unshuffled loader that pads batches and moves them to device."""
        # NOTE(review): training batches are not shuffled either; kept as in the
        # original so existing results stay reproducible.
        return torch.utils.data.DataLoader(
            dataset=TextDataset(glove, split),
            batch_size=batch_size,
            shuffle=False,
            collate_fn=lambda batch: my_collate_function(batch, device))

    train_dataloader = make_loader(dataset)
    validation_dataloader = make_loader("validation")
    test_dataloader = make_loader("test")

    model = CNNmodel(glove, embedding_dim, num_filters, k_size, freeze)
    model = model.to(device)
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr)

    train_loss_list = []
    train_acc_list = []
    val_loss_list = []
    val_acc_list = []

    epoch_r = np.arange(0, epochs)
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0.0
        seen = 0
        num_batches = 0
        for text, label in train_dataloader:
            logits = model(text, len(text), k_size)
            label = label.type(torch.float)
            loss = criterion(logits, label)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Count correct predictions and examples so that a shorter final
            # batch does not bias the epoch accuracy.
            correct += float(((logits > 0) == label).float().sum())
            seen += label.numel()
            running_loss += float(loss)
            num_batches += 1
        train_loss_list.append(running_loss / num_batches)
        train_acc_list.append(correct / seen)

        val_loss, val_acc = evaluate(model, validation_dataloader, criterion, batch_size, k_size)
        val_loss_list.append(val_loss)
        val_acc_list.append(float(val_acc))

        print("Epoch: [{}]| Train acc:  {:.4f} | Train loss:  {:.4f} |  Valid acc:  {:.4f} |  Valid loss:  {:.4f} "
          .format(epoch + 1, train_acc_list[epoch], train_loss_list[epoch],val_acc_list[epoch],
                  val_loss_list[epoch]))

    _, test_acc = evaluate(model, test_dataloader, criterion, batch_size, k_size)
    print('Test Accuracy:', float(test_acc))

    plt.title("Loss Curve")
    plt.plot(epoch_r, train_loss_list, label="Train")
    plt.plot(epoch_r, val_loss_list, label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

    plt.title("Accuracy Curve")
    plt.plot(epoch_r, train_acc_list, label="Train")
    plt.plot(epoch_r, val_acc_list, label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

    return model

In [None]:
def CNN_overfit(batch_size, epochs, lr, num_filters, embedding_dim, dataset, k_size, freeze):
    """Train a CNNmodel on a single split with no validation or test pass.

    Prints the training loss and accuracy after every epoch and plots both
    curves at the end.

    Args:
        batch_size: number of examples per batch.
        epochs: number of passes over the split.
        lr: learning rate for Adam.
        num_filters: filters per convolution branch.
        embedding_dim: width of the GloVe vectors to load and of the conv kernels.
        dataset: name of the split to train on (file stem under ``data``).
        k_size: pair of kernel heights for the two convolution branches.
        freeze: when True the pretrained embeddings are kept fixed.

    Returns:
        The trained model.
    """
    torch.manual_seed(2)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print ("Using device:", device)
    # Load vectors whose width matches the convolution kernels instead of a
    # hard-coded 100, so other values of embedding_dim work as well.
    glove = torchtext.vocab.GloVe(name="6B", dim=embedding_dim)

    train_dataloader = torch.utils.data.DataLoader(
        dataset=TextDataset(glove, dataset),
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda batch: my_collate_function(batch, device))

    model = CNNmodel(glove, embedding_dim, num_filters, k_size, freeze)
    model = model.to(device)
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr)

    train_loss_list = []
    train_acc_list = []

    epoch_r = np.arange(0, epochs)
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0.0
        seen = 0
        num_batches = 0
        for text, label in train_dataloader:
            logits = model(text, len(text), k_size)
            label = label.type(torch.float)
            loss = criterion(logits, label)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Count correct predictions and examples so that a shorter final
            # batch does not bias the epoch accuracy.
            correct += float(((logits > 0) == label).float().sum())
            seen += label.numel()
            running_loss += float(loss)
            num_batches += 1
        train_loss_list.append(running_loss / num_batches)
        train_acc_list.append(correct / seen)

        print("Epoch: [{}]| Train acc:  {:.4f} | Train loss:  {:.4f} "
          .format(epoch + 1, train_acc_list[epoch], train_loss_list[epoch]))

    plt.title("Loss Curve")
    plt.plot(epoch_r, train_loss_list, label="Train")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

    plt.title("Accuracy Curve")
    plt.plot(epoch_r, train_acc_list, label="Train")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

    return model

In [None]:
# Train on the "overfit" split only; CNN_overfit runs no validation or test pass.
overfit_model = CNN_overfit(
    batch_size=2,
    epochs=15,
    lr=0.001,
    num_filters=50,
    embedding_dim=100,
    dataset="overfit",
    k_size=[2, 4],
    freeze=True,
)

In [None]:
# Full training run with the pretrained embeddings kept frozen.
model = CNN_train(
    batch_size=32,
    epochs=15,
    lr=0.0005,
    num_filters=50,
    embedding_dim=100,
    dataset="train",
    k_size=[2, 4],
    freeze=True,
)

In [None]:
# Same configuration, but with freeze=False so the embeddings are fine-tuned.
# This rebinds `model`, so the cells that follow inspect the fine-tuned network.
model = CNN_train(
    batch_size=32,
    epochs=15,
    lr=0.0005,
    num_filters=50,
    embedding_dim=100,
    dataset="train",
    k_size=[2, 4],
    freeze=False,
)

In [None]:
# Notebook-level GloVe vectors, read by print_closest_cosine_words.
glove = torchtext.vocab.GloVe(
    name="6B",
    dim=100,
)

In [None]:
# conv1.weight has shape (num_filters, 1, k_size[0], embedding_dim). Drop only
# the singleton input-channel axis so the result is always
# (num_filters, k_size[0], embedding_dim), even when num_filters or the kernel
# height happens to be 1 (a bare squeeze() would collapse those axes too).
a = model.conv1.weight.squeeze(1).detach().cpu()

In [None]:
# Average each conv1 filter over axis 1 of `a`.
b = a.mean(dim=1)

In [None]:
# Average of `b` over its first axis (across the conv1 filters).
avg_conv1 = b.mean(dim=0)

In [None]:
# conv2.weight has shape (num_filters, 1, k_size[1], embedding_dim). Drop only
# the singleton input-channel axis so the result is always
# (num_filters, k_size[1], embedding_dim), even when num_filters or the kernel
# height happens to be 1 (a bare squeeze() would collapse those axes too).
d = model.conv2.weight.squeeze(1).detach().cpu()

In [None]:
# Average each conv2 filter over axis 1 of `d`.
e = d.mean(dim=1)

In [None]:
# Average of `e` over its first axis (across the conv2 filters).
avg_conv2 = e.mean(dim=0)

In [None]:
def print_closest_cosine_words(vec, n=5, vocab=None, skip_first=False):
    """Print the ``n`` vocabulary words whose vectors are most similar to ``vec``.

    Args:
        vec: 1-D query tensor with the same width as the vocabulary vectors.
        n: number of words to print.
        vocab: object exposing ``vectors`` and ``itos``; defaults to the
            notebook-level ``glove``.
        skip_first: set to True when ``vec`` is itself a vocabulary vector, so
            the trivial best match (the word itself) is left out. The default
            keeps the best match, which is what an arbitrary query such as a
            convolution filter needs.

    Each output line holds the word and its cosine similarity, best match first.
    """
    if vocab is None:
        vocab = glove

    sims = torch.nn.functional.cosine_similarity(vocab.vectors, vec.unsqueeze(0), dim=1)

    # Only the leading entries are needed, so ask for those directly rather
    # than sorting the whole vocabulary in Python.
    start = 1 if skip_first else 0
    k = max(0, min(n + start, sims.numel()))
    top_sims, top_idx = torch.topk(sims, k)

    for idx, sim in zip(top_idx[start:].tolist(), top_sims[start:].tolist()):
        print(vocab.itos[idx], "\t%5.2f" % sim)

In [None]:
# Words closest to the average of all conv1 filters.
print_closest_cosine_words(avg_conv1, n=5)

In [None]:
# Closest words for every row of `b`, one group of output per row.
for item in b.unbind(0):
    print_closest_cosine_words(item)
    print("\n")

In [None]:
# Words closest to the average of all conv2 filters.
print_closest_cosine_words(avg_conv2, n=5)

In [None]:
# Closest words for every row of `e`, one group of output per row.
for item in e.unbind(0):
    print_closest_cosine_words(item)
    print("\n")

In [None]:
# Persist the weights of the most recently trained `model`.
torch.save(obj=model.state_dict(), f='model_cnn.pt')