In [None]:
from tqdm import tqdm

def create_emb_matrix(embedding_dim=100):
    glove = pd.read_csv(f'data/glove/glove.6B.{embedding_dim}d.txt', sep=" ", quoting=3, header=None, index_col=0)
    vocab = {'<pad>': 0, '<unk>': 1}
    embeddings = np.zeros((len(glove) + 2, embedding_dim))
    embeddings[0] = np.zeros(embedding_dim)
    embeddings[1] = np.zeros(embedding_dim)

    for index, (key, val) in tqdm(enumerate(glove.T.items())):
        vocab[key] = index + 2
        embeddings[index+2] = val.to_numpy()

    return vocab, embeddings

In [None]:
vocab, emb_matrix = create_emb_matrix()

In [75]:
from kogito.core.relation import PHYSICAL_RELATIONS, SOCIAL_RELATIONS, EVENT_RELATIONS
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset
from torch.optim import Adam
from tqdm import tqdm
from torch import nn
from torchtext.vocab import GloVe
import wandb
import spacy
from torch.nn.utils.rnn import pad_sequence

def load_data(datapath):
    data = []
    head_label_set = set()

    with open(datapath) as f:
        for line in f:
            try:
                head, relation, _ = line.split('\t')

                label = 0 

                if relation in EVENT_RELATIONS:
                    label = 1
                elif relation in SOCIAL_RELATIONS:
                    label = 2

                if (head, label) not in head_label_set:
                    data.append((head, label))
                    head_label_set.add((head, label))
            except:
                pass

    return pd.DataFrame(data, columns=['text', 'label'])
    

class HeadDataset(Dataset):
    def __init__(self, df, vocab):
        nlp = spacy.load("en_core_web_sm")
        self.labels = df['label'].to_numpy()
        self.texts = pad_sequence([torch.tensor([vocab.get(token.text, 1) for token in nlp(text)], dtype=torch.int) for text in df['text']], batch_first=True)

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]



class MaxPool(nn.Module):
    def forward(self, X):
        values, _ = torch.max(X, dim=1)
        return values


class AvgPool(nn.Module):
    def forward(self, X):
        return torch.mean(X, dim=1)


class SWEMClassifier(nn.Module):

    def __init__(self, hidden_dim=128, num_classes=3, pooling="max", embedding_matrix=None, freeze_emb=True):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=embedding_matrix.shape[0],
                                      embedding_dim=embedding_matrix.shape[1]).from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=freeze_emb)
        self.pool = MaxPool() if pooling == "max" else AvgPool()
        self.linear1 = nn.Linear(embedding_matrix.shape[1], hidden_dim)
        self.activation1 = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, X):
        outputs = self.embedding(X)
        outputs = self.pool(outputs)
        outputs = self.linear1(outputs)
        outputs = self.activation1(outputs)
        outputs = self.linear2(outputs)
        outputs = self.softmax(outputs)

        return outputs
    
    def save_pretrained(self, path):
        torch.save(self, path)


def train(model, train_dataset, val_dataset, learning_rate=1e-3, epochs=10, batch_size=8):
    # wandb.init(project="kogito-relation-matcher", config={"learning_rate": learning_rate, "epochs": epochs, "batch_size": batch_size})

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.NLLLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        print("Using CUDA")
        model = model.to(device)
        criterion = criterion.to(device)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            model.zero_grad()

            train_label = train_label.to(device)
            X = train_input.to(device)

            output = model(X)
            
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            batch_loss.backward()
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                X = val_input.to(device)

                output = model(X)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        train_loss = total_loss_train / len(train_data)
        train_acc = total_acc_train / len(train_data)
        val_loss = total_loss_val / len(val_data)
        val_acc = total_acc_val / len(val_data)

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {train_loss: .3f} \
            | Train Accuracy: {train_acc: .3f} \
            | Val Loss: {val_loss: .3f} \
            | Val Accuracy: {val_acc: .3f}')
        
        # wandb.log({"train_loss": train_loss, "train_accuracy": train_acc, "val_loss": val_loss, "val_accuracy": val_acc})
        # model.save_pretrained(f"./models/checkpoint_{epoch_num}.pth")

In [65]:
train_df = load_data("data/atomic2020_data-feb2021/train.tsv")
dev_df = load_data("data/atomic2020_data-feb2021/dev.tsv")
train_data = HeadDataset(train_df, vocab=vocab)
val_data = HeadDataset(dev_df, vocab=vocab)
# model.save_pretrained("./models/final_model.pth")

100%|██████████| 843/843 [00:01<00:00, 580.67it/s]


Epochs: 1 | Train Loss:  0.014             | Train Accuracy:  0.563             | Val Loss:  0.014             | Val Accuracy:  0.502


100%|██████████| 843/843 [00:01<00:00, 574.14it/s]


Epochs: 2 | Train Loss:  0.013             | Train Accuracy:  0.594             | Val Loss:  0.014             | Val Accuracy:  0.502


In [76]:
model = SWEMClassifier(embedding_matrix=emb_matrix, pooling="avg", freeze_emb=False)
train(model=model, train_dataset=train_data, val_dataset=val_data, epochs=20, batch_size=64)

100%|██████████| 843/843 [05:32<00:00,  2.54it/s]


Epochs: 1 | Train Loss:  0.010             | Train Accuracy:  0.628             | Val Loss:  0.011             | Val Accuracy:  0.531


  5%|▌         | 46/843 [00:18<05:13,  2.54it/s]

In [70]:
X, y = train_data[0]
X

tensor([    1, 30417,    99,    99,    99,  7505,     0,     0,     0,     0,
            0,     0,     0,     0,     0], dtype=torch.int32)

In [71]:
emb_matrix[30417]

array([ 0.22953  ,  0.35885  ,  0.58239  ,  0.25259  , -0.30344  ,
        0.0051236, -0.28178  ,  0.41135  ,  0.47261  ,  0.82356  ,
        0.23426  ,  1.0476   , -0.291    ,  0.047954 , -0.09221  ,
        0.10336  , -0.14871  ,  0.055016 ,  0.53798  ,  0.066848 ,
       -0.40651  , -0.26202  ,  0.063933 ,  0.05969  ,  0.003493 ,
        0.79334  , -0.95705  , -0.40116  ,  0.1664   , -0.38669  ,
       -0.55154  ,  1.2724   , -0.36677  ,  0.020613 , -0.33584  ,
       -0.018895 , -0.094397 , -1.183    , -0.25509  , -0.071633 ,
       -0.597    ,  0.41676  ,  0.44958  ,  0.17139  , -0.25743  ,
        0.18268  , -0.42038  ,  0.49184  ,  0.26618  ,  0.20252  ,
        0.03696  ,  0.36088  ,  0.34116  ,  0.226    , -0.01369  ,
       -0.35573  , -0.023686 , -0.21707  , -0.18331  , -0.74595  ,
        0.4819   , -0.45127  ,  0.40188  , -0.16615  , -0.0043198,
       -0.15258  , -0.026358 ,  0.45227  , -0.44518  ,  0.303    ,
        0.005681 , -0.55374  , -0.73395  , -0.17064  , -0.0750

In [73]:
out = model.embedding(X)
out[1]

tensor([ 0.2295,  0.3589,  0.5824,  0.2526, -0.3034,  0.0051, -0.2818,  0.4114,
         0.4726,  0.8236,  0.2343,  1.0476, -0.2910,  0.0480, -0.0922,  0.1034,
        -0.1487,  0.0550,  0.5380,  0.0668, -0.4065, -0.2620,  0.0639,  0.0597,
         0.0035,  0.7933, -0.9571, -0.4012,  0.1664, -0.3867, -0.5515,  1.2724,
        -0.3668,  0.0206, -0.3358, -0.0189, -0.0944, -1.1830, -0.2551, -0.0716,
        -0.5970,  0.4168,  0.4496,  0.1714, -0.2574,  0.1827, -0.4204,  0.4918,
         0.2662,  0.2025,  0.0370,  0.3609,  0.3412,  0.2260, -0.0137, -0.3557,
        -0.0237, -0.2171, -0.1833, -0.7459,  0.4819, -0.4513,  0.4019, -0.1662,
        -0.0043, -0.1526, -0.0264,  0.4523, -0.4452,  0.3030,  0.0057, -0.5537,
        -0.7340, -0.1706, -0.0751, -0.2147, -0.2861, -0.5471, -0.4599, -0.0694,
        -0.4348,  0.1473,  0.1094,  1.0290,  0.0571,  0.0772,  0.1650,  0.4730,
         0.1467, -0.0566, -0.3175,  0.2064,  0.2928,  0.1097, -0.2927, -0.2505,
        -0.1861,  0.1724, -0.0037,  0.35