In [1]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset
from torch.optim import Adam
from tqdm import tqdm
from torch import nn
import wandb
import spacy
from torch.nn.utils.rnn import pad_sequence

def create_emb_matrix(embedding_dim=100):
    glove = pd.read_csv(f'data/glove/glove.6B.{embedding_dim}d.txt', sep=" ", quoting=3, header=None, index_col=0)
    vocab = {'<pad>': 0, '<unk>': 1}
    embeddings = np.zeros((len(glove) + 2, embedding_dim))
    embeddings[0] = np.zeros(embedding_dim)
    embeddings[1] = np.zeros(embedding_dim)

    for index, (key, val) in tqdm(enumerate(glove.T.items())):
        vocab[key] = index + 2
        embeddings[index+2] = val.to_numpy()

    return vocab, embeddings
    

class HeadDataset(Dataset):
    def __init__(self, df, vocab):
        nlp = spacy.load("en_core_web_sm")
        self.labels = df['label'].to_numpy()
        self.texts = pad_sequence([torch.tensor([vocab.get(token.text, 1) for token in nlp(text)], dtype=torch.int) for text in df['text']], batch_first=True)

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]



class MaxPool(nn.Module):
    def forward(self, X):
        values, _ = torch.max(X, dim=1)
        return values


class AvgPool(nn.Module):
    def forward(self, X):
        return torch.mean(X, dim=1)


class SWEMClassifier(nn.Module):

    def __init__(self, num_classes=3, pooling="max", embedding_matrix=None, freeze_emb=True):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=embedding_matrix.shape[0],
                                      embedding_dim=embedding_matrix.shape[1]).from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=freeze_emb)
        self.pool = MaxPool() if pooling == "max" else AvgPool()
        self.linear = nn.Linear(embedding_matrix.shape[1], num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, X):
        outputs = self.embedding(X)
        outputs = self.pool(outputs)
        outputs = self.linear(outputs)
        outputs = self.softmax(outputs)

        return outputs
    
    def save_pretrained(self, path):
        torch.save(self, path)


def train(model, train_dataset, val_dataset, learning_rate=1e-3, epochs=10, batch_size=8):
    # wandb.init(project="kogito-relation-matcher", config={"learning_rate": learning_rate, "epochs": epochs, "batch_size": batch_size})

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.NLLLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        print("Using CUDA")

    model = model.to(device)
    criterion = criterion.to(device)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            model.zero_grad()

            train_label = train_label.to(device)
            X = train_input.to(device)

            output = model(X)
            
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            batch_loss.backward()
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                X = val_input.to(device)

                output = model(X)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        train_loss = total_loss_train / len(train_dataset)
        train_acc = total_acc_train / len(train_dataset)
        val_loss = total_loss_val / len(val_dataset)
        val_acc = total_acc_val / len(val_dataset)

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {train_loss: .3f} \
            | Train Accuracy: {train_acc: .3f} \
            | Val Loss: {val_loss: .3f} \
            | Val Accuracy: {val_acc: .3f}')
        
        # wandb.log({"train_loss": train_loss, "train_accuracy": train_acc, "val_loss": val_loss, "val_accuracy": val_acc})
        # model.save_pretrained(f"./models/checkpoint_{epoch_num}.pth")

  from .autonotebook import tqdm as notebook_tqdm
2022-03-22 01:11:18.020344: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-22 01:11:18.020382: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
vocab, emb_matrix = create_emb_matrix()

400000it [00:11, 35416.62it/s]


In [3]:
from relation_modeling_utils import load_data

train_df = load_data("data/atomic2020_data-feb2021/train.tsv")
dev_df = load_data("data/atomic2020_data-feb2021/dev.tsv")
train_data = HeadDataset(train_df, vocab=vocab)
val_data = HeadDataset(dev_df, vocab=vocab)

In [27]:
model = SWEMClassifier(embedding_matrix=emb_matrix, pooling="max", freeze_emb=False)
train(model=model, train_dataset=train_data, val_dataset=val_data, epochs=20, batch_size=128, learning_rate=1e-4)

Using CUDA


100%|██████████| 289/289 [00:04<00:00, 61.24it/s]


Epochs: 1 | Train Loss:  0.007             | Train Accuracy:  0.610             | Val Loss:  0.005             | Val Accuracy:  0.831


100%|██████████| 289/289 [00:04<00:00, 61.06it/s]


Epochs: 2 | Train Loss:  0.005             | Train Accuracy:  0.794             | Val Loss:  0.004             | Val Accuracy:  0.834


100%|██████████| 289/289 [00:04<00:00, 60.79it/s]


Epochs: 3 | Train Loss:  0.004             | Train Accuracy:  0.823             | Val Loss:  0.004             | Val Accuracy:  0.846


100%|██████████| 289/289 [00:04<00:00, 60.82it/s]


Epochs: 4 | Train Loss:  0.004             | Train Accuracy:  0.838             | Val Loss:  0.003             | Val Accuracy:  0.852


100%|██████████| 289/289 [00:04<00:00, 60.92it/s]


Epochs: 5 | Train Loss:  0.003             | Train Accuracy:  0.846             | Val Loss:  0.003             | Val Accuracy:  0.854


100%|██████████| 289/289 [00:04<00:00, 60.97it/s]


Epochs: 6 | Train Loss:  0.003             | Train Accuracy:  0.850             | Val Loss:  0.003             | Val Accuracy:  0.857


100%|██████████| 289/289 [00:04<00:00, 61.54it/s]


Epochs: 7 | Train Loss:  0.003             | Train Accuracy:  0.853             | Val Loss:  0.003             | Val Accuracy:  0.861


100%|██████████| 289/289 [00:04<00:00, 61.56it/s]


Epochs: 8 | Train Loss:  0.003             | Train Accuracy:  0.855             | Val Loss:  0.003             | Val Accuracy:  0.861


100%|██████████| 289/289 [00:04<00:00, 61.75it/s]


Epochs: 9 | Train Loss:  0.003             | Train Accuracy:  0.856             | Val Loss:  0.003             | Val Accuracy:  0.861


100%|██████████| 289/289 [00:04<00:00, 61.73it/s]


Epochs: 10 | Train Loss:  0.003             | Train Accuracy:  0.858             | Val Loss:  0.003             | Val Accuracy:  0.861


100%|██████████| 289/289 [00:04<00:00, 61.65it/s]


Epochs: 11 | Train Loss:  0.003             | Train Accuracy:  0.859             | Val Loss:  0.003             | Val Accuracy:  0.863


100%|██████████| 289/289 [00:04<00:00, 61.33it/s]


Epochs: 12 | Train Loss:  0.003             | Train Accuracy:  0.860             | Val Loss:  0.003             | Val Accuracy:  0.863


100%|██████████| 289/289 [00:04<00:00, 61.13it/s]


Epochs: 13 | Train Loss:  0.003             | Train Accuracy:  0.861             | Val Loss:  0.003             | Val Accuracy:  0.863


100%|██████████| 289/289 [00:04<00:00, 61.02it/s]


Epochs: 14 | Train Loss:  0.003             | Train Accuracy:  0.862             | Val Loss:  0.003             | Val Accuracy:  0.859


100%|██████████| 289/289 [00:04<00:00, 60.72it/s]


Epochs: 15 | Train Loss:  0.003             | Train Accuracy:  0.864             | Val Loss:  0.003             | Val Accuracy:  0.863


100%|██████████| 289/289 [00:04<00:00, 62.89it/s]


Epochs: 16 | Train Loss:  0.003             | Train Accuracy:  0.866             | Val Loss:  0.003             | Val Accuracy:  0.863


100%|██████████| 289/289 [00:04<00:00, 62.87it/s]


Epochs: 17 | Train Loss:  0.003             | Train Accuracy:  0.867             | Val Loss:  0.003             | Val Accuracy:  0.864


100%|██████████| 289/289 [00:04<00:00, 62.89it/s]


Epochs: 18 | Train Loss:  0.003             | Train Accuracy:  0.869             | Val Loss:  0.003             | Val Accuracy:  0.862


100%|██████████| 289/289 [00:04<00:00, 62.89it/s]


Epochs: 19 | Train Loss:  0.003             | Train Accuracy:  0.870             | Val Loss:  0.003             | Val Accuracy:  0.857


100%|██████████| 289/289 [00:04<00:00, 62.91it/s]

Epochs: 20 | Train Loss:  0.003             | Train Accuracy:  0.871             | Val Loss:  0.003             | Val Accuracy:  0.857





In [28]:
from torchmetrics import Accuracy, Precision, Recall, F1Score

def evaluate(val_dataset):
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=len(val_dataset))
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model.to(device)

    with torch.no_grad():
        val_input, val_label = next(iter(val_dataloader))
        X = val_input.to(device)
        # val_label = val_label.to(device)
        outputs = model(X)
        preds = outputs.argmax(dim=1).detach().cpu()
        accuracy = Accuracy()(preds, val_label)
        precision = Precision(num_classes=3, average="weighted")(preds, val_label)
        recall = Recall(num_classes=3, average="weighted")(preds, val_label)
        f1score = F1Score(num_classes=3, average="weighted")(preds, val_label)
    
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1score}

In [29]:
evaluate(val_data)

{'accuracy': tensor(0.8569),
 'precision': tensor(0.7950),
 'recall': tensor(0.8569),
 'f1_score': tensor(0.8129)}