In [14]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset
from torch.optim import Adam
from tqdm import tqdm
from torch import nn
import wandb
import spacy

def create_emb_matrix(embedding_dim=100):
    glove = pd.read_csv(f'data/glove/glove.6B.{embedding_dim}d.txt', sep=" ", quoting=3, header=None, index_col=0)
    vocab = {'<pad>': 0, '<unk>': 1}
    embeddings = np.zeros((len(glove) + 2, embedding_dim))
    embeddings[0] = np.zeros(embedding_dim)
    embeddings[1] = np.zeros(embedding_dim)

    for index, (key, val) in tqdm(enumerate(glove.T.items())):
        vocab[key] = index + 2
        embeddings[index+2] = val.to_numpy()

    return vocab, embeddings

class HeadDataset(Dataset):
    def __init__(self, df, vocab, embedding_matrix, pooling="avg"):
        nlp = spacy.load("en_core_web_sm")
        self.labels = []
        self.embeddings = []

        for index, text in enumerate(df['text']):
            doc = nlp(text)
            vectors = []
            for token in doc:
                if token.text in vocab:
                    vectors.append(embedding_matrix[vocab[token.text]])
            
            if vectors:
                if pooling == "max":
                    self.embeddings.append(np.amax(np.array(vectors, dtype=np.float32), axis=0))
                else:
                    self.embeddings.append(np.mean(vectors, axis=0, dtype=np.float32))
                self.labels.append(df['label'][index])

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]



class MaxPool(nn.Module):
    def forward(self, X):
        values, _ = torch.max(X, dim=1)
        return values


class AvgPool(nn.Module):
    def forward(self, X):
        return torch.mean(X, dim=1)


class SWEM2Classifier(nn.Module):

    def __init__(self, hidden_dim=128, num_classes=3, embedding_dim=100):
        super().__init__()
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.activation1 = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, X):
        outputs = self.linear1(X)
        outputs = self.activation1(outputs)
        outputs = self.linear2(outputs)
        outputs = self.softmax(outputs)
        return outputs
    
    def save_pretrained(self, path):
        torch.save(self, path)


def train(model, train_dataset, val_dataset, learning_rate=1e-3, epochs=10, batch_size=8):
    # wandb.init(project="kogito-relation-matcher", config={"learning_rate": learning_rate, "epochs": epochs, "batch_size": batch_size})

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.NLLLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        print("Using CUDA")

    model = model.to(device)
    criterion = criterion.to(device)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            model.zero_grad()

            train_label = train_label.to(device)
            X = train_input.to(device)

            output = model(X)
            
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            batch_loss.backward()
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                X = val_input.to(device)

                output = model(X)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        train_loss = total_loss_train / len(train_dataset)
        train_acc = total_acc_train / len(train_dataset)
        val_loss = total_loss_val / len(val_dataset)
        val_acc = total_acc_val / len(val_dataset)

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {train_loss: .3f} \
            | Train Accuracy: {train_acc: .3f} \
            | Val Loss: {val_loss: .3f} \
            | Val Accuracy: {val_acc: .3f}')
        
        # wandb.log({"train_loss": train_loss, "train_accuracy": train_acc, "val_loss": val_loss, "val_accuracy": val_acc})
        # model.save_pretrained(f"./models/checkpoint_{epoch_num}.pth")

In [3]:
vocab, emb_matrix = create_emb_matrix()

400000it [00:11, 33520.48it/s]


In [4]:
from relation_modeling_utils import load_data

train_df = load_data("data/atomic2020_data-feb2021/train.tsv")
dev_df = load_data("data/atomic2020_data-feb2021/dev.tsv")
train_data = HeadDataset(train_df, vocab, emb_matrix)
val_data = HeadDataset(dev_df, vocab, emb_matrix)

In [11]:
train_data.embeddings = np.array(train_data.embeddings, dtype=np.float32)
val_data.embeddings = np.array(val_data.embeddings, dtype=np.float32)

In [15]:
model = SWEM2Classifier()
train(model=model, train_dataset=train_data, val_dataset=val_data, epochs=20, batch_size=128, learning_rate=1e-4)

Using CUDA


  2%|▏         | 8/415 [03:47<3:12:57, 28.45s/it]  


KeyboardInterrupt: 