In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
VOCAB, EMBEDDING_MATRIX = np.load("data/vocab_glove_100d.npy", allow_pickle=True).item(), np.load("data/embedding_matrix_glove_100d.npy", allow_pickle=True)

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import pytorch_lightning as pl
import torchmetrics
import torch.nn.functional as F

class SWEMNNClassifier(pl.LightningModule):
    def __init__(self, num_classes=3, hidden_dim=64, embedding_dim=100, learning_rate=1e-3):
        super().__init__()
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.activation = nn.LeakyReLU()
        self.linear2 = nn.Linear(hidden_dim, num_classes)
        self.model = nn.Sequential(self.linear1, self.activation, self.linear2)
        self.criterion = nn.BCEWithLogitsLoss()
        self.learning_rate = learning_rate
        self.train_accuracy = torchmetrics.Accuracy()
        self.val_accuracy = torchmetrics.Accuracy()
        self.train_precision = torchmetrics.Precision(num_classes=3, average='weighted')
        self.val_precision = torchmetrics.Precision(num_classes=3, average='weighted')
        self.train_recall = torchmetrics.Recall(num_classes=3, average='weighted')
        self.val_recall = torchmetrics.Recall(num_classes=3, average='weighted')
        self.save_hyperparameters()
    
    def forward(self, X):
        outputs = self.model(X)
        probs = F.sigmoid(outputs)
        return probs
    
    def training_step(self, batch, batch_idx):
        X, y = batch
        outputs = self.model(X)
        train_loss = self.criterion(outputs, y.float())
        preds = self.forward(X)
        self.train_accuracy(preds, y)
        self.train_precision(preds, y)
        self.train_recall(preds, y)
        self.log("train_loss", train_loss, on_epoch=True)
        self.log('train_accuracy', self.train_accuracy, on_epoch=True)
        self.log('train_precision', self.train_precision, on_epoch=True)
        self.log('train_recall', self.train_recall, on_epoch=True)
        return train_loss
    
    def validation_step(self, batch, batch_idx):
        X, y = batch
        outputs = self.model(X)
        val_loss = self.criterion(outputs, y.float())
        preds = self.forward(X)
        self.val_accuracy(preds, y)
        self.val_precision(preds, y)
        self.val_recall(preds, y)
        self.log("val_loss", val_loss, on_epoch=True)
        self.log('val_accuracy', self.val_accuracy, on_epoch=True)
        self.log('val_precision', self.val_precision, on_epoch=True)
        self.log('val_recall', self.val_recall, on_epoch=True)
        return val_loss

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

In [None]:
train_data = torch.load("data/head_train_multi_no_pad_max.pt")
val_data = torch.load("data/head_train_multi_no_pad_max.pt")

In [None]:
train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=128)

In [None]:
from pytorch_lightning.loggers import WandbLogger
import wandb

wandb_logger = WandbLogger(project="kogito-relation-matcher", name="swem_multi_label_nn")
model = SWEMNNClassifier(learning_rate=1e-4)
trainer = pl.Trainer(max_epochs=20, logger=wandb_logger)
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
wandb.finish()

In [9]:
from relation_modeling_utils import load_data, HeadDataset

test_df = load_data("data/atomic2020_data-feb2021/test.tsv", multi_label=True)
test_data = HeadDataset(test_df, vocab=VOCAB, embedding_matrix=EMBEDDING_MATRIX, apply_pooling=True, pooling="max")
test_dataloader = DataLoader(test_data, batch_size=len(test_data))

In [11]:
X, y = next(iter(test_dataloader))

In [17]:
import torchmetrics
preds = model.forward(X)
test_accuracy = torchmetrics.Accuracy()
test_precision = torchmetrics.Precision(num_classes=3, average="weighted")
test_recall = torchmetrics.Recall(num_classes=3, average="weighted")
test_accuracy(preds, y), test_precision(preds, y), test_recall(preds, y)



(tensor(0.8430), tensor(0.8180), tensor(0.9353))

In [25]:
torch.save(model, "swem_nn_model.bin")

In [38]:
from relation_modeling_utils import text_to_embedding

text = "death is a bad event"
embedding = text_to_embedding(text, vocab=VOCAB, embedding_matrix=EMBEDDING_MATRIX)
text_pred = model.forward(torch.tensor(embedding).view((1, -1)))
text_pred



tensor([[0.0493, 0.7683, 0.8267]], grad_fn=<SigmoidBackward0>)

In [18]:
probs = F.softmax(preds, dim=1)

In [19]:
probs

tensor([[0.1761, 0.3708, 0.4531],
        [0.1788, 0.3639, 0.4573],
        [0.1730, 0.3661, 0.4609],
        ...,
        [0.5755, 0.2124, 0.2122],
        [0.5757, 0.2122, 0.2121],
        [0.5759, 0.2121, 0.2120]], grad_fn=<SoftmaxBackward0>)

In [37]:
x = torch.tensor([[0.1, 0.2, 0.5], [0.9, 0.05, 0.05], [0.3, 0.3, 0.4], [0.1, 0.2, 0.7]])
y = torch.tensor([[0, 1, 1], [1, 0, 0], [0, 0, 1], [1, 0, 1]])
accuracy = torchmetrics.Accuracy()
accuracy(x, y)

tensor(0.7500)

In [10]:
trainer = pl.Trainer()
preds = trainer.predict(model, test_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

TypeError: linear(): argument 'input' (position 1) must be Tensor, not list