In [1]:
import sys
# Put the relative directory 'utils/' first on the module search path
# (presumably where the project modules imported below live — this relies on
# the kernel's working directory containing that folder).
sys.path.insert(0, 'utils/')

# Project-local helper modules used by the cells below.
import utils, unsupervised, supervised, siamese

In [0]:
import json
import pandas as pd
import numpy as np

from gensim.models.wrappers import FastText
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import re

In [0]:
# Read the two pre-computed embedding collections from disk.
# NOTE(review): presumably utils.load_pkl unpickles these files; unpickling can
# execute arbitrary code, so only point it at trusted files.
fasttext_embs = utils.load_pkl('embeddings/allNoYearFastText.pickle')
elmo_embs = utils.load_pkl('embeddings/elmo_embs_isYearFalse.pickle')

## Unsupervised approach

In [0]:
from sklearn.metrics import accuracy_score, classification_report

In [15]:
# Keep only the first value returned by similarity_predict as `y`; the baseline
# cells below pass it to accuracy_score as the reference. The second returned
# value is discarded here.
y, _ = unsupervised.similarity_predict(fasttext_embs, distance='cosine_similarity')

100%|██████████| 3060/3060 [00:02<00:00, 1372.21it/s]


**Constant prediction** 

In [16]:
# Accuracy obtained by always predicting the same option (0, 1, 2 or 3),
# reported one option per line.
constant_baselines = (
    ('First answer: ', 0),
    ('Second answer: ', 1),
    ('Third answer: ', 2),
    ('Fourth answer: ', 3),
)
for caption, option in constant_baselines:
    # np.ones(...) * option is a float vector filled with the option index
    constant_pred = np.ones(len(y)) * option
    print(caption, round(accuracy_score(y, constant_pred), 3))

First answer:  0.258
Second answer:  0.281
Third answer:  0.25
Fourth answer:  0.211


**Random prediction**

In [17]:
# Chance-level baseline: mean accuracy of uniformly random answers drawn from
# {0, 1, 2, 3}, averaged over 100 independent draws.
# A dedicated, seeded generator makes the reported number reproducible on every
# run and leaves NumPy's global random state untouched.
random_rng = np.random.RandomState(42)
random_scores = [
    accuracy_score(y, random_rng.randint(0, 4, len(y)))
    for _ in range(100)
]
print('Average random prediction: ', round(np.mean(random_scores), 3))

Average random prediction:  0.25


**FastText**

In [19]:
# Accuracy of the FastText-based predictions: similarity_predict is called with
# its default settings and everything it returns is forwarded to accuracy_score.
fasttext_score = accuracy_score(*unsupervised.similarity_predict(fasttext_embs))
print('\n \n FastText unsupervised approach score: ', round(fasttext_score, 3))

100%|██████████| 3060/3060 [00:02<00:00, 1288.55it/s]


 
 FastText unsupervised approach score:  0.272





In [20]:
# Re-run the prediction with topk=4. `y` is rebound to the first returned value
# and `y_fasttext` receives the second one, which topk_accuracy scores below.
y, y_fasttext = unsupervised.similarity_predict(fasttext_embs, topk=4)

100%|██████████| 3060/3060 [00:02<00:00, 1200.39it/s]


In [21]:
# Top-k accuracy for FastText using topk_accuracy's default k
# (the default value is defined in the unsupervised module, not shown here).
unsupervised.topk_accuracy(y, y_fasttext)

0.5415032679738562

In [22]:
# Same metric for FastText with k=3.
unsupervised.topk_accuracy(y, y_fasttext, k=3)

0.7813725490196078

**ELMo**

In [24]:
# Accuracy of the ELMo-based predictions using the 'cosine_similarity' distance;
# everything similarity_predict returns is forwarded to accuracy_score.
elmo_score = accuracy_score(
    *unsupervised.similarity_predict(elmo_embs, distance='cosine_similarity')
)
print('\n \n ELMo unsupervised approach score: ', round(elmo_score, 3))

100%|██████████| 3060/3060 [00:02<00:00, 1243.54it/s]


 
 ELMo unsupervised approach score:  0.325





In [6]:
# Prediction with topk=4 on the ELMo embeddings. `y` is rebound here and is the
# vector that the later train/test split cells index into; `y_elmo` is scored by
# topk_accuracy below and is rebound again in the supervised section.
y, y_elmo = unsupervised.similarity_predict(elmo_embs, topk=4)

100%|██████████| 3060/3060 [00:03<00:00, 903.07it/s]


In [26]:
# Top-k accuracy for ELMo using topk_accuracy's default k
# (the default value is defined in the unsupervised module, not shown here).
unsupervised.topk_accuracy(y, y_elmo)

0.5852941176470589

In [27]:
# Same metric for ELMo with k=3.
unsupervised.topk_accuracy(y, y_elmo, k=3)

0.8111111111111111

## Supervised approach

In [0]:
# Shuffle all example indices with a fixed seed, then cut at position 2300:
# the first 2300 shuffled examples form the training set, the rest the test set.
np.random.seed(42)
n_examples = len(elmo_embs)
idx = np.random.choice(np.arange(n_examples), size=n_examples, replace=False)
train_idx, test_idx = idx[:2300], idx[2300:]

# Convert once, then index both arrays with the same shuffled positions so
# embeddings and their entries in `y` stay aligned.
elmo_all, y_all = np.array(elmo_embs), np.array(y)
elmo_train, y_train = elmo_all[train_idx], y_all[train_idx]
elmo_test, y_test = elmo_all[test_idx], y_all[test_idx]

In [10]:
# Build the inputs for the supervised model from the ELMo training split only.
# NOTE: this rebinds `y_elmo`, which previously held the second value returned
# by similarity_predict.
X_elmo, y_elmo = supervised.prepare_for_supervised(elmo_train)
# NOTE(review): this call uses all of fasttext_embs rather than a training split
# and rebinds `y_fasttext`; neither result is used by the cells visible below —
# confirm whether a FastText train/test split was intended.
X_fasttext, y_fasttext = supervised.prepare_for_supervised(fasttext_embs)

100%|██████████| 2300/2300 [00:00<00:00, 28436.29it/s]
100%|██████████| 3060/3060 [00:00<00:00, 58030.86it/s]


In [0]:
from sklearn.linear_model import LogisticRegression

In [32]:
# Logistic-regression baseline on the ELMo training data, with a fixed
# random_state for repeatability. fit() returns the estimator itself.
logreg_elmo = LogisticRegression(random_state=42).fit(X_elmo, y_elmo)
logreg_elmo  # last expression, so the fitted estimator's repr is still displayed

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
# Run the fitted logistic regression over the held-out ELMo examples; the result
# is compared against y_test with accuracy_score in the next cell.
elmo_pred = supervised.predict_probs(elmo_test, logreg_elmo)

100%|██████████| 760/760 [00:00<00:00, 7979.55it/s]


In [34]:
# Held-out accuracy of the logistic-regression model (reference first, prediction second).
accuracy_score(y_test, elmo_pred)

0.37105263157894736

**Siamese network**

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [0]:
# Same seed and same cut (2300) as the split in the supervised section, so this
# reproduces that exact train/test partition.
# Must run while `y` still holds one entry per example, i.e. before the next
# cell rebinds `y` to the output of prepare_for_siamese.
np.random.seed(42)
total = len(elmo_embs)
idx = np.random.choice(np.arange(total), size=total, replace=False)
head, tail = idx[:2300], idx[2300:]

embeddings_arr = np.array(elmo_embs)
answers_arr = np.array(y)
elmo_train, y_train = embeddings_arr[head], answers_arr[head]
elmo_test, y_test = embeddings_arr[tail], answers_arr[tail]

In [0]:
# Build the siamese training data from the ELMo training split.
# NOTE: this rebinds `y`; from here on it holds the second value returned by
# prepare_for_siamese, not the vector used by the split cell above.
X, y = siamese.prepare_for_siamese(elmo_train)
# Turn the first two elements of every item into torch tensors
# (presumably the question and answer embeddings — confirm in siamese module).
X = [tuple(map(torch.from_numpy, (pair[0], pair[1]))) for pair in X]

In [0]:
# Train / validation split of the siamese pairs: shuffle the pair indices with
# a fixed seed, keep the first 7400 for training and the remainder for validation.
np.random.seed(42)
n_pairs = len(X)
idx_train = np.random.choice(np.arange(n_pairs), size=n_pairs, replace=False)
train_part, valid_part = idx_train[:7400], idx_train[7400:]

# X is a Python list of tensor tuples, so it is indexed element by element;
# the targets are converted to an array once and fancy-indexed.
pair_targets = np.array(y)
train_x, train_y = [X[i] for i in train_part], pair_targets[train_part]
valid_x, valid_y = [X[i] for i in valid_part], pair_targets[valid_part]

In [18]:
# Network that receives the previously trained weights loaded in the next cell.
pretrained_model = siamese.SiameseNet()
criterion = nn.BCEWithLogitsLoss()
# Fixed: the optimizer used to be built from `model.parameters()`, but `model`
# is only created further down the notebook, so this cell raised NameError on a
# fresh kernel (Restart & Run All). It now refers to the model defined here.
optimizer = optim.Adam(pretrained_model.parameters(), lr=0.00002)
# Move the network to the GPU; as the last expression it also prints the architecture.
pretrained_model.cuda()

SiameseNet(
  (question): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
  )
  (answer): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
  )
  (classifier): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [0]:
# Restore previously saved weights from 'models/model.pt' into the network.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
pretrained_model.load_state_dict(torch.load('models/model.pt'))

In [0]:
# Predictions of the restored model on the held-out ELMo examples; scored
# against y_test in the next cell.
elmo_test_pred = siamese.siamese_pred(elmo_test, pretrained_model)

In [22]:
# Held-out accuracy of the restored siamese model.
# Arguments follow sklearn's accuracy_score(y_true, y_pred) order, matching the
# logistic-regression evaluation earlier in the notebook; accuracy is symmetric
# in its two arguments, so the reported value is unchanged.
accuracy_score(y_test, elmo_test_pred)

0.3473684210526316

In [0]:
# Fresh siamese network, loss and optimizer for the training loop in the next cell.
model = siamese.SiameseNet()
# Loss that takes raw logits; the training loop feeds it the model output directly.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)
# Move the network to the GPU (the training loop moves every batch there as well).
model.cuda()

In [24]:
# Train the siamese network for a fixed number of epochs. After every epoch the
# validation loss is computed, and the weights are written to 'siamese_model.pt'
# whenever that loss is less than or equal to the best value seen so far.
epochs = 25
valid_loss_min = np.inf  # np.inf instead of the np.Inf alias removed in NumPy 2.0
train_on_gpu = True

for e in range(epochs):
    # Running sums of (batch loss * batch size); plain Python floats.
    train_loss = 0.0
    valid_loss = 0.0

    # ---- training pass ----
    model.train()
    for q, a, labels in siamese.get_batches(train_x, train_y, batch_size=128):
        if train_on_gpu:
            q = q.cuda()
            a = a.cuda()
            labels = labels.cuda()
        # The criterion is BCEWithLogitsLoss, so the model output is used as raw logits.
        logits = model(q, a)
        loss = criterion(logits.squeeze(), labels.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so the division below gives
        # a per-example average even if the last batch is smaller.
        train_loss += loss.item() * q.size(0)

    # ---- validation pass ----
    model.eval()
    # No gradients are needed here: no_grad() avoids building autograd graphs,
    # and .item() accumulates a float instead of a tensor. Previously the sum
    # was a tensor that kept every validation graph alive and turned
    # valid_loss / valid_loss_min into tensors.
    with torch.no_grad():
        for q, a, labels in siamese.get_batches(valid_x, valid_y, batch_size=128):
            if train_on_gpu:
                q = q.cuda()
                a = a.cuda()
                labels = labels.cuda()
            logits = model(q, a)
            loss = criterion(logits.squeeze(), labels.float())
            valid_loss += loss.item() * q.size(0)

    train_loss = train_loss / len(train_x)
    valid_loss = valid_loss / len(valid_x)

    print('Epoch: {} \tTraining Loss: {:.4f} \tValidation Loss: {:.4f}'.format(
        e+1,
        train_loss,
        valid_loss
        ))

    # Checkpoint whenever the validation loss does not get worse.
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'siamese_model.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.6833 	Validation Loss: 0.6523
Validation loss decreased (inf --> 0.652336).  Saving model ...
Epoch: 2 	Training Loss: 0.5937 	Validation Loss: 0.5554
Validation loss decreased (0.652336 --> 0.555372).  Saving model ...
Epoch: 3 	Training Loss: 0.5577 	Validation Loss: 0.5535
Validation loss decreased (0.555372 --> 0.553527).  Saving model ...
Epoch: 4 	Training Loss: 0.5557 	Validation Loss: 0.5531
Validation loss decreased (0.553527 --> 0.553117).  Saving model ...
Epoch: 5 	Training Loss: 0.5547 	Validation Loss: 0.5531
Validation loss decreased (0.553117 --> 0.553069).  Saving model ...
Epoch: 6 	Training Loss: 0.5538 	Validation Loss: 0.5531
Epoch: 7 	Training Loss: 0.5529 	Validation Loss: 0.5531
Epoch: 8 	Training Loss: 0.5520 	Validation Loss: 0.5532
Epoch: 9 	Training Loss: 0.5510 	Validation Loss: 0.5532
Epoch: 10 	Training Loss: 0.5498 	Validation Loss: 0.5532
Epoch: 11 	Training Loss: 0.5484 	Validation Loss: 0.5531
Epoch: 12 	Training Loss: 0.546

In [0]:
# Predictions of the freshly trained network on the held-out ELMo examples.
# NOTE(review): `model` holds the weights from the final epoch, not the
# checkpoint the training loop saved to 'siamese_model.pt' at its best
# validation loss; load that file first if the best checkpoint is what
# should be evaluated.
elmo_test_pred = siamese.siamese_pred(elmo_test, model)

In [26]:
# Held-out accuracy of the trained siamese model.
# Arguments follow sklearn's accuracy_score(y_true, y_pred) order, matching the
# logistic-regression evaluation earlier in the notebook; accuracy is symmetric
# in its two arguments, so the reported value is unchanged.
accuracy_score(y_test, elmo_test_pred)

0.41578947368421054