# Assignment 8

Develop a model for 20 news groups dataset. Select 20% of data for test set.  

Use metric learning with siamese networks and triplet loss.   
Use KNN and LSH (`annoy` library) for final prediction after the network was trained.

! Remember, that LSH gives you a set of neighbor candidates, for which you have to calculate distances to choose top-k nearest neighbors. 

Your quality = accuracy score

In [1]:
from allennlp.modules.elmo import Elmo, batch_to_ids
import pandas as pd
import numpy as np
import gensim
from tqdm import tqdm_notebook

from sklearn import metrics
from sklearn.model_selection import train_test_split

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

options_file = "settings"
weight_file = "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, 2, dropout=0)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
from nltk.tokenize import word_tokenize

class CustomDataset:
    def __init__(self, data, mode='tuple'):
        self.elmo = elmo
        if mode == 'tuple':
            #self.data = [batch_to_ids([[i[0]]]) for i in data]
            self.data = []
            for i in tqdm_notebook(data):
                x = batch_to_ids([[x for x in word_tokenize(i[0]) if 1 < len(x) < 25][:400]])
                x = self.elmo(x)['elmo_representations']
                x = tt.cat(x, dim=-1)
                x = x.mean(dim=1).detach()
                self.data.append(x)
            #self.data = [[x for x in word_tokenize(i[0]) if 1 < len(x) < 25][:400] for i in data]
            self.target = np.array(list(i[1] for i in data))
            self.length = len(data)
    
    def generate_batch(self):
        anc = []
        pos = []
        neg = []
        
        for i in range(32):
            idx = np.random.choice(np.arange(self.length))
            text = self.data[idx]
            cls_ = self.target[idx]
            anchor = text

            idx = np.random.choice(np.arange(self.length))
            cls = self.target[idx]
            while cls != cls_: 
                idx = np.random.choice(np.arange(self.length))
                cls = self.target[idx]
            text = self.data[idx]
            positive = text

            idx = np.random.choice(np.arange(self.length))
            cls = self.target[idx]
            while cls == cls_: 
                idx = np.random.choice(np.arange(self.length))
                cls = self.target[idx]
            text = self.data[idx]
            negative = text
            
            anc.append(anchor)
            pos.append(positive)
            neg.append(negative)
        
        return tt.cat(anc), tt.cat(pos), tt.cat(neg)
    
    def __iter__(self):
        for i in range(1000):
            yield self.generate_batch()
        return self
    
    def __len__(self):
        return self.length

In [4]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

newsgroups_train = [(newsgroups_train.data[i], newsgroups_train.target[i],) for i in range(len(newsgroups_train.data))]

train, valid = train_test_split(newsgroups_train)

train = CustomDataset(train[:2000], mode='tuple')
valid = CustomDataset(valid[:2000], mode='tuple')

newsgroups_test = fetch_20newsgroups(subset='test')
newsgroups_test = [(newsgroups_test.data[i], newsgroups_test.target[i],) for i in range(len(newsgroups_test.data))]
test = CustomDataset(newsgroups_test[:1000], mode='tuple')

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [8]:
from tqdm import tqdm_notebook

def _train_epoch(model, iterator, optimizer, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = 1000
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        loss = model(batch)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator):
    model.eval()
    epoch_loss = 0

    n_batches = 100
    with tt.no_grad():
        for batch in iterator:
            loss = model(batch)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, epoch)
        valid_loss = _test_epoch(model, valid_iterator)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [37]:
class MyModel(nn.Module):
    
    def __init__(self, elmo, criterion):
        super(MyModel, self).__init__()
        self.criterion = criterion
        self.fc = nn.Linear(256*2, 128)
        self.criterion = nn.TripletMarginLoss()
            
    def triplet_loss(self, anchor_embed, pos_embed, neg_embed):
        w = F.cosine_similarity(anchor_embed, neg_embed) - F.cosine_similarity(anchor_embed, pos_embed)
        return w.mean()
        
    def branch(self, x):
        x = self.fc(x)
        return x
        
    def forward(self, batch):
        anchor = self.branch(batch[0])
        pos = self.branch(batch[1])
        neg = self.branch(batch[2])        
        return self.criterion(anchor, pos, neg)



model = MyModel(elmo, nn.BCEWithLogitsLoss())

optimizer = optim.Adam(model.parameters())

#nn_train(model, train, valid, optimizer, n_epochs=3)

In [38]:
nn_train(model, train, valid, optimizer, n_epochs=25)

HBox(children=(IntProgress(value=0, description='epoch 0', max=1000, style=ProgressStyle(description_width='in…


validation loss 3.47162


HBox(children=(IntProgress(value=0, description='epoch 1', max=1000, style=ProgressStyle(description_width='in…


validation loss 3.41233


HBox(children=(IntProgress(value=0, description='epoch 2', max=1000, style=ProgressStyle(description_width='in…


validation loss 3.52467


HBox(children=(IntProgress(value=0, description='epoch 3', max=1000, style=ProgressStyle(description_width='in…


validation loss 3.45393


HBox(children=(IntProgress(value=0, description='epoch 4', max=1000, style=ProgressStyle(description_width='in…


validation loss 3.38365


HBox(children=(IntProgress(value=0, description='epoch 5', max=1000, style=ProgressStyle(description_width='in…


validation loss 3.51645


HBox(children=(IntProgress(value=0, description='epoch 6', max=1000, style=ProgressStyle(description_width='in…


validation loss 3.56808


HBox(children=(IntProgress(value=0, description='epoch 7', max=1000, style=ProgressStyle(description_width='in…


validation loss 3.58718


HBox(children=(IntProgress(value=0, description='epoch 8', max=1000, style=ProgressStyle(description_width='in…


validation loss 3.55540


HBox(children=(IntProgress(value=0, description='epoch 9', max=1000, style=ProgressStyle(description_width='in…


validation loss 3.75273


HBox(children=(IntProgress(value=0, description='epoch 10', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.67759


HBox(children=(IntProgress(value=0, description='epoch 11', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.68003


HBox(children=(IntProgress(value=0, description='epoch 12', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.65260


HBox(children=(IntProgress(value=0, description='epoch 13', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.74176


HBox(children=(IntProgress(value=0, description='epoch 14', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.70683


HBox(children=(IntProgress(value=0, description='epoch 15', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.68193


HBox(children=(IntProgress(value=0, description='epoch 16', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.79084


HBox(children=(IntProgress(value=0, description='epoch 17', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.75067


HBox(children=(IntProgress(value=0, description='epoch 18', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.82843


HBox(children=(IntProgress(value=0, description='epoch 19', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.85510


HBox(children=(IntProgress(value=0, description='epoch 20', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.90259


HBox(children=(IntProgress(value=0, description='epoch 21', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.86926


HBox(children=(IntProgress(value=0, description='epoch 22', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.92926


HBox(children=(IntProgress(value=0, description='epoch 23', max=1000, style=ProgressStyle(description_width='i…


validation loss 4.03010


HBox(children=(IntProgress(value=0, description='epoch 24', max=1000, style=ProgressStyle(description_width='i…


validation loss 3.95788


In [7]:
batch_to_ids([train.data[17]]).shape

torch.Size([1, 195, 50])

In [8]:
w = batch_to_ids([train.data[0]])
w.shape

torch.Size([1, 159, 50])

In [9]:
model.branch(w).shape

torch.Size([1, 128])

In [39]:
from annoy import AnnoyIndex

a = AnnoyIndex(128)
model.eval()
for key, i in tqdm_notebook(enumerate(train.data), total=train.length):
    a.add_item(key, model.branch(i)[0])
a.build(-1)

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




True

# Accuracy

In [40]:
np.zeros(20)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [41]:
true = np.zeros(20)
false = np.zeros(20)

In [42]:
def most_popular(lst):
    lst = list(lst)
    return max(set(lst), key=lst.count)

In [43]:
model.eval()
for key, i in tqdm_notebook(enumerate(test.data), total=test.length):
    w = model.branch(i)[0]
    v = a.get_nns_by_vector(w, 25)
    answer = most_popular(train.target[v])
    true_value = test.target[key]
    if true_value == answer:
        true[true_value] += 1
    else:
        false[true_value] += 1

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




До обучения

In [36]:
true / (true + false), sum(true)/(sum(true)+sum(false))

(array([0.35135135, 0.4       , 0.42307692, 0.35185185, 0.17741935,
        0.15      , 0.48717949, 0.67924528, 0.62295082, 0.67241379,
        0.79710145, 0.39534884, 0.29166667, 0.62222222, 0.54545455,
        0.58928571, 0.53191489, 0.72      , 0.21052632, 0.03030303]), 0.469)

После

In [44]:
true / (true + false), sum(true)/(sum(true)+sum(false))

(array([0.40540541, 0.4       , 0.46153846, 0.25925926, 0.24193548,
        0.33333333, 0.58974359, 0.71698113, 0.59016393, 0.81034483,
        0.72463768, 0.51162791, 0.27083333, 0.84444444, 0.65454545,
        0.64285714, 0.5106383 , 0.74      , 0.28947368, 0.06060606]), 0.517)