# Assignment 8

Develop a model for 20 news groups dataset from scikit-learn. Select 20% of data for test set.  

Develop metric learning model with siamese network [3 points] and triplet loss [3 points] (from seminar). 
Use KNN and LSH (any library for approximate nearest neighbor search) for final prediction after the network was trained. [2 points]

! Remember, that LSH gives you a set of neighbor candidates, for which you have to calculate distances to choose top-k nearest neighbors. 

Your quality metric = accuracy score [2 points if acc > 0.8 ]

In [0]:
import numpy as np
import pandas as pd
import nltk

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchtext import data

from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook

In [0]:
from sklearn.datasets import fetch_20newsgroups
all_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
len(all_data.data), len(all_data.target)

(11314, 11314)

In [0]:
all_data.data[:3]

['I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 "A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't an

In [0]:
all_data.target[:3]

array([7, 4, 4])

In [0]:
import pandas as pd

d = {'text': all_data.data, 'target': all_data.target}
df = pd.DataFrame(data = d)
df.head()

Unnamed: 0,text,target
0,I was wondering if anyone out there could enli...,7
1,A fair number of brave souls who upgraded thei...,4
2,"well folks, my mac plus finally gave up the gh...",4
3,\nDo you have Weitek's address/phone number? ...,1
4,"From article <C5owCB.n3p@world.std.com>, by to...",14


In [0]:
all_len = []
for row in df.text:
  text_len = len(row)
  all_len.append(text_len)

In [0]:
df['text_len'] = all_len

In [0]:
df.head()

Unnamed: 0,text,target,text_len
0,I was wondering if anyone out there could enli...,7,475
1,A fair number of brave souls who upgraded thei...,4,530
2,"well folks, my mac plus finally gave up the gh...",4,1659
3,\nDo you have Weitek's address/phone number? ...,1,95
4,"From article <C5owCB.n3p@world.std.com>, by to...",14,448


In [0]:
new_df = df[df.text_len < 1000].reset_index(drop=True)
new_df.drop('text_len', axis='columns', inplace=True)
new_df['target'] = new_df['target'].apply(float)
new_df.head()

Unnamed: 0,text,target
0,I was wondering if anyone out there could enli...,7.0
1,A fair number of brave souls who upgraded thei...,4.0
2,\nDo you have Weitek's address/phone number? ...,1.0
3,"From article <C5owCB.n3p@world.std.com>, by to...",14.0
4,\n\n\n\n\nOf course. The term must be rigidly...,16.0


In [0]:
pip install sentence_transformers

In [0]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens') 
#модель была выбрана из представленных на сайте https://github.com/UKPLab/sentence-transformers

100%|██████████| 405M/405M [00:45<00:00, 8.97MB/s]


In [0]:
sentences = ['This framework generates embeddings for each input sentence. Sentences are passed as a list of string. The quick brown fox jumps over the lazy dog.']
sentence_embeddings = model.encode(sentences)

In [0]:
len(sentence_embeddings), sentence_embeddings[0].shape

(1, (768,))

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import os
os.chdir('gdrive/My Drive/Colab Notebooks')

In [0]:
embs = model.encode(new_df.text, show_progress_bar=True)

Batches: 100%|██████████| 1067/1067 [23:20<00:00,  1.72s/it]


In [0]:
from sklearn.externals import joblib
joblib.dump(embs, "embs.pkl")

['embs.pkl']

In [0]:
from sklearn.externals import joblib
embs = joblib.load("embs.pkl")

In [0]:
len(embs)

8535

In [0]:
new_df['embs'] = embs

In [0]:
new_df.head()

Unnamed: 0,text,target,embs
0,I was wondering if anyone out there could enli...,7.0,"[0.027451199, 0.36307752, -0.025275145, 0.0261..."
1,A fair number of brave souls who upgraded thei...,4.0,"[0.011982416, 0.29064688, 0.08987675, 0.064537..."
2,\nDo you have Weitek's address/phone number? ...,1.0,"[0.05981747, 0.28420597, 0.032867167, 0.050284..."
3,"From article <C5owCB.n3p@world.std.com>, by to...",14.0,"[-0.0072508887, 0.39486945, -0.022240369, 0.01..."
4,\n\n\n\n\nOf course. The term must be rigidly...,16.0,"[0.030753314, 0.37670717, -0.072004944, 0.0240..."


In [0]:
def pos_neg(row, df):
  pos = np.random.choice(df[df['target'] == row['target']]['embs'])
  neg = np.random.choice(df[df['target'] != row['target']]['embs'])

  return pos, neg

In [0]:
pos_vars = []
neg_vars = []

for i, row in new_df.iterrows():
  pos, neg = pos_neg(row, new_df)
  pos_vars.append(pos)
  neg_vars.append(neg)

In [0]:
len(pos_vars), len(neg_vars)

(8535, 8535)

In [0]:
len(embs[0]), len(pos_vars[0])

(768, 768)

In [0]:
new_df["pos"] = pos_vars
new_df["neg"] = neg_vars
new_df.head()

Unnamed: 0,text,target,embs,pos,neg
0,I was wondering if anyone out there could enli...,7.0,"[0.027451199, 0.36307752, -0.025275145, 0.0261...","[0.061900675, 0.37596345, 0.008991978, 0.08076...","[-0.0075753694, 0.36655474, -0.013695048, 0.01..."
1,A fair number of brave souls who upgraded thei...,4.0,"[0.011982416, 0.29064688, 0.08987675, 0.064537...","[0.028080646, 0.34674957, 0.013658325, 0.04085...","[0.009518413, 0.34843457, 0.02964886, 0.017016..."
2,\nDo you have Weitek's address/phone number? ...,1.0,"[0.05981747, 0.28420597, 0.032867167, 0.050284...","[-0.0045779133, 0.34217203, 0.017413076, 0.035...","[0.038531464, 0.4074713, 0.011913281, 0.018197..."
3,"From article <C5owCB.n3p@world.std.com>, by to...",14.0,"[-0.0072508887, 0.39486945, -0.022240369, 0.01...","[0.055907134, 0.36407945, -0.021776598, 0.0401...","[0.008503525, 0.30474368, -0.01475872, -0.0064..."
4,\n\n\n\n\nOf course. The term must be rigidly...,16.0,"[0.030753314, 0.37670717, -0.072004944, 0.0240...","[0.034953512, 0.37414455, -0.07043642, 0.05281...","[0.0008630912, 0.3692007, -0.01976406, 0.00628..."


In [0]:
df_train, df_test = train_test_split(new_df, test_size=0.2, random_state=42, shuffle=True)
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=42, shuffle=True)

In [0]:
print('Train shape = ', df_train.shape)
print('Validation shape = ', df_val.shape)
print('Test shape = ', df_test.shape)

Train shape =  (6145, 5)
Validation shape =  (683, 5)
Test shape =  (1707, 5)


In [0]:
def triples(df):
  anch = list(df.embs)
  pos = list(df.pos)
  neg = list(df.neg)
  y = list(df.target)

  return tt.FloatTensor(anch), tt.FloatTensor(pos), tt.FloatTensor(neg), tt.FloatTensor(y)

In [0]:
anch_train, pos_train, neg_train, y_train = triples(df_train)
anch_val, pos_val, neg_val, y_val = triples(df_val)
anch_test, pos_test, neg_test, y_test = triples(df_test)

In [0]:
batch_size = 32

train_loader = DataLoader(TensorDataset(anch_train, pos_train, neg_train,  y_train), batch_size=batch_size)
valid_loader = DataLoader(TensorDataset(anch_val, pos_val, neg_val,  y_val), batch_size=batch_size)
test_loader = DataLoader(TensorDataset(anch_test, pos_test, neg_test,  y_test), batch_size=batch_size)

In [0]:
def _train_epoch(model, iterator, optimizer, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        loss = model(batch)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

In [0]:
def _test_epoch(model, iterator):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            loss = model(batch)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches

In [0]:
def nn_train(model, train_iterator, valid_iterator, optimizer, n_epochs=10,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, epoch)
        valid_loss = _test_epoch(model, valid_iterator)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [0]:
def triplet_loss(anchor_embed, pos_embed, neg_embed, margin = 0.1):
  res = F.cosine_similarity(anchor_embed, neg_embed) - F.cosine_similarity(anchor_embed, pos_embed) + margin
  return tt.mean(res)

In [0]:
class Tripletnet(nn.Module):
    def __init__(self):
        super(Tripletnet, self).__init__()
        self.fc = nn.Linear(768, 128)
        
    def branch(self, x):
        x = self.fc(x)
        return x

    def forward(self, batch):

        anchor, pos, neg = batch[0], batch[1], batch[2]

        anchor = self.branch(anchor)
        pos = self.branch(pos)
        neg = self.branch(neg)
        
        return triplet_loss(anchor, pos, neg)

In [0]:
model = Tripletnet()
optimizer = optim.Adam(model.parameters())

In [0]:
nn_train(model, train_loader, test_loader, optimizer, n_epochs=100)

In [0]:
#from sklearn.externals import joblib
#joblib.dump(model, "trip_model.pkl")

In [0]:
#from sklearn.externals import joblib
#model = joblib.load("trip_model.pkl")

Predictions

In [0]:
pip install annoy

In [0]:
from annoy import AnnoyIndex

a = AnnoyIndex(128, 'angular')
for i in range(anch_train.size()[0]):
    v = model.branch(anch_train[i])
    a.add_item(i, v)
a.build(1000)

True

In [0]:
from sklearn.neighbors import KDTree
from scipy.spatial import cKDTree

preds = []

for i, vec in tqdm(enumerate(anch_val)):

    top = a.get_nns_by_vector(model.branch(vec), 1500)
    top_vecs = np.take(anch_train, np.array(top), axis=0).tolist()
    
    top_y = np.take(y_train, np.array(top)).tolist()
    t = cKDTree(top_vecs).query(vec, k=1)[1]
    pred = top_vecs[t]
    pred_idx = top_vecs.index(pred)

    preds.append(top_y[pred_idx])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [0]:
len(y_val), len(preds)

(683, 683)

In [0]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, preds)

0.4670571010248902