# YELP Siamese: A little example of a LSTM-Siamese for Text Similarity

Positive ratings are similar to each other! Negative too ;)




#### Notes

- PackedSequence: https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html

In [1]:
import os
import re
import random
import pickle
import pandas as pd
from bs4 import BeautifulSoup
import itertools
import more_itertools
import numpy as np
import unicodedata
import string

In [44]:
import nltk
import spacy
import gensim
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import classification_report

In [3]:

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Adam, SGD
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import logging
logging.basicConfig(level=logging.INFO)

In [4]:
batch_size = 10
epochs = 1
learning_rate = 2e-5
max_seq_length = 64

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# Load data and preprocess it!

In [6]:
## The dataset is taken from https://github.com/justmarkham/DAT7/blob/master/data/yelp.csv 
df = pd.read_csv('/Volumes/data/repo/data/yelp.csv')
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [7]:
# Make it a 2D classification!
df['positive'] = 0
df['negative'] = 0

for idx, row in df.iterrows():
    df.at[idx, 'positive'] = 1 if row['stars'] >= 3 else 0
    df.at[idx, 'negative'] = 1 if row['stars'] <= 2 else 0
    
df.sample(n=3)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny,positive,negative
4481,noLH_u4MJzfXYYHqcByjnA,2010-07-01,ja8Up9t41UlbF2eXNSuabA,4,We just moved to town and were looking for a p...,review,SgjR6pUm_mpmX0f-XH1Ztw,1,2,0,1,0
3840,W34RE9avBNLfkmiiZwPKiA,2009-06-24,K7N-DrWgKiYySwHfP3sf8A,5,Agree with the previous review. Something nice...,review,UPtysDF6cUDUxq2KY-6Dcg,1,0,1,1,0
9946,T1EfT96sCrn_UUWQMhzB5w,2011-10-31,UMjqZj-SJ25VnLyB3EV4ug,3,A coyote ugly knock off but a cool place. The ...,review,4E_nPWw89FLFHdNsEgMH-g,1,1,4,1,0


In [105]:
# Train/test split
split_at = int(0.8 * len(df))

df = df.sample(frac=1).reset_index(drop=True)

train_df = df[:split_at]
test_df = df[split_at:]

print(f'Train: {len(train_df)}')
print(f'Test: {len(test_df)}')


Train: 8000
Test: 2000


In [9]:
# Print some example review text
t = df['text'][0]
print(t)

Very good food! We ate some shrimp tacos and macaroni and cheese. They split our tacos up for us and didn't charge for it...the portions were great too!

Our server was friendly as well. Would definitely return!


In [10]:
# Spacy is used to tokenize the review texts
nlp = spacy.load("en_core_web_sm", disable=['tagger', 'ner'])

In [11]:
# We use Glove word embeddings
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('/Volumes/data/repo/data/glove.6B/glove.6B.200d.w2vformat.txt')

INFO:gensim.models.utils_any2vec:loading projection weights from /Volumes/data/repo/data/glove.6B/glove.6B.200d.w2vformat.txt
INFO:gensim.models.utils_any2vec:loaded (400000, 200) matrix from /Volumes/data/repo/data/glove.6B/glove.6B.200d.w2vformat.txt


In [21]:
# Data loader from df

idx2token_ids = {}
idx2len = {}
idx2y = {}

for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    # Tokenize text and convert to word2vec idxs
    doc = nlp(row['text'])
    token_ids = []
    for sent in doc.sents:
        for token in sent:
            word = token.norm_
            if word in w2v_model.vocab:
                token_ids.append(w2v_model.vocab[word].index)  # TODO: Add special index for UNKNOWN + PADDING
        break  # For now we only use the first sentence

    #train_token_ids.append(torch.tensor(token_ids))
    #train_y.append([row['positive']]) # , row['negative']
    length = min(len(token_ids), max_seq_length)  # not larger than max length
    if length > 2:
        idx2token_ids[idx] = torch.tensor(token_ids)
        idx2y[idx] = row['positive']
        idx2len[idx] = length
        

HBox(children=(IntProgress(value=0, max=8000), HTML(value='')))




In [22]:
# For siamese nets we need pair-wise data

n_pairs = 10  # n * n would be too much - keep it at n * 10

idxs = list(idx2y.keys())

train_token_ids_a = []
train_token_len_a = []

train_token_ids_b = []
train_token_len_b = []

train_y = []

for a_idx in idx2y:
    for b_idx in random.sample(idxs, n_pairs):
        if a_idx != b_idx:
            if idx2y[a_idx] == idx2y[b_idx]:
                train_y.append(1)  # similar
            else:
                train_y.append(0)  # not similar
                
            train_token_ids_a.append(idx2token_ids[a_idx])
            train_token_ids_b.append(idx2token_ids[b_idx])

            train_token_len_a.append(idx2len[a_idx])
            train_token_len_b.append(idx2len[b_idx])
            
            
            
print(f'Training pairs: {len(train_y):,}')

Training pairs: 77,246


In [23]:
# Print some examples
for i in range(3):
    print(train_token_ids_a[i])
    print(train_token_ids_b[i])
    print(train_y[i])
    print('----')

tensor([191, 219, 565, 805])
tensor([  41,   33,   51,    4,   13, 2029, 5447,    2])
1
----
tensor([191, 219, 565, 805])
tensor([12910,   285,     3,     0,   795,  1152,    13, 68229,    11,   793,
         9119,     2])
1
----
tensor([191, 219, 565, 805])
tensor([  61,   41,  809,   77, 1095,   10, 3330,   22,   37, 2551,   22,    0,
         156,    3,  478,    1,   41, 5572,   20,  100,  181,   12,   41,  116,
           7, 3611, 1812,    4, 2432,  645,  168,    2])
1
----


In [25]:
assert len(train_token_ids_a) == len(train_token_ids_b) 
assert len(train_token_ids_a) == len(train_y)
assert len(train_token_ids_a) == len(train_token_len_b)
assert len(train_token_len_a) == len(train_token_len_b)

len(train_token_ids_a)

77246

In [27]:
# Add padding + limit to max length

# , padding_value=0
train_idxs_pad_a = pad_sequence(train_token_ids_a, batch_first=True)[:,:max_seq_length]
train_idxs_pad_b = pad_sequence(train_token_ids_b, batch_first=True)[:,:max_seq_length]

train_idxs_pad_a.size()

torch.Size([77246, 64])

In [85]:
# Build data loader
train_dataset = TensorDataset(
    train_idxs_pad_a, torch.tensor(train_token_len_a),
    train_idxs_pad_b, torch.tensor(train_token_len_b),
    torch.tensor(train_y)
)  #  dtype=torch.float

train_sampler = RandomSampler(train_dataset)
#train_sampler = SequentialSampler(train_dataset)  # only to debugging (real training should be random)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size) #, collate_fn=pad_collate)

# Define model

In [101]:
class SiameseSentenceLSTM(nn.Module):
    def __init__(self, embedding_layer, embed_dim, hidden_dim, batch_size, out_dim):
        super().__init__()

        self.embed_dim = embed_dim        
        self.embed = embedding_layer
        
        self.n_layers = 1
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.bidirectional = False
        self.directions = 2 if self.bidirectional else 1
        
        self.lstm = nn.LSTM(
            input_size=self.embed_dim, 
            hidden_size=self.hidden_dim, 
            num_layers=1, 
            bidirectional=self.bidirectional,
            batch_first=True
        )

        self.input_dim = int(5 * self.directions * self.hidden_dim)
        self.mlp_dim = int(self.input_dim/2)
        
        self.out_dim = out_dim  # depends on how many labels we have (binary => dim = 1)

        self.classifier = nn.Sequential(  # two layers (maybe add more?)
            nn.Linear(self.input_dim, self.mlp_dim),
            nn.Linear(self.mlp_dim, self.out_dim)
        )
        
        self.prob = nn.Sigmoid()  # or softmax?
        
    def init_hidden_cell(self):
        """
        h_0 of shape (num_layers * num_directions, batch, hidden_size): tensor containing the initial hidden state for each element in the batch. If the LSTM is bidirectional, num_directions should be 2, else it should be 1.

        c_0 of shape (num_layers * num_directions, batch, hidden_size): tensor containing the initial cell state for each element in the batch.

        If (h_0, c_0) is not provided, both h_0 and c_0 default to zero.

        """
            
        # zero or random initialization
        # https://r2rt.com/non-zero-initial-states-for-recurrent-neural-networks.html
        
        rand_hidden = Variable(torch.randn(self.directions * self.n_layers, self.batch_size, self.hidden_dim))
        rand_cell = Variable(torch.randn(self.directions * self.n_layers, self.batch_size, self.hidden_dim))
        
        return rand_hidden, rand_cell
    
    def forward(self, sent1, len1, sent2, len2):

        # init hidden, cell
        h1, c1 = self.init_hidden_cell()
        h2, c2 = self.init_hidden_cell()

        # input one by one
        #for i in range(len(s1)):
        #    v1, h1, c1 = self.encoder(s1[i], h1, c1)
        
        # batch-wise: pack padded sequences (required for RNNs, other padding zeros will be evaluated)
        packed_output1, (h1, c1) = self.lstm(
            pack_padded_sequence(self.embed(sent1), len1, batch_first=True, enforce_sorted=False),
            (h1, c1)
        )
        
        packed_output2, (h2, c2) = self.lstm(
            pack_padded_sequence(self.embed(sent2), len2, batch_first=True, enforce_sorted=False),
            (h2, c2)
        )
        
        # unpack sequences
        output_padded1, output_lengths1 = pad_packed_sequence(packed_output1, batch_first=True)
        output_padded2, output_lengths2 = pad_packed_sequence(packed_output2, batch_first=True)
        
        # last word output
        #v1 = output_padded1[:,-1]
        #v2 = output_padded2[:,-1]
        
        # last hidden state (dim = batch_size x hidden_dim)
        v1 = h1[-1]
        v2 = h2[-1]
        

        # utilize these two encoded vectors
        #features = torch.cat((v1,torch.abs(v1 - v2),v2,v1*v2, (v1+v2)/2), 2)
        features = torch.cat((v1,torch.abs(v1 - v2),v2,v1*v2, (v1+v2)/2), dim=1)
        
        # run classical MLP on combined vectors
        output = self.classifier(features)
        
        # convert into 0-1 interval
        prob_out = self.prob(output)

        return prob_out



In [102]:
w2v_weights = torch.FloatTensor(w2v_model.vectors)  # load Glove embeddings
    
# Initialize model with our settings
model = SiameseSentenceLSTM(
    embedding_layer=nn.Embedding.from_pretrained(w2v_weights),
    embed_dim=w2v_model.vector_size,
    hidden_dim=100,
    batch_size=batch_size,
    out_dim=1,
)

# Training

In [103]:
epochs = 3

In [104]:
optimizer = Adam(model.parameters(), lr=learning_rate)  # maybe try different optimizer and loss function?
criterion = nn.BCEWithLogitsLoss()

# actual training loop
for epoch_num in range(epochs):
    train_loss = 0

    # iterate over each batch
    for step_num, batch_data in enumerate(tqdm(train_dataloader, desc=f'Epoch: {epoch_num + 1}/{epochs}')):
        
        # switch model to training mode, clear gradient accumulators
        model.train()
        optimizer.zero_grad() 
        
        # in case GPU is enabled, this sends data to GPU
        sent1, len1, sent2, len2, y = tuple(t.to(device) for t in batch_data) 
        
        # predict
        probas = model(sent1, len1, sent2, len2)  

        # for CrossEntropyLoss you must use .squeeze()
        batch_loss = criterion(probas.squeeze(), y.float())  # loss function
        train_loss += batch_loss.item()

        # backpropagate and update optimizer learning rate
        batch_loss.backward()
        optimizer.step()

    print(f'\r{epoch_num} loss: {train_loss / (step_num + 1)}')

HBox(children=(IntProgress(value=0, description='Epoch: 1/3', max=7725, style=ProgressStyle(description_width=…

0 loss: 0.5849032521170706


HBox(children=(IntProgress(value=0, description='Epoch: 2/3', max=7725, style=ProgressStyle(description_width=…

1 loss: 0.5738093224891181


HBox(children=(IntProgress(value=0, description='Epoch: 3/3', max=7725, style=ProgressStyle(description_width=…

2 loss: 0.5689087561534832


In [36]:
# TODO visualize training progress

4497.900374084711

# Test

In [106]:
model.eval()
all_y = None
all_probas = None

with torch.no_grad():
    for step_num, batch_data in enumerate(tqdm(train_dataloader, desc=f'Evaluate')):  # evaluate on TRAIN for debugging!
        sent1, len1, sent2, len2, y = tuple(t.to(device) for t in batch_data) 
        probas = model(sent1, len1, sent2, len2)  # predict
        
        # back to CPU, back to numpy
        probas = probas.cpu().detach().squeeze().numpy()
        y = y.cpu().detach().numpy()
                
        # append
        all_y = y if all_y is None else np.hstack((all_y, y))
        all_probas = probas if all_probas is None else np.hstack((all_probas, probas))
        
        #if len(all_y) > 1000:
        #    break

HBox(children=(IntProgress(value=0, description='Evaluate', max=7725, style=ProgressStyle(description_width='i…

In [107]:

t_max = 0.5  # classification threshold

print(classification_report(all_y, np.where(all_probas > t_max, 1, 0), target_names=['similar', 'not similar']))

              precision    recall  f1-score   support

     similar       0.50      0.30      0.37     21223
 not similar       0.77      0.88      0.82     56023

    accuracy                           0.72     77246
   macro avg       0.63      0.59      0.60     77246
weighted avg       0.69      0.72      0.70     77246



In [108]:
probas

array([0.9933501 , 0.27233917, 0.63532484, 0.98494476, 0.89609414,
       0.97353697], dtype=float32)