In [None]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

import re

import pandas as pd

import time
import math

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib import style

import pickle

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Echo CUDA availability so the selected device is visible in the cell output.
print(torch.cuda.is_available())

from tqdm import tqdm

import math

# Introduction

This notebook contains a model I built from scratch to evaluate whether a keyword (a word or a short sentence) and a description (a long sentence or a paragraph) match. I used two encoder RNNs, one for the keyword and one for the description, plus an attention-like mechanism that is simpler than full attention (see the Model section for more detail).

# Define classes and functions

### Tool classes

In [None]:
# Global matplotlib setup: apply the ggplot style, then switch pyplot to the 'agg' backend.
style.use("ggplot")
plt.switch_backend('agg')
def showPlot(points):
    """Draw `points` with pyplot's default line plot.

    The `%matplotlib inline` magic is issued on every call, before plotting
    (the module-level setup above switched pyplot to 'agg').
    """
    %matplotlib inline
    plt.plot(points)

In [None]:
def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    Both fields are rendered with ``%d``, so any fractional part is truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report the elapsed time and a linear estimate of the time remaining.

    Args:
        since: start timestamp, on the same clock as ``time.time()``.
        percent: fraction of the work completed so far. It is used as a
            divisor, so it must be non-zero.

    Returns:
        ``'<elapsed> (- <remaining>)'`` with both parts formatted by asMinutes.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the fraction completed so far.
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

Getting a list of words from a sentence. This function is particularly useful for e-commerce sites that use Chinese as their main language. Note that you can modify this part to suit your needs.

In [None]:
# Compiled once instead of on every call.
# Splits on a single character in the \u4e00-\u9fa5 range or on a run of characters
# outside [a-zA-Z0-9_-]; the capturing group makes re.split() keep the separators.
_WORD_BOUNDARY_RE = re.compile('([\u4e00-\u9fa5]|[^a-zA-Z0-9_-]+)')
# Isolates each \u4e00-\u9fa5 character that ended up inside a longer separator run
# (this happens when the run starts with a space or punctuation).
_CJK_CHAR_RE = re.compile(r"([\u4e00-\u9fa5])")


def get_word_list(s1):
    """Tokenize text into lower-cased tokens.

    The input is converted with ``str()`` and lower-cased. Runs of
    ``[a-z0-9_-]`` stay together as one token, every character in the
    ``\\u4e00-\\u9fa5`` range becomes its own token, and the remaining
    separator runs (spaces, punctuation, ...) are kept as tokens too.
    Empty pieces are dropped.

    Args:
        s1: any object; ``str(s1)`` is what gets tokenized.

    Returns:
        list[str]: the tokens in their original order.
    """
    pieces = _WORD_BOUNDARY_RE.split(str(s1).lower())
    # re.split() always returns a list, so each piece can be re-split
    # unconditionally; the second pass only changes mixed separator runs.
    return [
        token
        for piece in pieces
        for token in _CJK_CHAR_RE.split(piece)
        if token
    ]

### Dataset classes

I defined my own dataset class, which stores the data in pandas objects.

In [None]:
class Vocabulary:
    """Two-way mapping between tokens and integer ids.

    Ids 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and <UNK>;
    words learned by build_vocabulary are numbered from 4 upwards.
    """

    def __init__(self, freq_threshold):
        # A word enters the vocabulary when its running count reaches this value.
        self.freq_threshold = freq_threshold
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {token: index for index, token in self.itos.items()}

    def __len__(self):
        """Number of known tokens, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenizer_ch(text):
        """Tokenize `text` with get_word_list."""
        return get_word_list(text)

    def build_vocabulary(self, sentence_list):
        """Assign ids to every word seen at least freq_threshold times.

        Ids are handed out in the order in which words reach the threshold.
        """
        counts = {}
        next_index = 4

        for sentence in sentence_list:
            for word in self.tokenizer_ch(sentence):
                counts[word] = counts.get(word, 0) + 1
                # Register the word exactly once: at the moment it hits the threshold.
                if counts[word] != self.freq_threshold:
                    continue
                self.stoi[word] = next_index
                self.itos[next_index] = word
                next_index += 1

    def numericalize(self, text):
        """Convert `text` to a list of token ids; unknown tokens map to <UNK>."""
        unknown_id = self.stoi["<UNK>"]
        return [self.stoi.get(token, unknown_id) for token in self.tokenizer_ch(text)]

In [None]:
class MyDataset(Dataset):
    """Keyword/description/label triples read from a CSV file.

    The CSV must provide the columns ``pre_search_word`` (keyword), ``name``
    (description) and ``y`` (label).

    Args:
        root_dir: stored on the instance; not used by this class otherwise.
        captions_file: path of the CSV file, read with ``pd.read_csv``.
        freq_threshold: minimum word count for a word to enter a newly built
            vocabulary. Ignored when `vocab` is given.
        vocab: an existing vocabulary to reuse (e.g. the training vocabulary
            for a test set). When None, a new one is built from the
            descriptions followed by the keywords.
    """

    def __init__(self, root_dir, captions_file, freq_threshold=5,vocab = None):
        self.root_dir = root_dir
        self.df = pd.read_csv(captions_file)

        # Keep direct handles on the three columns used by __getitem__.
        self.key = self.df["pre_search_word"]
        self.desc = self.df["name"]
        self.score = self.df["y"]

        # Identity check: `== None` would go through __eq__ of whatever is passed in.
        if vocab is None:
            self.vocab = Vocabulary(freq_threshold)
            self.vocab.build_vocabulary(self.desc.tolist()+self.key.tolist())
        else:
            self.vocab = vocab

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def _encode(self, text):
        """Return `text` as a 1-D tensor of token ids wrapped in <SOS> ... <EOS>."""
        stoi = self.vocab.stoi
        token_ids = [stoi["<SOS>"]]
        token_ids += self.vocab.numericalize(text)
        token_ids.append(stoi["<EOS>"])
        return torch.tensor(token_ids)

    def __getitem__(self, index):
        """Return ``(score, key_ids, desc_ids)`` tensors for row `index`."""
        score = torch.tensor(self.score[index])
        return score, self._encode(self.key[index]), self._encode(self.desc[index])

In [None]:
class MyCollate:
    """Collate function turning ``(score, key, desc)`` samples into batch tensors.

    Keys and descriptions are padded with `pad_idx` to the longest sequence of
    the batch and laid out time-major, i.e. ``(max_len, batch)``.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def _pad(self, sequences):
        """Pad a list of 1-D tensors into one ``(max_len, batch)`` tensor."""
        return pad_sequence(sequences, batch_first=False, padding_value=self.pad_idx)

    def __call__(self, batch):
        score_batch = torch.tensor([sample[0] for sample in batch])
        key_batch = self._pad([sample[1] for sample in batch])
        desc_batch = self._pad([sample[2] for sample in batch])
        return score_batch, key_batch, desc_batch

In [None]:
def get_loader( root_folder,
                file_name,
                batch_size=32,
                num_workers=8,
                shuffle=False,
                pin_memory=True,
                dataset = None,
                start_from = 0,
                vocab = None):
    """Build a DataLoader (and, if needed, the dataset behind it).

    Args:
        root_folder: forwarded to MyDataset when a dataset has to be created.
        file_name: CSV path forwarded to MyDataset when a dataset has to be created.
        batch_size, num_workers, shuffle, pin_memory: forwarded to DataLoader.
        dataset: an existing dataset to wrap. When None, a MyDataset is built
            from `file_name`.
        start_from: number of leading rows to drop before building the loader
            (used to resume a run). The dataset is modified IN PLACE.
        vocab: vocabulary forwarded to MyDataset; only used when `dataset` is None.

    Returns:
        ``(loader, dataset)``.

    Raises:
        ValueError: if `start_from` is negative. A negative value would make
            the slice below keep only the LAST rows instead of skipping rows.
    """
    if start_from < 0:
        raise ValueError(
            "start_from must be >= 0, got %d: the requested rows are not "
            "available in this dataset" % start_from
        )

    if dataset is None:
        dataset = MyDataset(root_folder, file_name,vocab = vocab)

    if start_from != 0:
        # Drop the rows that were already consumed and renumber from 0 so that
        # positional lookups in __getitem__ keep working.
        for attribute in ("df", "key", "desc", "score"):
            remaining = getattr(dataset, attribute)[start_from:].reset_index(drop=True)
            setattr(dataset, attribute, remaining)

    pad_idx = dataset.vocab.stoi["<PAD>"]

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx=pad_idx),
    )

    return loader, dataset


In [None]:
def get_or_load_data(batch_size = 1,from_csv_file = False,save = False,csv_file = "train.csv",start_from = 0, vocab = None):
    """Create the data loader, either from a CSV file or from the pickled cache.

    Args:
        batch_size: batch size of the returned loader.
        from_csv_file: when true, build the dataset from `csv_file`; otherwise
            load it from ``dataloader.pkl``.
        save: when true, write the resulting dataset back to ``dataloader.pkl``.
        csv_file: CSV path used when `from_csv_file` is true.
        start_from: absolute row offset to resume from. Only used when loading
            from the cache.
        vocab: vocabulary to reuse. Only used when `from_csv_file` is true.

    Returns:
        ``(train_loader, dataset, total_batches, vocab_size)``.

    Raises:
        ValueError: if the cached dataset starts after `start_from`, i.e. the
            requested rows were already dropped from the cache.

    Cache format: two consecutive pickles, the dataset and the absolute row
    offset at which it starts. Files holding only the dataset (what earlier
    versions of this function wrote) are still accepted; the offset then
    defaults to 0.
    """
    if from_csv_file:
        train_loader, dataset = get_loader("", csv_file ,batch_size = batch_size,vocab = vocab)
        data_start = 0
    else:
        # SECURITY: pickle.load can execute arbitrary code; only load cache
        # files that this notebook produced itself.
        with open('dataloader.pkl', 'rb') as cache_file:
            dataset = pickle.load(cache_file)
            try:
                data_start = pickle.load(cache_file)
            except EOFError:
                # Cache written without an offset: the dataset starts at row 0.
                data_start = 0

        if start_from < data_start:
            raise ValueError(
                "cached dataset starts at row %d, cannot resume from row %d; "
                "rebuild it with from_csv_file=True" % (data_start, start_from)
            )

        train_loader, dataset = get_loader("", csv_file ,batch_size = batch_size,dataset = dataset,start_from = start_from-data_start)
        # get_loader dropped the rows before start_from, so that is where the
        # dataset begins from now on.
        data_start = start_from

    if save:
        with open('dataloader.pkl', 'wb') as cache_file:
            pickle.dump(dataset, cache_file, pickle.HIGHEST_PROTOCOL)
            # Written as a second object so the load branch above can read it back.
            pickle.dump(data_start, cache_file, pickle.HIGHEST_PROTOCOL)

    total_batches = len(train_loader)
    vocab_size = len(dataset.vocab)

    return train_loader, dataset, total_batches, vocab_size

### Model

The idea is to have the model mimic how we judge the relation ourselves: first we memorize the keyword, then we read through the sentence to check whether the pattern of the keyword appears in it.

The training steps:
    1. We feed the keyword into EncoderK to encode it into a vector.
    2. We concatenate the encoded keyword to every word of the sentence, so that the model remembers the keyword while reading the sentence (one could also use an attention mechanism, but since our keywords are usually very short, I don't think that is necessary).
    3. We feed the concatenated sentence to EncoderD to make the prediction.

In [None]:
class EncoderK(nn.Module):
    """Keyword encoder: an embedding layer followed by a bidirectional LSTM.

    Args:
        vocab_size: number of rows of the embedding table.
        hidden_size: embedding width and LSTM hidden width.
        num_layers: number of stacked LSTM layers.
        p: dropout value handed to ``nn.LSTM``.
    """

    def __init__(self, vocab_size, hidden_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
            bidirectional=True,
        )

    def forward(self, x):
        """Encode a time-major batch of keyword token ids.

        Args:
            x: token ids of shape (seq_length, N), N being the batch size.

        Returns:
            A tuple ``(last_output, hidden, cell)`` where
            last_output is the LSTM output at the final time step, (N, 2 * hidden_size);
            hidden and cell are the final LSTM states, (num_layers * 2, N, hidden_size).
        """
        embedded = self.embedding(x)  # (seq_length, N, hidden_size)
        outputs, (hidden, cell) = self.rnn(embedded)  # outputs: (seq_length, N, 2 * hidden_size)
        return outputs[-1], hidden, cell

In [None]:
class EncoderD(nn.Module):
    """Description encoder and classifier.

    Every embedded description token is concatenated with the encoded keyword,
    the result goes through a bidirectional LSTM, and the output at the final
    time step is projected to two logits.

    Args:
        vocab_size: number of rows of the embedding table.
        hidden_size: embedding width and LSTM hidden width.
        num_layers: number of stacked LSTM layers.
        p: dropout probability (used for ``nn.LSTM`` and the explicit Dropout).
    """

    def __init__(self, vocab_size, hidden_size, num_layers, p):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        # Input width: hidden_size (token embedding) + 2 * hidden_size (encoded keyword).
        self.rnn = nn.LSTM(
            input_size=3 * hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
            bidirectional=True,
        )
        self.ff = nn.Linear(2 * hidden_size, 2)
        self.dropout = nn.Dropout(p)

    def forward(self, x, encodedK,hidden, cell):
        """Score a batch of descriptions against their encoded keywords.

        Args:
            x: description token ids, (seq_length, N).
            encodedK: encoded keyword, (N, 2 * hidden_size) or (1, N, 2 * hidden_size).
            hidden, cell: accepted for interface compatibility; not used here.

        Returns:
            Logits of shape (N, 2).
        """
        token_embedding = self.dropout(self.embedding(x))  # (seq_length, N, hidden_size)

        # Copy the keyword encoding to every time step: (seq_length, N, 2 * hidden_size).
        seq_length = token_embedding.shape[0]
        keyword_per_step = encodedK.repeat(seq_length, 1, 1)
        rnn_input = torch.cat((token_embedding, keyword_per_step), 2)

        outputs, _ = self.rnn(rnn_input)  # (seq_length, N, 2 * hidden_size)
        last_output = self.dropout(outputs[-1])
        return self.ff(last_output)

In [None]:
def get_encoders(new = False,model_num = 1):
    """Create fresh encoders or restore them from a pickled checkpoint.

    Args:
        new: when True, build new encoders from the module-level
            `vocab_size`, `hidden_size`, `num_layers` and `p`, and move them
            to `device`. Otherwise load ``model<model_num>.pkl``.
        model_num: number of the checkpoint file to load.

    Returns:
        ``(encoderK, encoderD, plot_losses, start_from)``. For new encoders the
        loss history is empty and start_from is 0.
    """
    if new == True:
        fresh_k = EncoderK(vocab_size,hidden_size,num_layers,p).to(device)
        fresh_d = EncoderD(vocab_size,hidden_size,num_layers,p).to(device)
        return fresh_k, fresh_d, [], 0

    # SECURITY: pickle.load can execute arbitrary code; only load checkpoint
    # files that this notebook produced itself.
    with open('model'+str(model_num)+'.pkl', 'rb') as checkpoint_file:
        # The four objects are stored back to back, in this order.
        restored_k = pickle.load(checkpoint_file)
        restored_d = pickle.load(checkpoint_file)
        loss_history = pickle.load(checkpoint_file)
        resume_from = pickle.load(checkpoint_file)
    return restored_k, restored_d, loss_history, resume_from

### Train function

In [None]:
def _save_checkpoint(path, encoder_k, encoder_d, losses, resume_from):
    """Pickle both encoders, the loss history and the resume offset to `path`, in that order."""
    with open(path, 'wb') as checkpoint_file:
        for obj in (encoder_k, encoder_d, losses, resume_from):
            pickle.dump(obj, checkpoint_file, pickle.HIGHEST_PROTOCOL)


def train_model(num_epochs = 1,plot_every = 500,print_every = 5000,save_every = 50000,start_from = 0):
    """Train both encoders on `train_loader`.

    Works on module-level state: `encoderK1`, `encoderD1`, their optimizers
    `encoderK_optimizer` / `encoderD_optimizer`, `criteria`, `train_loader`,
    `total_batches`, `plot_losses` and `device`. The number of batches
    processed in the current epoch is published in the global `batch_count`.

    Args:
        num_epochs: number of passes over `train_loader`.
        plot_every: append the average loss to `plot_losses` every this many batches.
        print_every: print timing and the average loss every this many batches.
        save_every: write a checkpoint every this many batches, alternating
            between ``model1.pkl`` and ``model2.pkl``.
        start_from: number of batches already processed in an earlier run;
            only used for progress reporting and stored in the checkpoints.

    At the end of every epoch a checkpoint with resume offset 0 is written to
    the slot that was NOT used by the latest periodic save.
    """
    global batch_count

    encoderD1.train()
    encoderK1.train()

    # Slot of the latest periodic save. Starting at 2 makes the end-of-epoch
    # save go to model1.pkl when no periodic save happened (previously `num`
    # was unbound in that case and the epoch ended with a crash).
    num = 2

    for ep in range(num_epochs):
        if num_epochs != 1:
            print("epoch: ",ep+1)
            print("=================================================================================================================")

        batch_count = 0
        plot_loss_total = 0
        print_loss_total = 0
        start = time.time()

        for scores, keys, descs in train_loader:

            encoderK_optimizer.zero_grad()
            encoderD_optimizer.zero_grad()

            keys = keys.to(device)
            scores = scores.to(device)
            descs = descs.to(device)

            encodedK, hidden, cell = encoderK1(keys)
            prediction = encoderD1(descs,encodedK,hidden,cell)

            loss = criteria(prediction, scores)
            loss.backward()

            encoderK_optimizer.step()
            encoderD_optimizer.step()

            # Accumulate a plain float: adding the tensor itself would keep its
            # autograd history alive and put tensors into plot_losses.
            batch_loss = loss.item()
            plot_loss_total += batch_loss
            print_loss_total += batch_loss
            batch_count+=1

            if batch_count % plot_every == 0:
                plot_losses.append(plot_loss_total / plot_every)
                plot_loss_total = 0

            if batch_count % print_every == 0:
                print('%s (%d %d%%) %.4f' % (timeSince(start, (batch_count) / (total_batches)),batch_count+start_from, (batch_count+start_from) / (total_batches+start_from) * 100, print_loss_total/print_every))
                print_loss_total = 0

            if batch_count % save_every == 0:
                # Alternate between the two files so that one intact checkpoint
                # survives if a save is interrupted.
                num = (batch_count//save_every)%2+1
                _save_checkpoint('model'+str(num)+'.pkl', encoderK1, encoderD1, plot_losses, batch_count+start_from)
                print("Saved model"+str(num)+".\nBatch count: ",batch_count)

        # The epoch is complete, so a run resumed from this file starts at batch 0.
        final_slot = num%2+1
        _save_checkpoint('model'+str(final_slot)+'.pkl', encoderK1, encoderD1, plot_losses, 0)
        print("Saved model"+str(final_slot)+". End of epoch ",ep+1)

### Evaluate function

In [None]:
def evaluate(test_loader,encoderK1,encoderD1,threshold = 0.5,sample_size = 10000,print_index = False):
    """Measure loss and accuracy of the encoder pair on `test_loader`.

    A sample counts as right when the predicted probability of its true class
    is greater than `threshold`. The loader must yield ONE sample per batch
    (``scores.item()`` is called on every batch). The loss is computed with the
    module-level `criteria`, and tensors are moved to the module-level `device`.

    Args:
        test_loader: iterable of ``(scores, keys, descs)`` batches of size 1.
        encoderK1, encoderD1: the keyword and description encoders.
        threshold: probability the true class must exceed; must be > 0.
        sample_size: stop after this many samples.
        print_index: when true, print the batch index and the true-class
            log-probability of every sample that is not counted as right.

    Returns:
        ``(average_loss, right_num, total_num)``. When the loader yields
        nothing, the average loss is NaN.

    Both encoders are put in eval mode for the run and switched to train mode
    afterwards, also when the run is interrupted by an exception.
    """
    right_num = 0
    total_num = 0
    total_loss = 0.0
    log_threshold = math.log(threshold)

    encoderK1.eval()
    encoderD1.eval()
    try:
        # The context manager closes the progress bar on every exit path.
        with torch.no_grad(), tqdm(total = sample_size,position=0, leave=True) as pbar:
            for idx, (scores, keys, descs) in enumerate(test_loader):
                keys = keys.to(device)
                scores = scores.to(device)
                descs = descs.to(device)

                encodedK, hidden, cell = encoderK1(keys)
                prediction = encoderD1(descs,encodedK,hidden,cell)

                loss = criteria(prediction, scores)

                # prediction is (N, classes): normalise over the class axis explicitly
                # instead of relying on the deprecated implicit dim.
                predic_prob = F.log_softmax(prediction, dim=1)
                true_class_log_prob = predic_prob[0][scores.item()].item()

                total_num += 1
                if true_class_log_prob > log_threshold:
                    right_num += 1
                elif print_index:
                    print(idx,true_class_log_prob)

                total_loss += loss.item()
                pbar.update()

                if total_num == sample_size:
                    break
    finally:
        encoderK1.train()
        encoderD1.train()

    if total_num == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return float("nan"), right_num, total_num

    return total_loss/total_num, right_num, total_num

# Start running

### Define hyperparameters

In [None]:
# Optimisation settings.
learning_rate = 0.0001  # step size of both SGD optimizers
num_epochs = 1
batch_size = 1  # samples per batch of the training loader


# Model size: hidden_size is both the embedding width and the LSTM hidden width of the two encoders.
hidden_size = 512
num_layers = 1
p = 0.2  # dropout value handed to both encoders

# NOTE(review): the training cell passes its own literal intervals to train_model() — confirm which values are intended.
plot_every = 500 #unit: batches
print_every = 5000


### Create model instance

In [None]:

# NOTE(review): with new=True, get_encoders() reads the global `vocab_size`, which is only assigned by the
# "Load data" cell further down. On a fresh kernel this cell presumably fails unless vocab_size is defined first — confirm the intended run order.
encoderK1, encoderD1, plot_losses, start_from = get_encoders(new = True,model_num =1) # new = False for loading model using pickle

# One SGD optimizer with momentum per encoder, both using the same learning rate.
encoderK_optimizer = optim.SGD(encoderK1.parameters(), lr=learning_rate,momentum=0.9)
encoderD_optimizer = optim.SGD(encoderD1.parameters(), lr=learning_rate,momentum=0.9)
# Cross-entropy loss applied to the two logits returned by EncoderD.
criteria = nn.CrossEntropyLoss()

### Load data

In [None]:
# from_csv_file=False loads the pickled dataset from 'dataloader.pkl'; start_from is the offset returned by get_encoders().
train_loader, dataset, total_batches, vocab_size = get_or_load_data(batch_size = batch_size,from_csv_file = False,save = False,start_from=start_from)
# Show the vocabulary size (special tokens included).
print(vocab_size)

### Train

In [None]:
# Run the training loop. The epoch count and the plot/print/save intervals (in batches) are given here as literals.
train_model(num_epochs = 1,plot_every = 5000,print_every = 50000,save_every = 500000,start_from = start_from)

In [None]:
# Manual checkpoint: run this cell to save the current state when training ended unexpectedly.

num = 2

# Same layout as the checkpoints written by train_model(): both encoders,
# the loss history, then the number of batches processed so far.
with open('model'+str(num)+'.pkl', 'wb') as output:
    for item in (encoderK1, encoderD1, plot_losses, batch_count+start_from):
        pickle.dump(item, output, pickle.HIGHEST_PROTOCOL)
print("Saved model"+str(num)+".\nBatch count: ",batch_count+start_from)

In [None]:
# Plot the averaged training losses collected in plot_losses.
showPlot(plot_losses)

### Evaluate

In [None]:
# Build the test loader directly from test.csv (one sample per batch), reusing the vocabulary of the training dataset.
test_loader, test_dataset, test_total_batches, _ = get_or_load_data(batch_size = 1,from_csv_file = True,save = False,csv_file = "test.csv",vocab = dataset.vocab)

In [None]:
# Evaluate on at most 5000 test samples; print_index=True also prints every sample that is not counted as right.
avg_loss, right_num, total_num = evaluate(test_loader,encoderK1,encoderD1,threshold = 0.5,sample_size = 5000,print_index=True)
# Report the accuracy (right / total) and the average loss.
print("right: ",right_num,"  out of: ",total_num,"  Accuracy: ",right_num/total_num)
print("avg loss: ",avg_loss)

In [None]:
# Marker cell: printing this confirms that execution reached the end of the notebook.
print("Done!")