In [2]:
import os
import torch
import numpy as np
import time
from scipy import spatial
print(os.listdir('train'))

['lcp_multi_train.tsv', 'lcp_single_train.tsv']


In [3]:
import pandas as pd
train_single_tsv = 'train/lcp_single_train.tsv'
df_train_single = pd.read_csv(train_single_tsv, sep='\t', header=0)

In [4]:
print("Data columns: \n")
print(df_train_single.columns)
print("Total corpus len: {}".format(len(df_train_single)))
print("Subcorpus len:\n")
print(df_train_single['corpus'].value_counts())

Data columns: 

Index(['id', 'corpus', 'sentence', 'token', 'complexity'], dtype='object')
Total corpus len: 7232
Subcorpus len:

biomed      2576
europarl    2512
bible       2144
Name: corpus, dtype: int64


In [5]:
glove_w2v_loc = 'InferSent/glove.6B.300d.txt'
with open(glove_w2v_loc,  "r", encoding="utf8") as lines:
    glove_w2v = {}
    for line in lines:
        split = line.split()
        word = split[0]
        vector = [float(i) for i in split[1:]]
        glove_w2v[word.lower()] = vector

In [6]:
def find_closest_embeddings(embedding):
    return sorted(glove_w2v.keys(), key=lambda word: spatial.distance.euclidean(glove_w2v[word.lower()], embedding))[0:5]

In [7]:
print("Demo of closest words:")
find_closest_embeddings(glove_w2v['baby'])

Demo of closest words:


['baby', 'babies', 'newborn', 'infant', 'birth']

In [8]:
print("Demo of word arithmetics:")
find_closest_embeddings(np.array(glove_w2v['king']) - np.array(glove_w2v['man']) + np.array(glove_w2v['woman']))

Demo of word arithmetics:


['king', 'queen', 'monarch', 'mother', 'princess']

In [9]:
find_closest_embeddings(np.array(glove_w2v['king']))

['king', 'queen', 'monarch', 'prince', 'kingdom']

In [10]:
find_closest_embeddings(np.array(glove_w2v['monk']) - np.array(glove_w2v['man']) + np.array(glove_w2v['woman']))

['monk', 'nun', 'woman', 'nuns', 'monks']

In [11]:
print("Demo of word arithmetics:")
find_closest_embeddings(np.array(glove_w2v['country']) + np.array(glove_w2v['europe']))

Demo of word arithmetics:


['europe', 'country', 'countries', 'european', 'nation']

Usage of InferSent
https://towardsdatascience.com/learning-sentence-embeddings-by-natural-language-inference-a50b4661a0b8


https://research.fb.com/downloads/infersent/

In [12]:
from InferSent.models import InferSent

In [13]:
model_pkl = 'InferSent/encoder/infersent1.pkl'
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
infer_sent_model = InferSent(params_model)
infer_sent_model.load_state_dict(torch.load(model_pkl))

<All keys matched successfully>

In [14]:
infer_sent_model.set_w2v_path(glove_w2v_loc)
infer_sent_model.build_vocab_k_words(K=100000)

# infer_sent_model.to(torch.device("cuda:0"))

Vocab size : 100000


In [15]:
infer_sent_model.encode(["This man is playing computer games"], tokenize=True)

array([[ 0.17863286,  0.08774211,  0.05200031, ...,  0.00108394,
        -0.0540266 ,  0.03372176]], dtype=float32)

In [21]:
def get_embedding_for_context(ctx):
    if not isinstance(ctx, list):
#       print("ctx is not list")
        ctx = [ctx]
    return infer_sent_model.encode(ctx, tokenize=True)

start = time.time()
get_embedding_for_context("This is a test sentence")
print("Time for single prediction: {}".format(time.time() - start))

get_embedding_for_context(["This is a test sentence"] * 3000)
print("Time for pred of 3000 cases: {}".format(time.time() - start))

Time for single prediction: 0.0425105094909668
Time for pred of 3000 cases: 5.273191928863525


In [17]:
# Preprocess all sentence embeddings for the data:
sentence_embeddings = {}
idx = 600
all_sentences = df_train_single['sentence'].tolist()
all_sentences[idx] == df_train_single.loc[idx, 'sentence']

start = time.time()
all_sentence_embeddings = get_embedding_for_context(all_sentences)
print("Encoding time for all sentences: {}".format(time.time() - start))
    

Encoding time for all sentences: 88.85007405281067


In [18]:
print(len(all_sentences))
print(len(all_sentence_embeddings))

7232
7232


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

def measure_dist_between_ctx(c1, c2):
    e1 = get_embedding_for_context(c1)[0]
    e2 = get_embedding_for_context(c2)[0]
    #return spatial.distance.euclidean(e1, e2)
    return cosine_similarity([e1], [e2])

print(measure_dist_between_ctx("In India people are going to war.", "The family went to an indian restaurant."))
print(measure_dist_between_ctx("The baby is hungry.", "The child needs to eat."))
print(measure_dist_between_ctx("Programming takes ages to master.", "Ronaldo scored a goal against man united."))
print(measure_dist_between_ctx("At the university students go to lectures.", "Ronaldo scored a goal against man united."))
print(measure_dist_between_ctx("A soccer game with multiple males playing.", "Some men are playing a sport."))
print(measure_dist_between_ctx("The man is cooking chicken with potatoes.", "A man is driving down a lonely road."))
print(measure_dist_between_ctx("The man is cooking chicken with potatoes.", "In the restaurant they serve delicious food."))



[[0.7978083]]
[[0.8062569]]
[[0.63243806]]
[[0.6913799]]
[[0.8188079]]
[[0.7767902]]
[[0.8434121]]


In [24]:
import syllables
# According to the paper there are 3 handcrafted features
# - word lenghth
# - word frequency (TODO)
# - syllable count
def get_handcrafted_features(word):
    word = str(word)
    return [len(word), syllables.estimate(word)]

get_handcrafted_features("Basketball")

[10, 3]

In [35]:
from torch.utils.data import Dataset

class CompLexDataset(Dataset):
    def __init__(self):
        pass
    
    def __len__(self):
        return len(df_train_single)
    
    def __getitem__(self, idx):
        start = time.time()
        
        token = df_train_single.loc[idx, 'token']
        token = str(token)
        out = df_train_single.loc[idx, 'complexity']
        
        handcrafted_features = get_handcrafted_features(token)
        sentence_ctx = all_sentence_embeddings[idx]
        
        if token.lower() in glove_w2v:   
            w2v_for_token = glove_w2v[token.lower()]
        else:
            #print("Token {} not found".format(token.lower()))
            w2v_for_token = [0] * 300
        
        
        result = {
            'inp': torch.from_numpy(np.hstack((np.array(handcrafted_features), sentence_ctx, np.array(w2v_for_token))).ravel()).float(),
            'out': torch.from_numpy(np.array([out])).float()
        }
        
        #print("Idx {} fetch time: {}".format(idx, time.time() - start))
        return result
    

dataset = CompLexDataset()

print("Input: ", dataset[5]['inp'], "Input Length: ", len(dataset[5]['inp']))
print("Output: ", dataset[5]['out'])

Input:  tensor([ 4.0000,  1.0000,  0.0877,  ..., -0.2311, -0.5910,  0.4979]) Input Length:  4398
Output:  tensor([0.1607])


In [32]:
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):

    def __init__(self):
        super().__init__()
        
        self.fc1 = nn.Linear(len(dataset[0]['inp']), 1600)
        self.b1 = nn.BatchNorm1d(1600)
        self.fc2 = nn.Linear(1600, 1)

    def forward(self,x):

        x = self.fc1(x)
        #x = self.b1(x)
        x = self.fc2(x)

        return x
    
net = Network()
#net.to(torch.device("cuda:0"))
dataset[0]

{'inp': tensor([5.0000, 2.0000, 0.1231,  ..., 0.5989, 0.3270, 0.6747]),
 'out': tensor([0.])}

In [37]:
def train(model, x, y, optimizer, criterion):
    model.zero_grad()
    output = model(x)
    loss = criterion(output,y)
    loss.backward()
    optimizer.step()

    return loss, output


In [39]:
#print(torch.cuda.is_available())
#print(torch.cuda.current_device())

In [41]:
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
import time

criterion = nn.MSELoss()
EPOCHS = 46
BATCH_SIZE = 64
optm = Adam(net.parameters(), lr = 0.001)

data_train = DataLoader(dataset = dataset, batch_size = BATCH_SIZE, shuffle = True)

for epoch in range(EPOCHS):
    epoch_loss = 0
    correct = 0
    
    for bidx, batch in enumerate(data_train):
        #start = time.time()
        x_train = batch['inp']
        y_train = batch['out']
        #print("Fetch time: {}".format(time.time() - start))
        
        #start = time.time()
        loss, predictions = train(net,x_train,y_train, optm, criterion)
        epoch_loss+=loss
        #print("Predict time: {}".format(time.time() - start))
        
    print('Epoch {} Loss : {}'.format((epoch+1),epoch_loss))

Epoch 1 Loss : 3.4574625492095947
Epoch 2 Loss : 0.9443999528884888
Epoch 3 Loss : 0.9375672340393066
Epoch 4 Loss : 0.9005191922187805
Epoch 5 Loss : 0.8740091323852539
Epoch 6 Loss : 0.8657763004302979
Epoch 7 Loss : 0.8575523495674133
Epoch 8 Loss : 0.8625211119651794
Epoch 9 Loss : 0.8669983744621277
Epoch 10 Loss : 0.827931821346283
Epoch 11 Loss : 0.8392904996871948
Epoch 12 Loss : 0.8486968874931335
Epoch 13 Loss : 0.8226974010467529
Epoch 14 Loss : 0.8279427289962769
Epoch 15 Loss : 0.803523600101471
Epoch 16 Loss : 0.8364858627319336
Epoch 17 Loss : 0.8414934873580933
Epoch 18 Loss : 0.8348140716552734
Epoch 19 Loss : 0.7871792912483215
Epoch 20 Loss : 0.7947569489479065
Epoch 21 Loss : 0.8353956937789917
Epoch 22 Loss : 0.8091203570365906
Epoch 23 Loss : 0.8333479762077332
Epoch 24 Loss : 0.8229992389678955
Epoch 25 Loss : 0.8168485164642334
Epoch 26 Loss : 0.8169122338294983
Epoch 27 Loss : 0.8129745125770569
Epoch 28 Loss : 0.7855275869369507
Epoch 29 Loss : 0.8218967318534

In [None]:
#TODO: Implement MAE for the NN and for a random values