### Dependencies ###

In [158]:
import os
import torch
import numpy as np
import time
import pandas as pd

from scipy import spatial
print(os.listdir('train'))

['lcp_multi_train.tsv', 'lcp_single_train.tsv']


### Train data 

In [4]:
train_single_tsv = 'train/lcp_single_train.tsv'
df_train_single = pd.read_csv(train_single_tsv, sep='\t', header=0)

In [5]:
print("Data columns: \n")
print(df_train_single.columns)
print("Total corpus len: {}".format(len(df_train_single)))
print("Subcorpus len:\n")
print(df_train_single['corpus'].value_counts())

Data columns: 

Index(['id', 'corpus', 'sentence', 'token', 'complexity'], dtype='object')
Total corpus len: 7662
Subcorpus len:

biomed      2576
bible       2574
europarl    2512
Name: corpus, dtype: int64


### Test data ###

In [7]:
test_single_tsv = 'test/lcp_single_trial.tsv'
df_test_single = pd.read_csv(test_single_tsv, sep='\t', header=0)


In [8]:
print("Data columns: \n")
print(df_test_single.columns)
print("Total corpus len: {}".format(len(df_test_single)))
print("Subcorpus len:\n")
print(df_test_single['subcorpus'].value_counts())
print(os.getcwd())

Data columns: 

Index(['id', 'subcorpus', 'sentence', 'token', 'complexity'], dtype='object')
Total corpus len: 311
Subcorpus len:

europarl    143
bible       124
biomed       44
Name: subcorpus, dtype: int64
C:\Users\Simona Mihaylova


### GloVe ###
Load the pretrained GloVe vectors and verify that the operation has been successful by some quick experiments with the embedding.  

In [159]:
glove_w2v_loc = 'InferSent/GloVe/glove.840B.300d.txt'
with open(glove_w2v_loc,  "r", encoding="utf8") as lines:
    glove_w2v = {}
    for line in lines:
        values = line.split()
        word = ''.join(values[:-300])
        vector = np.asarray(values[-300:], dtype='float32')
        glove_w2v[word.lower()] = vector
    print(len(glove_w2v)," words loaded!")

1702835  words loaded!


In [30]:
def find_closest_embeddings(embedding):
    return sorted(glove_w2v.keys(), key=lambda word: spatial.distance.euclidean(glove_w2v[word.lower()], embedding))[0:5]

In [160]:
print("Demo of closest words:")
find_closest_embeddings(glove_w2v['baby'])

Demo of closest words:


['baby', 'rank_7', 'pushposters.com', 'seitp202', '765.361.6100']

In [15]:
print("Demo of word arithmetics:")
find_closest_embeddings(np.array(glove_w2v['king']) - np.array(glove_w2v['man']) + np.array(glove_w2v['woman']))

Demo of word arithmetics:


['woman', 'king', 'halfsugar', 'mattjgilbert', 'zephyp']

### InferSent
- https://towardsdatascience.com/learning-sentence-embeddings-by-natural-language-inference-a50b4661a0b8
- https://research.fb.com/downloads/infersent/

Load InferSent model and execute some experiments.  
**To Do:** Currently it is using GloVe. We should choose between GloVe or fastText vectors.

In [32]:
from InferSent.models import InferSent

In [33]:
model_pkl = 'InferSent/encoder/infersent1.pkl'
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
infer_sent_model = InferSent(params_model)
infer_sent_model.load_state_dict(torch.load(model_pkl))

<All keys matched successfully>

In [34]:
infer_sent_model.set_w2v_path(glove_w2v_loc)
infer_sent_model.build_vocab_k_words(K=100000)

# infer_sent_model.to(torch.device("cuda:0"))

Vocab size : 100000


In [35]:
infer_sent_model.encode(["This man is playing computer games"], tokenize=True)

array([[ 0.08556895, -0.02621041,  0.10144137, ..., -0.03926747,
        -0.03814263, -0.02820691]], dtype=float32)

In [36]:
def get_embedding_for_context(ctx):
    if not isinstance(ctx, list):
#       print("ctx is not list")
        ctx = [ctx]
    return infer_sent_model.encode(ctx, tokenize=True)

start = time.time()
get_embedding_for_context("This is a test sentence")
print("Time for single prediction: {}".format(time.time() - start))

get_embedding_for_context(["This is a test sentence"] * 3000)
print("Time for pred of 3000 cases: {}".format(time.time() - start))

Time for single prediction: 0.08101630210876465
Time for pred of 3000 cases: 19.849721431732178


In [38]:
print(len(all_sentences))
print(len(all_sentence_embeddings))

NameError: name 'all_sentences' is not defined

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

def measure_dist_between_ctx(c1, c2):
    e1 = get_embedding_for_context(c1)[0]
    e2 = get_embedding_for_context(c2)[0]
    #return spatial.distance.euclidean(e1, e2)
    return cosine_similarity([e1], [e2])

print(measure_dist_between_ctx("In India people are going to war.", "The family went to an indian restaurant."))
print(measure_dist_between_ctx("The baby is hungry.", "The child needs to eat."))
print(measure_dist_between_ctx("Programming takes ages to master.", "Ronaldo scored a goal against man united."))
print(measure_dist_between_ctx("At the university students go to lectures.", "Ronaldo scored a goal against man united."))
print(measure_dist_between_ctx("A soccer game with multiple males playing.", "Some men are playing a sport."))
print(measure_dist_between_ctx("The man is cooking chicken with potatoes.", "A man is driving down a lonely road."))
print(measure_dist_between_ctx("The man is cooking chicken with potatoes.", "In the restaurant they serve delicious food."))



[[0.47350663]]
[[0.662904]]
[[0.3901102]]
[[0.34307456]]
[[0.7366814]]
[[0.42583603]]
[[0.6617246]]


### Handcrafted features

* Word length
* Syllable count
* **To Do:** word frequency

In [80]:
import nltk
import csv

reader = csv.reader(open('SUBTLEX.csv', 'r'))
frequency = {}

for row in reader:
   frequency[row[0].lower()] = row[5]



In [121]:
import syllables

    
# According to the paper there are 3 handcrafted features
# - word lenghth
# - word frequency (TODO)
# - syllable count


def get_handcrafted_features(word):
    word = str(word)   
    return [len(word), syllables.estimate(word), float(frequency.get(word.lower()))]

get_handcrafted_features("Basketball")



[10, 3, 21.39]

### Load datasets

In [127]:
from torch.utils.data import Dataset

def preprocess_embeddings(dataset):
    # Preprocess all sentence embeddings for the data:
    sentence_embeddings = {}
    
    all_sentences = dataset['sentence'].tolist()

    start = time.time()
    all_sentence_embeddings = get_embedding_for_context(all_sentences)
    print("Encoding time for all sentences: {}".format(time.time() - start))
    return all_sentence_embeddings
    

class CompLexDataset(Dataset):
    global dataset_type
    
    def __init__(self, dataset_type):
        self.dataset_type = dataset_type
        
        if(self.dataset_type == 'train'):                   
            self.all_sentence_embeddings = preprocess_embeddings(df_train_single)
        elif(self.dataset_type == 'test'):
            self.all_sentence_embeddings = preprocess_embeddings(df_test_single)
    
    def __len__(self):
        if(self.dataset_type == 'train'):                   
            return len(df_train_single)
        elif(self.dataset_type == 'test'):
            return len(df_test_single)
        else: 
            raise Exception("Invalid dataset type.", self.dataset_type)

    
    def __getitem__(self,idx):
        start = time.time()
        if(self.dataset_type == 'train'):
            token = df_train_single.loc[idx, 'token']
            token = str(token)
            out = df_train_single.loc[idx, 'complexity']
        elif(self.dataset_type == 'test'):
            token = df_test_single.loc[idx, 'token']
            token = str(token)
            out = df_test_single.loc[idx, 'complexity']
        else: 
            raise Exception("Invalid dataset type.", self.dataset_type)
        
        handcrafted_features = get_handcrafted_features(token)

        sentence_ctx = self.all_sentence_embeddings[idx]
        
        if token.lower() in glove_w2v:   
            w2v_for_token = glove_w2v[token.lower()]
        else:
            #print("Token {} not found".format(token.lower()))
            w2v_for_token = [0] * 300
        
        
        result = {
            'inp': torch.from_numpy(np.hstack((np.array(handcrafted_features), sentence_ctx, np.array(w2v_for_token))).ravel()).float(), 
            'out': torch.from_numpy(np.array([out])).float()
        }
        
        #print("Idx {} fetch time: {}".format(idx, time.time() - start))
        return result
    

In [128]:
train_dataset = CompLexDataset("train")
print("Input: ", train_dataset[5]['inp'], "Input Length: ", len(train_dataset[5]['inp']))
print("Output: ", train_dataset[5]['out'])

test_dataset = CompLexDataset("test")
print("Input: ", test_dataset[5]['inp'], "Input Length: ", len(test_dataset[5]['inp']))
print("Output: ", test_dataset[5]['out'])

Encoding time for all sentences: 238.0510733127594
Input:  tensor([12.0000,  4.0000,  0.2400,  ...,  0.7953, -0.6708, -0.6317]) Input Length:  4399
Output:  tensor([0.3750])
Encoding time for all sentences: 54.997962474823
Input:  tensor([ 4.0000,  1.0000, 88.1200,  ...,  0.2892, -0.4453, -0.6912]) Input Length:  4399
Output:  tensor([0.0250])


### Network ###

In [129]:
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):

    def __init__(self):
        super().__init__()
        
        self.fc1 = nn.Linear(len(train_dataset[0]['inp']), 1600)
        self.b1 = nn.BatchNorm1d(1600)
        self.fc2 = nn.Linear(1600, 1)
        self.softmax = nn.Softmax(dim = 0) 


    def forward(self,x):

        x = self.fc1(x)
        #x = self.b1(x)
        x = self.fc2(x)

        return x
        #return self.softmax(x)
        
    
net = Network()
print(net)
#net.to(torch.device("cuda:0"))
train_dataset[0]

Network(
  (fc1): Linear(in_features=4399, out_features=1600, bias=True)
  (b1): BatchNorm1d(1600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=1600, out_features=1, bias=True)
  (softmax): Softmax(dim=0)
)


{'inp': tensor([ 5.0000,  2.0000, 14.6500,  ...,  0.3022,  0.3209,  1.2411]),
 'out': tensor([0.2143])}

In [130]:
def train(model, x, y, optimizer, criterion):
    model.zero_grad()
    output = model(x)
    loss = criterion(output,y)
    loss.backward()
    optimizer.step()

    return loss, output


In [131]:
print(torch.cuda.is_available())
print(torch.cuda.current_device())

True
0


### Mean Squared Error ###
Training phase

In [132]:
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
import time

criterion = nn.MSELoss()
EPOCHS = 24
BATCH_SIZE = 64
optm = Adam(net.parameters(), lr = 0.00001)

data_train = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)

for epoch in range(EPOCHS):
    epoch_loss = 0
    correct = 0
    
    for bidx, batch in enumerate(data_train):
        #start = time.time()
        x_train = batch['inp']
        y_train = batch['out']
        #print("Fetch time: {}".format(time.time() - start))
        
        #start = time.time()
        loss, predictions = train(net,x_train,y_train, optm, criterion)
        epoch_loss+=loss
        #print("Predict time: {}".format(time.time() - start))
        
    print('Epoch {} Loss : {}'.format((epoch+1),epoch_loss))

Epoch 1 Loss : 5.086705684661865
Epoch 2 Loss : 1.5304296016693115
Epoch 3 Loss : 1.3516559600830078
Epoch 4 Loss : 1.2493255138397217
Epoch 5 Loss : 1.1982018947601318
Epoch 6 Loss : 1.1727761030197144
Epoch 7 Loss : 1.145980954170227
Epoch 8 Loss : 1.1190011501312256
Epoch 9 Loss : 1.1269150972366333
Epoch 10 Loss : 1.0850476026535034
Epoch 11 Loss : 1.0977871417999268
Epoch 12 Loss : 1.0771631002426147
Epoch 13 Loss : 1.0818724632263184
Epoch 14 Loss : 1.064705729484558
Epoch 15 Loss : 1.0509209632873535
Epoch 16 Loss : 1.0449378490447998
Epoch 17 Loss : 1.0319753885269165
Epoch 18 Loss : 1.017499566078186
Epoch 19 Loss : 1.0253534317016602
Epoch 20 Loss : 1.0041208267211914
Epoch 21 Loss : 0.9970840215682983
Epoch 22 Loss : 1.0309712886810303
Epoch 23 Loss : 0.9920381307601929
Epoch 24 Loss : 0.9720224142074585


### Output for single sample

In [133]:
net(train_dataset[210]['inp'])

tensor([0.2903], grad_fn=<AddBackward0>)

### Mean Absolute Error ###

#### MAE for test dataset

In [157]:
from sklearn.metrics import mean_absolute_error

y_true = [test_dataset[i]['out'].item() for i in range(len(test_dataset))]
y_pred = []

test_loader = DataLoader(dataset = test_dataset, batch_size = BATCH_SIZE, shuffle = False)
for bidx, batch in enumerate(test_loader):
        #start = time.time()
        x_train = batch['inp']
        y_pred.append(net(x_train))

y_pred = [x.item() for i in range(len(y_pred)) for x in y_pred[i] ]

mae = mean_absolute_error(y_true, y_pred)
print("MAE for test data: ", mae)

MAE for test data:  0.08643300351581965


#### MAE for train dataset

In [153]:
from sklearn.metrics import mean_absolute_error

y_true = [train_dataset[i]['out'].item() for i in range(len(train_dataset))]
y_pred = []

test_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = False)
for bidx, batch in enumerate(test_loader):
        #start = time.time()
        x_train = batch['inp']
        y_pred.append(net(x_train))

y_pred = [x.item() for i in range(len(y_pred)) for x in y_pred[i] ]

mae = mean_absolute_error(y_true, y_pred)
print("MAE for train data: ", mae)

MAE for train data:  0.06779823770515156


#### MAE for total random

In [154]:
from sklearn.metrics import mean_absolute_error
import random

y_true = [train_dataset[i]['out'].item() for i in range(len(train_dataset))]
y_pred = [random.random() for i in range(len(train_dataset))]


mae = mean_absolute_error(y_true, y_pred)
print("Mean Absolute Error for train data: ", mae)

Mean Absolute Error for train data:  0.3085268124368329
