### Dependencies ###

In [1]:
import os
import torch
import numpy as np
import time
import pandas as pd

from scipy import spatial
print(os.getcwd())
print(os.listdir('../dataset/train'))

/home/borisovai/Documents/AI/NLP_Course/SemEval2021/baseline
['lcp_multi_train.tsv', 'lcp_single_train.tsv']


### Train data ###

In [2]:
train_single_tsv = '../dataset/train/lcp_single_train.tsv'
df_train_single = pd.read_csv(train_single_tsv, sep='\t', header=0)

In [3]:
print("Data columns: \n")
print(df_train_single.columns)
print("Total corpus len: {}".format(len(df_train_single)))
print("Subcorpus len:\n")
print(df_train_single['corpus'].value_counts())

Data columns: 

Index(['id', 'corpus', 'sentence', 'token', 'complexity'], dtype='object')
Total corpus len: 7662
Subcorpus len:

biomed      2576
bible       2574
europarl    2512
Name: corpus, dtype: int64


### Test data 

In [4]:
test_single_tsv = '../dataset/test/lcp_single_test.tsv'
df_test_single = pd.read_csv(test_single_tsv, sep='\t', header=0)


In [5]:
print("Data columns: \n")
print(df_test_single.columns)
print("Total corpus len: {}".format(len(df_test_single)))
print("Subcorpus len:\n")
print(df_test_single['corpus'].value_counts())
print(os.getcwd())

Data columns: 

Index(['id', 'corpus', 'sentence', 'token', 'complexity'], dtype='object')
Total corpus len: 808
Subcorpus len:

europarl    345
bible       233
biomed      230
Name: corpus, dtype: int64
/home/borisovai/Documents/AI/NLP_Course/SemEval2021/baseline


### Trial data 


In [6]:
trial_single_tsv = '../dataset/trial/lcp_single_trial.tsv'
df_trial_single = pd.read_csv(trial_single_tsv, sep='\t', header=0)


In [7]:
print("Data columns: \n")
print(df_trial_single.columns)
print("Total corpus len: {}".format(len(df_trial_single)))
print("Subcorpus len:\n")
print(df_trial_single['subcorpus'].value_counts())
print(os.getcwd())

Data columns: 

Index(['id', 'subcorpus', 'sentence', 'token', 'complexity'], dtype='object')
Total corpus len: 421
Subcorpus len:

europarl    143
bible       143
biomed      135
Name: subcorpus, dtype: int64
/home/borisovai/Documents/AI/NLP_Course/SemEval2021/baseline


### GloVe ###
Load the pretrained GloVe vectors and verify that the operation has been successful by some quick experiments with the embedding.  

In [8]:
glove_w2v_loc = 'InferSent/GloVe/glove.6B.300d.txt'
with open(glove_w2v_loc,  "r", encoding="utf8") as lines:
    glove_w2v = {}
    for line in lines:
        values = line.split()
        word = ''.join(values[:-300])
        vector = np.asarray(values[-300:], dtype='float32')
        glove_w2v[word.lower()] = vector
    print(len(glove_w2v)," words loaded!")

400000  words loaded!


In [9]:
def find_closest_embeddings(embedding):
    return sorted(glove_w2v.keys(), key=lambda word: spatial.distance.euclidean(glove_w2v[word.lower()], embedding))[0:5]

In [10]:
print("Demo of closest words:")
find_closest_embeddings(glove_w2v['baby'])

Demo of closest words:


['baby', 'babies', 'newborn', 'infant', 'birth']

In [11]:
print("Demo of word arithmetics:")
find_closest_embeddings(np.array(glove_w2v['king']) - np.array(glove_w2v['man']) + np.array(glove_w2v['woman']))

Demo of word arithmetics:


['king', 'queen', 'monarch', 'mother', 'princess']

### S-Bert

In [12]:
from sentence_transformers import SentenceTransformer

In [31]:
sbert_model = SentenceTransformer('stsb-distilbert-base')

In [32]:
def get_embedding_for_context(ctx):
    if not isinstance(ctx, list):
#       print("ctx is not list")
        ctx = [ctx]
    return sbert_model.encode(ctx)

start = time.time()
get_embedding_for_context("This is a test sentence")
print("Time for single prediction: {}".format(time.time() - start))

get_embedding_for_context(["This is a test sentence"] * 3000)
print("Time for pred of 3000 cases: {}".format(time.time() - start))

Time for single prediction: 0.04930377006530762
Time for pred of 3000 cases: 3.194972515106201


### Handcrafted features

* Word length
* Syllable count
* Word frequency

In [33]:
import syllables
# According to the paper there are 3 handcrafted features
# - word length
# - word frequency 
# - syllable count
import csv
import math
import nltk

from collections import defaultdict
from nltk.stem.porter import PorterStemmer

reader = csv.reader(open('SUBTLEX.csv', 'r'))
frequency = defaultdict(float)
frequency_count = dict()
stemmer = PorterStemmer()

for row in reader:
    token = stemmer.stem(row[0].lower())
    frequency[token] += float(row[5])
    
#     if token in frequency_count:
#         frequency_count[token] += 1
#     else:
#         frequency_count[token] = 1
    
# for key in frequency:
#     frequency[key] = frequency[key]/frequency_count[key]

frequency = {k: math.log2(v) for k, v in frequency.items()}

def get_handcrafted_features(word):
    word = str(word)
    return [len(word), syllables.estimate(word), frequency.get(stemmer.stem(word.lower())) or 0]

get_handcrafted_features("Basketball")

print(frequency)

{'\ufeffthe': 14.845939842929342, 'to': 14.469005792434924, 'a': 14.317361027449776, 'you': 15.353216449295463, 'and': 13.708726110836997, 'it': 14.218867423279539, 's': 14.339529229179508, 'of': 13.499110901833564, 'for': 12.751355760035558, 'i': 15.286671821258611, 'in': 13.254724532896342, 'on': 12.764121028281513, 'is': 13.137791675355247, 'that': 13.784973193108685, 'what': 13.265094891049188, 'be': 12.608368033514354, 'have': 12.655295919518291, 'are': 12.347751459914683, 'thi': 12.966343220703566, 'no': 12.544003281657064, 'we': 13.13761558954651, 'me': 13.173980008013814, 't': 13.811693785252627, 'there': 12.086540972282508, 'can': 12.360660564018824, 'here': 12.143813662214201, 'with': 12.301601935395265, 'but': 12.110137439155412, 'he': 12.899047201215264, 'now': 11.645068445838453, 'so': 12.051443478771702, 'my': 12.723390040398842, 'not': 12.405396784797198, 'do': 12.80714961323343, 'your': 12.699546427285094, 'at': 11.651545065931519, 'one': 11.632159136650143, 'up': 11.84

### Load datasets

In [34]:
from torch.utils.data import Dataset

def preprocess_embeddings(dataset):
    # Preprocess all sentence embeddings for the data:
    sentence_embeddings = {}
    
    all_sentences = dataset['sentence'].tolist()

    start = time.time()
    all_sentence_embeddings = get_embedding_for_context(all_sentences)
    print("Encoding time for all sentences: {}".format(time.time() - start))
    return all_sentence_embeddings
    

class CompLexDataset(Dataset):
    global dataset_type
    
    def __init__(self, dataset_type):
        self.dataset_type = dataset_type
        
        if(self.dataset_type == 'train'):                   
            self.all_sentence_embeddings = preprocess_embeddings(df_train_single)
        elif(self.dataset_type == 'trial'):
            self.all_sentence_embeddings = preprocess_embeddings(df_trial_single)
        elif(self.dataset_type == 'test'):
            self.all_sentence_embeddings = preprocess_embeddings(df_test_single)
    
    def __len__(self):
        if(self.dataset_type == 'train'):                   
            return len(df_train_single)
        elif(self.dataset_type == 'trial'):
            return len(df_trial_single)
        elif(self.dataset_type == 'test'):
            return len(df_test_single)
        else: 
            raise Exception("Invalid dataset type.", self.dataset_type)

    
    def __getitem__(self,idx):
        start = time.time()
        if(self.dataset_type == 'train'):
            token = df_train_single.loc[idx, 'token']
            token = str(token)
            out = df_train_single.loc[idx, 'complexity']
        elif(self.dataset_type == 'trial'):
            token = df_trial_single.loc[idx, 'token']
            token = str(token)
            out = df_trial_single.loc[idx, 'complexity']
        elif(self.dataset_type == 'test'):
            token = df_test_single.loc[idx, 'token']
            token = str(token)
            out = df_test_single.loc[idx, 'complexity']
        else: 
            raise Exception("Invalid dataset type.", self.dataset_type)
        
        handcrafted_features = get_handcrafted_features(token)

        sentence_ctx = self.all_sentence_embeddings[idx]
        
        if token.lower() in glove_w2v:   
            w2v_for_token = glove_w2v[token.lower()]
        else:
            #print("Token {} not found".format(token.lower()))
            w2v_for_token = [0] * 300
        
        
        result = {
            'inp': torch.from_numpy(np.hstack((np.array(handcrafted_features), sentence_ctx, np.array(w2v_for_token))).ravel()).float(), 
            'out': torch.from_numpy(np.array([out])).float()
        }
        
        #print("Idx {} fetch time: {}".format(idx, time.time() - start))
        return result
    

In [35]:
train_dataset = CompLexDataset("train")
print("Input: ", train_dataset[5]['inp'], "Input Length: ", len(train_dataset[5]['inp']))
print("Output: ", train_dataset[5]['out'])

trial_dataset = CompLexDataset("trial")
print("Input: ", trial_dataset[5]['inp'], "Input Length: ", len(trial_dataset[5]['inp']))
print("Output: ", trial_dataset[5]['out'])

test_dataset = CompLexDataset("test")
print("Input: ", test_dataset[5]['inp'], "Input Length: ", len(test_dataset[5]['inp']))
print("Output: ", test_dataset[5]['out'])

Encoding time for all sentences: 53.13969278335571
Input:  tensor([12.0000,  4.0000, -0.3771,  ...,  0.2898,  0.1594, -0.2284]) Input Length:  1071
Output:  tensor([0.3750])
Encoding time for all sentences: 3.429360866546631
Input:  tensor([ 4.0000,  1.0000,  6.9694,  ..., -0.1070,  0.1319, -0.1578]) Input Length:  1071
Output:  tensor([0.0250])
Encoding time for all sentences: 5.819107294082642
Input:  tensor([ 4.0000,  1.0000,  4.1035,  ..., -0.0326,  0.3322, -0.7417]) Input Length:  1071
Output:  tensor([nan])


### Network ###

In [91]:
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):

    def __init__(self):
        super().__init__()
        
        self.fc1 = nn.Linear(len(train_dataset[0]['inp']), 1600)
        self.fc2 = nn.Linear(1600, 1600)
        self.fc3 = nn.Linear(1600, 1)

        self.sigmoid = nn.Sigmoid() 


    def forward(self,x):

        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)

        #return x
        return self.sigmoid(x)
        
    
net = Network()
print(net)
#net.to(torch.device("cuda:0"))
train_dataset[0]

Network(
  (fc1): Linear(in_features=1071, out_features=1600, bias=True)
  (fc2): Linear(in_features=1600, out_features=1600, bias=True)
  (fc3): Linear(in_features=1600, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


{'inp': tensor([ 5.0000,  2.0000,  3.8728,  ...,  0.0829, -0.5269,  0.1076]),
 'out': tensor([0.2143])}

In [92]:
def train(model, x, y, optimizer, criterion):
    model.zero_grad()
    output = model(x)
 #   print("output", output)
 #   print("y", y)
    loss = criterion(output,y)
    loss.backward()
    optimizer.step()

    return loss, output


In [93]:
print(torch.cuda.is_available())
print(torch.cuda.current_device())

True
0


### Mean Squared Error ###
Training phase

In [94]:
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
import time

criterion = nn.MSELoss()
EPOCHS = 30
BATCH_SIZE = 64
optm = Adam(net.parameters(), lr = 0.00001)

data_train = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)

for epoch in range(EPOCHS):
    epoch_loss = 0
    correct = 0
    
    for bidx, batch in enumerate(data_train):
        #start = time.time()
        x_train = batch['inp']
        y_train = batch['out']
        #print("Fetch time: {}".format(time.time() - start))
        
        #start = time.time()
        loss, predictions = train(net,x_train,y_train, optm, criterion)
        epoch_loss += loss
        #print("Predict time: {}".format(time.time() - start))
        
    print('Epoch {} Loss : {}'.format((epoch+1),epoch_loss))

Epoch 1 Loss : 1.516079306602478
Epoch 2 Loss : 0.9852264523506165
Epoch 3 Loss : 0.914534866809845
Epoch 4 Loss : 0.8748611211776733
Epoch 5 Loss : 0.8495355248451233
Epoch 6 Loss : 0.8376185894012451
Epoch 7 Loss : 0.8243715763092041
Epoch 8 Loss : 0.8150088787078857
Epoch 9 Loss : 0.8112455010414124
Epoch 10 Loss : 0.8045272827148438
Epoch 11 Loss : 0.8015268445014954
Epoch 12 Loss : 0.7961103916168213
Epoch 13 Loss : 0.7903420925140381
Epoch 14 Loss : 0.7914972305297852
Epoch 15 Loss : 0.7879045009613037
Epoch 16 Loss : 0.7853009700775146
Epoch 17 Loss : 0.7775622010231018
Epoch 18 Loss : 0.77826327085495
Epoch 19 Loss : 0.7745904922485352
Epoch 20 Loss : 0.7770098447799683
Epoch 21 Loss : 0.7737761735916138
Epoch 22 Loss : 0.7737058401107788
Epoch 23 Loss : 0.7762865424156189
Epoch 24 Loss : 0.767096757888794
Epoch 25 Loss : 0.7710388898849487
Epoch 26 Loss : 0.7699673175811768
Epoch 27 Loss : 0.7676612138748169
Epoch 28 Loss : 0.7623043060302734
Epoch 29 Loss : 0.7674205303192139

### Output for single sample

In [95]:
net(train_dataset[210]['inp'])

tensor([0.2625], grad_fn=<SigmoidBackward>)

### Mean Absolute Error ###

#### MAE for test dataset

In [96]:
# from sklearn.metrics import mean_absolute_error

# y_true = [test_dataset[i]['out'].item() for i in range(len(test_dataset))]
# y_pred = []

# test_loader = DataLoader(dataset = test_dataset, batch_size = BATCH_SIZE, shuffle = False)
# for bidx, batch in enumerate(test_loader):
#         #start = time.time()
#         x_train = batch['inp']
#         y_pred.append(net(x_train))

# y_pred = [x.item() for i in range(len(y_pred)) for x in y_pred[i] ]

# mae = mean_absolute_error(y_true, y_pred)
# print("MAE for test data: ", mae)

# with open('test_results.csv', 'w', newline='') as f:
#     f_writer = csv.writer(f, delimiter=',',)
#     for idx in range(len(df_test_single)):
#        f_writer.writerow((df_test_single.loc[idx, 'id'], str(y_pred[idx])))


#### MAE for trial dataset

In [102]:
from sklearn.metrics import mean_absolute_error

y_true = [trial_dataset[i]['out'].item() for i in range(len(trial_dataset))]
y_pred = []

trial_loader = DataLoader(dataset = trial_dataset, batch_size = BATCH_SIZE, shuffle = False)
for bidx, batch in enumerate(trial_loader):
        #start = time.time()
        x_train = batch['inp']
        y_pred.append(net(x_train))

y_pred = [x.item() for i in range(len(y_pred)) for x in y_pred[i] ]

mae = mean_absolute_error(y_true, y_pred)
print("MAE for trial data: ", mae)

MAE for trial data:  0.06923390283401958


#### MAE for train dataset

In [98]:
from sklearn.metrics import mean_absolute_error

y_true = [train_dataset[i]['out'].item() for i in range(len(train_dataset))]
y_pred = []

test_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = False)
for bidx, batch in enumerate(test_loader):
        #start = time.time()
        x_train = batch['inp']
        y_pred.append(net(x_train))

y_pred = [x.item() for i in range(len(y_pred)) for x in y_pred[i] ]

mae = mean_absolute_error(y_true, y_pred)
print("MAE for train data: ", mae)

MAE for train data:  0.0614881131207778


#### MAE for total random

In [99]:
from sklearn.metrics import mean_absolute_error
import random

y_true = [train_dataset[i]['out'].item() for i in range(len(train_dataset))]
y_pred = [random.random() for i in range(len(train_dataset))]


mae = mean_absolute_error(y_true, y_pred)
print("Mean Absolute Error for train data: ", mae)

Mean Absolute Error for train data:  0.3069512465588618


#### Demo

In [100]:
def prepare_sentence(sentence, token):
    sentence_embeddings = get_embedding_for_context(sentence)[0]
    handcrafted_features = get_handcrafted_features(token)
            
    if token.lower() in glove_w2v:   
        w2v_for_token = glove_w2v[token.lower()]
    else:
       w2v_for_token = [0] * 300
    
    return {
            'inp': torch.from_numpy(np.hstack((np.array(handcrafted_features), sentence_embeddings, np.array(w2v_for_token))).ravel()).float() 
           }

    

In [101]:
sentence = 'Peter loves pineapples and apples!'
token1 = 'pineapples'
token2 = 'apples'

y_pred1 = net(prepare_sentence(sentence, token1)['inp'])
print('SENTENCE: ', sentence, '\nTOKEN: ', token1, '\nCOMPLEXITY: ', y_pred1.item(), '\n')

y_pred2 = net(prepare_sentence(sentence, token2)['inp'])
print('SENTENCE: ', sentence, '\nTOKEN: ', token2, '\nCOMPLEXITY: ', y_pred2.item(), '\n')



SENTENCE:  Peter loves pineapples and apples! 
TOKEN:  pineapples 
COMPLEXITY:  0.3920523524284363 

SENTENCE:  Peter loves pineapples and apples! 
TOKEN:  apples 
COMPLEXITY:  0.2218542844057083 

