## Installing Sentence Transsformer and other models/frameworks

In [33]:
# pip install -U sentence-transformers
# pip install gensim


# Kindly add all your installations and versions if any in this cell.

## Importing necessary libraries. 
In the final version all imports should be stricly enlisted here.

In [34]:
import pandas as pd
import csv
import multiprocessing
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import gensim
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# import spacy
# from scipy import stats
from sklearn import linear_model

from sentence_transformers import SentenceTransformer, losses, models, util
# from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
# from sentence_transformers.readers import InputExample

from scipy.stats import spearmanr
# import torch 
# from torch.utils.data import DataLoader

## Load dataset: 7 marks
1 Download and unzip the dataset from this link http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz  **1 mark**

2 Complete the code in `read_sts_csv()`. **4.5 marks**

3 Create 3 dataframes one each for train, test and val and print their final shapes. **1.5 marks**

In [35]:
# For preprocessing in sts-train.csv, we removed "Europe Media Monitor (http://emm.newsbrief.eu)" from the entire csv file
# It reduced the number of buggy lines to less than 10 from more than 200.

# For preprocessing in sts-dev.csv, we removed "Europe Media Monitor (http://emm.newsbrief.eu)" from the entire csv file
# It reduced the number of buggy lines to less than 10 from more than 40.

# For preprocessing in sts-test.csv, we removed "Europe Media Monitor (http://emm.newsbrief.eu)" from the entire csv file along with all mentions of stack exchange links.
# It reduced the number of buggy lines from more than 300 to close to 0.

def read_sts_csv(dataset_type="dev", columns=['source', 'type', 'year', 'id', 'score', 'sent_a', 'sent_b']):
  path = INPUT_PATH + "/sts-"+ dataset_type + ".csv"
  df = pd.read_csv(path, sep='\t', header=None, on_bad_lines = 'skip',lineterminator='\n',quoting=csv.QUOTE_NONE, encoding='utf-8')
  df.columns = columns
  # df.describe()
  return df
  
INPUT_PATH = r"stsbenchmark"
df_dev = read_sts_csv("dev") # create the train, dev and test dataframes
df_train = read_sts_csv("train")
df_test = read_sts_csv("test")
df_train = df_train.dropna()
print("dev:",df_dev.shape)
print("train:",df_train.shape)
print("test:",df_test.shape)
# print(df_train['sent_a'])


dev: (1500, 7)
train: (5549, 7)
test: (1379, 7)


## Hyperparameters: 5 Marks
Update this cell with you choosen parameters except, NUM_EPOCHS

In [36]:
cores = multiprocessing.cpu_count() 
print(cores)

12


In [37]:

HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL ='sentence-transformers/all-mpnet-base-v2' # USE THE HUGGAING FACE VERSION OF SENTENCE_TRANSFORMER_TYPE
NON_CONEXTUAL_MODEL_TYPE = 'Word2Vec'
CONEXTUAL_MODEL_TYPE = HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL
INPUT_PATH = r"stsbenchmark"
BATCH_SIZE = 1024
OUT_DIM_DENSE = 128
NUM_EPOCHS = 2 ## THIS IS FIXED DO NOT CHANGE

# You are free to add your own hyperparameters as well.

In [38]:
parameters = {'n_estimators':60, 'max_depth': 5, 'min_samples_split':20}

## CONFIGURATION 1: Non-contextual Embeddings + ML Regression: 8 marks
1 Load the non-contextual embedding model in variable `non_cont_model1`. **1 marks**

2 Get feature for the sentences using the LM model loaded before. Add the code in the `get_feature_model1()` **2 marks**

2 Using features as X and score as Y, train a ML based regression model (`model1`). You are free to choose any sklearn based regression method, and its hyperparameters. **3.5 marks**

3 Print the correlation scores on the dev and test set predictions using trained `model1`. **1.5 mark**



In [39]:
def get_feature_model1(data_frame):
  """
  Input a data frame and return the embedding vectors for the each sentence column using non_cont_model1,
  Return 2 matrices each of shape (#_samples, #size_of_word_emb).
  """
  # print(data_frame['se'])
  a = []
  for i in data_frame['sent_a']:
    a.append(gensim.utils.simple_preprocess(i))
  # print(a)
  b = []
  for i in data_frame['sent_b']:
    b.append(gensim.utils.simple_preprocess(i))
    
  non_cont_model1.build_vocab(a)
  non_cont_model1.train(a, total_examples=non_cont_model1.corpus_count, epochs=NUM_EPOCHS, report_delay=1)
  # len(non_cont_model1.wv.vocab.keys())
  
  emb_a = []
  emb_b = []
  
  for i in range(len(a)):
    sentences_a = []
    sentences_b = []
    
    for j in range(len(a[i])):
      sentences_a.append(non_cont_model1.wv[a[i][j]])
      
    for k in range(len(a[i])):
      sentences_b.append(non_cont_model1.wv[a[i][k]])
      
    emb_a.append(sentences_a)
    emb_b.append(sentences_b)
    
    # print(emb_a)
  
  return emb_a,emb_b
  
def get_average_array(array_to_avg):
  ret_array = []

  for i in range(len(array_to_avg)):
    ret_array.append(np.average(array_to_avg[i]))

  return ret_array

non_cont_model1 = Word2Vec(min_count=1, window=2, workers=cores-1)


feature_1_train, feature_2_train = get_feature_model1(df_train)
feature_1_dev, feature_2_dev = get_feature_model1(df_dev)
feature_1_test, feature_2_test = get_feature_model1(df_test)

# print(feature_1_train[0])

feature_1_train_avg = get_average_array(feature_1_train)
feature_2_train_avg = get_average_array(feature_2_train)

feature_1_dev_avg = get_average_array(feature_1_dev)
feature_2_dev_avg = get_average_array(feature_2_dev)

feature_1_test_avg = get_average_array(feature_1_test)
feature_2_test_avg = get_average_array(feature_2_test)

X_train = np.column_stack((feature_1_train_avg,feature_2_train_avg))
Y_train = np.array(df_train['score'])

X_dev = np.column_stack((feature_1_dev_avg,feature_2_dev_avg))
Y_dev = df_dev.score

X_test = np.column_stack((feature_1_test_avg,feature_2_test_avg))
Y_test = df_test.score
# Initiate a regression model and train it.

reg = RandomForestRegressor()
reg.fit(X_train, Y_train)
print(reg.score(X_train, Y_train))

print(spearmanr(reg.predict(X_dev), Y_dev))
print(spearmanr(reg.predict(X_test), Y_test))


# print(reg)
# Print spearmanr correlation on the predicted output of the dev and test sets.
# print(feature_2_train)

0.5042613921948844
SpearmanrResult(correlation=-0.0586395598115972, pvalue=0.02313720013113114)
SpearmanrResult(correlation=-0.035959691124210845, pvalue=0.18201390445583188)


## CONFIGURATION 2: Contextual Embeddings + ML Regression: 7 marks
1 Load the contextual embedding model in variable `non_cont_model2`. **1 marks**

2 Get feature for the sentences using the LM model loaded before. Add the code in the `get_feature_model2()` **2 marks**

2 Using features as X and score as Y, train a ML based regression model (`model2`). You are free to choose any sklearn based regression method, and its hyperparameters. **3.5 marks**

3 Print the correlation scores on the dev and test set predictions using trained `model2`. **1.5 mark**

Useful references: https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [41]:
def get_feature_model2(data_frame):
  """
  Input a data frame and return the embedding vectors for the each sentence column using model2,
  Return 2 matrices each of shape (#_samples, #size_of_word_emb).
  """
  a = data_frame['sent_a']
  b = data_frame['sent_b']
  
  emb_a = non_cont_model2.encode(a)
  emb_b = non_cont_model2.encode(b)
  
  return emb_a, emb_b
  

non_cont_model2 = SentenceTransformer(HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL)

feature_1_train2, feature_2_train2 = get_feature_model2(df_train)
feature_1_dev2, feature_2_dev2 = get_feature_model2(df_dev)
feature_1_test2, feature_2_test2 = get_feature_model2(df_test)

# feature_1_<dataset_type>, feature_2_<dataset_type> = get_feature_model2(data_frame)
# X_<dataset_type>, Y_<dataset_type> = 
# Initiate a regression model and train it.
# Print spearman correlation on the predicted output of the dev and test sets.
X_train2 = np.column_stack((feature_1_train2,feature_2_train2))
Y_train2 = np.array(df_train['score'])

X_dev2 = np.column_stack((feature_1_dev2,feature_2_dev2))
Y_dev2 = df_dev.score

X_test2 = np.column_stack((feature_1_test2,feature_2_test2))
Y_test2 = df_test.score

reg2 = RandomForestRegressor()
reg2.fit(X_train2, Y_train2)
print(reg2.score(X_train2, Y_train2))

print(spearmanr(reg2.predict(X_dev2), Y_dev2))
print(spearmanr(reg2.predict(X_test2), Y_test2))

0.8993502746932975
SpearmanrResult(correlation=0.6227549570137937, pvalue=7.759377489794083e-162)
SpearmanrResult(correlation=0.6044111676694474, pvalue=4.100804715813971e-138)


## CONFIGURATION 3: Fine-Tune a Contextual Embeddings Model: 18 marks
1 Prepare data samples to be for the DL model to consume. Add the code in the `form_data()`. **4 marks**

3 Create the data loader, one each for train/dev/test data_input sample set obtained from `form_input_example()`. **1.5 marks**

4 Initiate `model3` consisting of **atleast** the following 3 components - `base_LM`, a `pooling_layer` and a `dense_layer`. Use appropriate activation function in dense. **Atleast** one layer of `base_LM` should be set to trainable. **5 marks**

6 Initiate the `loss`. **0.5 marks**

7 Fit the `model3`. Use `NUM_EPOCHS = 2`. **MAX_NUM_EPOCHS allowed will be 3**. **2 marks** 

8 Complete the `get_model_predicts()` to obtain predicted scores for input sentence pairs. **3.5 marks** 

9 Print the correlation scores on the dev and test set predictions. **1.5 mark**

Useful References: https://huggingface.co/blog/how-to-train-sentence-transformers 

In [9]:
def form_data(data_frame):
  """
  Input a data frame and return the dataloder.
  """

def get_model_predicts(data_type, trained_model):
  """
  Input the dataset list and return a list of cosine similarity scores. Use the fitted final_trainable_model for obtaining encodings.
  """

# dataloader_<dataset_type> = form_data(data_frame)
# base_model = 
# layer_ppoling = 
# layer_dense = 
# model3 = 
# loss =

# Fit the model3.
# Print spearman correlation on the predicted output of the dev and test sets.