In [11]:
import numpy as np
import pandas as pd


# Load CSV files.
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

#CSV task A 
df_train_data_A = pd.read_csv("data/Training_Data/subtaskA_data_all.csv")
df_train_answers_A = pd.read_csv("data/Training_Data/subtaskA_answers_all.csv")

df_train_A = pd.merge(df_train_data_A,df_train_answers_A,on='id', how='left').drop(['id'], axis=1)
df_train_A.head()

df_test_data_A = pd.read_csv("data/Test_Data/subtaskA_test_data.csv")
df_test_answers_A = pd.read_csv("data/Test_Data/subtaskA_gold_answers.csv")

df_test_A = pd.merge(df_test_data_A,df_test_answers_A,on='id', how='left').drop(['id'], axis=1)
df_test_A.head()



Unnamed: 0,sent0,sent1,answer
0,He loves to stroll at the park with his bed,He loves to stroll at the park with his dog.,0
1,The inverter was able to power the continent.,The inverter was able to power the house,0
2,The chef put extra lemons on the pizza.,The chef put extra mushrooms on the pizza.,0
3,sugar is used to make coffee sour,sugar is used to make coffee sweet,0
4,There are beautiful flowers here and there in ...,There are beautiful planes here and there in t...,1


In [2]:
from podium import Vocab, Field, LabelField
from podium.datasets import TabularDataset
from podium.vectorizers import GloVe

In [12]:
max_vocab_size = 10_000
vocab = Vocab(max_size=max_vocab_size, min_freq=2)

def lowercase(raw):
    return raw.lower()

S0 = Field(name='sent0', numericalizer=vocab)
S1 = Field(name='sent1', numericalizer=vocab)
LABEL = LabelField(name='answer') # Label field


fields = [
    S0,
    S1,
    LABEL,
]

train = TabularDataset.from_pandas(df_train_A, fields)
test = TabularDataset.from_pandas(df_test_A, fields)
train.finalize_fields()





In [13]:
glove = GloVe()
# Load only the vectors of vocab words.
embeddings = glove.load_vocab(vocab)

# Generate padded batch.
train_batch = train.batch(add_padding=True)
test_batch = test.batch(add_padding=True)

100%|███████████████████████████████████████████████████████████████████████████████| 862M/862M [03:44<00:00, 3.84MB/s]


In [3]:
def cosine_similarity(a, b):
    return np.array([np.dot(a[i, :], b[i, :])/(np.linalg.norm(a[i, :])*np.linalg.norm(b[i, :])) for i in range(a.shape[0])])

In [15]:
def top_n(sims, n=10):
    index = np.argsort(sims)[-n:]
    return np.sort(index)
def meanVect(vector):
    return vector.mean(axis=1)       

In [28]:
sentence0_train, sentence1_train = embeddings[train_batch['sent0']], embeddings[train_batch['sent1']]
sentence0_test, sentence1_test = embeddings[test_batch['sent0']], embeddings[test_batch['sent1']]

label_train = train_batch['answer']
label_test = test_batch['answer']

sentence0_train_mean, sentence1_train_mean = meanVect(sentence0_train), meanVect(sentence1_train)
sentence0_test_mean, sentence1_test_mean = meanVect(sentence0_test), meanVect(sentence1_test)

X_train_mul, X_test_mul = np.multiply(sentence0_train_mean, sentence1_train_mean), np.multiply(sentence0_test_mean, sentence0_test_mean)
X_train_cat, X_test_cat = np.concatenate((sentence0_train_mean, sentence1_train_mean), axis=1), np.concatenate((sentence0_test_mean, sentence1_test_mean), axis=1)
y_train, y_test = label_train.reshape(label_train.shape[0],), label_test.reshape(label_test.shape[0],)
                                                                                                            

In [29]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression as LR


#return a logisitc regression model fi
def train_model(X_train, y_train):
    lr_kwargs={"max_iter": 1000, "solver": "lbfgs"}
    return LR(**lr_kwargs).fit(X_train, y_train)

def test_performance(model, x_test, y_test):
    type(model)
    y_pred = model.predict(x_test)
    print(classification_report(y_pred=y_pred, y_true=y_test))
    return f1_score(y_pred=y_pred, y_true=y_test, average="binary")

In [30]:
print("Multiplied Representation: ")
lr = train_model(X_train_mul, y_train)


Multiplied Representation: 


In [31]:
f1 = test_performance(lr, X_test_mul, y_test)
print(f"f1 = {f1:.3f}") #found f1=0.439


              precision    recall  f1-score   support

           0       0.47      0.54      0.50       492
           1       0.48      0.41      0.44       508

    accuracy                           0.47      1000
   macro avg       0.47      0.48      0.47      1000
weighted avg       0.47      0.47      0.47      1000

f1 = 0.439


In [32]:
print("Concatenated Representation: ")
lr = train_model(X_train_cat, y_train)
f1 = test_performance(lr, X_test_cat, y_test)
print(f"f1 = {f1:.3f}") #found f1=0.586

Concatenated Representation: 
              precision    recall  f1-score   support

           0       0.57      0.54      0.55       492
           1       0.57      0.60      0.59       508

    accuracy                           0.57      1000
   macro avg       0.57      0.57      0.57      1000
weighted avg       0.57      0.57      0.57      1000

f1 = 0.586
