In [1]:
import numpy as np
import pandas as pd

# Load CSV files.
#CSV task A 
def getData():
    df_train_data = pd.read_csv("data/Training_Data/subtaskA_data_all.csv")
    df_train_answers = pd.read_csv("data/Training_Data/subtaskA_answers_all.csv")

    df_train = pd.merge(df_train_data,df_train_answers,on='id', how='left').drop(['id'], axis=1)
    
    df_dev_data = pd.read_csv("data/Dev_Data/subtaskA_dev_data.csv")
    df_dev_answers = pd.read_csv("data/Dev_Data/subtaskA_gold_answers.csv")

    df_dev = pd.merge(df_dev_data,df_dev_answers,on='id', how='left').drop(['id'], axis=1)

    df_test_data = pd.read_csv("data/Test_Data/subtaskA_test_data.csv")
    df_test_answers = pd.read_csv("data/Test_Data/subtaskA_gold_answers.csv")

    df_test= pd.merge(df_test_data,df_test_answers,on='id', how='left').drop(['id'], axis=1)
    
    return df_train, df_dev, df_test

df_train_A, df_dev_A, df_test_A = getData()



In [2]:
from transformers import BertModel
from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model

In [3]:
model = BertClassifier(max_seq_length=64, train_batch_size=16)
model

Building sklearn text classifier...


In [4]:
def cosine_similarity(a, b):
    return np.array([np.dot(a[i, :], b[i, :])/(np.linalg.norm(a[i, :])*np.linalg.norm(b[i, :])) for i in range(a.shape[0])])

In [5]:
def top_n(sims, n=10):
    index = np.argsort(sims)[-n:]
    return np.sort(index)
def meanVect(vector):
    return vector.mean(axis=1)       

In [26]:
# subsample data
#take n% of data set
n=0.2
train = df_train_A.sample(frac=0.2, replace=True)

X_train = train[['sent0', 'sent1']]
y_train = train['answer']

# use the dev set for testing

X_test = df_dev_A[['sent0', 'sent1']]
y_test = df_dev_A['answer']

                                                                                                            

In [27]:

model = model.fit(X_train, y_train)

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 1800, validation data size: 200


Training  : 100%|████████████████████████████████████████████████████████| 113/113 [15:02<00:00,  7.98s/it, loss=0.719]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 25/25 [00:55<00:00,  2.22s/it]

Epoch 1, Train loss: 0.7194, Val loss: 0.6766, Val accy: 58.50%



Training  : 100%|████████████████████████████████████████████████████████| 113/113 [14:34<00:00,  7.74s/it, loss=0.571]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 25/25 [00:49<00:00,  1.99s/it]

Epoch 2, Train loss: 0.5709, Val loss: 0.6165, Val accy: 66.00%



Training  : 100%|████████████████████████████████████████████████████████| 113/113 [14:05<00:00,  7.48s/it, loss=0.209]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 25/25 [00:51<00:00,  2.07s/it]

Epoch 3, Train loss: 0.2091, Val loss: 0.6715, Val accy: 72.50%





In [28]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report



def test_performance(model, x_test, y_test):
    type(model)
    y_pred = model.predict(x_test)
    print(classification_report(y_pred=y_pred, y_true=y_test))
    return f1_score(y_pred=y_pred, y_true=y_test, average="binary")

In [29]:
f1 = test_performance(model, X_test, y_test)
print(f"f1 = {f1:.3f}")


Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:52<00:00,  1.38s/it]

              precision    recall  f1-score   support

           0       0.77      0.56      0.65       518
           1       0.63      0.82      0.72       479

    accuracy                           0.69       997
   macro avg       0.70      0.69      0.68       997
weighted avg       0.71      0.69      0.68       997

f1 = 0.715





In [12]:
#save model to disk
savefile = 'BERT-v1.bin'
model.save(savefile)



In [None]:
# load model from disk
new_model = load_model(savefile)

# predict with new model
accy = new_model.score(X_test, y_test)