Source code for every experiment on subtask A with a BERT model

In [1]:
import numpy as np
import pandas as pd

# Load CSV files.
#CSV task A 
def getData():
    """
    Load the subtask A train, dev and test splits from CSV.

    For each split, the data file is left-joined with its answers file on
    the 'id' column, and 'id' is then dropped from the merged frame.

    Returns the tuple (df_train, df_dev, df_test).
    """
    # (data file, answers file) for train, dev and test, in return order.
    split_files = (
        ("data/Training_Data/subtaskA_data_all.csv",
         "data/Training_Data/subtaskA_answers_all.csv"),
        ("data/Dev_Data/subtaskA_dev_data.csv",
         "data/Dev_Data/subtaskA_gold_answers.csv"),
        ("data/Test_Data/subtaskA_test_data.csv",
         "data/Test_Data/subtaskA_gold_answers.csv"),
    )

    frames = []
    for data_path, answers_path in split_files:
        sentences = pd.read_csv(data_path)
        answers = pd.read_csv(answers_path)
        merged = sentences.merge(answers, on='id', how='left')
        frames.append(merged.drop(columns=['id']))

    return tuple(frames)

# Load the three splits once at module level. Later cells read df_train_A
# (subsampling) and df_dev_A (evaluation); df_test_A is not referenced in the
# cells visible here.
df_train_A, df_dev_A, df_test_A = getData()



In [2]:
import spacy
import nltk
from nltk.stem.porter import *

# Shared Porter stemmer instance, used by stemmatizer().
stemmer = PorterStemmer()
# Shared spaCy pipeline; every pre-processing helper calls nlp(text) to
# tokenize its input.
nlp = spacy.load("en_core_web_sm")

Methods to pre-process the dataframe

In [3]:
def lemmatizer(text):
    """
    Lemmatize a string with the shared spaCy pipeline.

    Each token's lemma is emitted preceded by a single space, so any
    non-empty result starts with a space.
    """
    return "".join(" " + token.lemma_ for token in nlp(text))

def stemmatizer(text):
    """
    Stem a string: tokenize it with the shared spaCy pipeline and apply the
    shared stemmer to every token's text.

    Each stem is emitted preceded by a single space, so any non-empty result
    starts with a space.
    """
    return "".join(" " + stemmer.stem(token.text) for token in nlp(text))
    
def ngrams(text, n):
    """
    Tokenize a text with the shared spaCy pipeline and return its n-grams.

    Parameters:
        text: string to tokenize.
        n: number of tokens per n-gram; must be at least 1.

    Returns:
        A list of tuples, each holding n consecutive token texts, in order of
        appearance. The list is empty when the text has fewer than n tokens.

    Raises:
        ValueError: if n is smaller than 1 (a window of that size has no
        meaning; it previously produced a list of empty tuples).
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    tokens = [token.text for token in nlp(text)]
    # Slide a window of width n over the tokens. The range is empty when the
    # text is shorter than n tokens, which yields an empty list.
    return [tuple(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]
    
def removeStopWords(text):
    """
    Drop stop words from a string.

    The text is tokenized with the shared spaCy pipeline; tokens flagged as
    stop words are skipped and every kept token is emitted preceded by a
    single space.
    """
    kept = [" " + token.text for token in nlp(text) if not token.is_stop]
    return "".join(kept)



In [4]:
def pre_process(df, function):
    """
    Apply a text transformation to the 'sent0' and 'sent1' columns.

    Parameters:
        df: DataFrame containing at least the 'sent0' and 'sent1' columns.
        function: callable applied to every value of both columns.

    Returns:
        A new DataFrame with only 'sent0' and 'sent1', holding the
        transformed values. The input frame is not modified.
    """
    # assign() builds a fresh frame, so nothing is written onto a slice of
    # `df` (the previous .loc assignment on a column subset is the pattern
    # that triggers pandas' SettingWithCopyWarning).
    return df[['sent0', 'sent1']].assign(
        sent0=df['sent0'].apply(function),
        sent1=df['sent1'].apply(function),
    )

Process the data frame and create a subsample of it

In [5]:
def subsampleData(n=1000, random_state=42):
    """
    Draw a reproducible random subsample of the training data.

    Reads the module-level df_train_A frame.

    Parameters:
        n: number of rows to sample (default 1000).
        random_state: seed forwarded to DataFrame.sample (default 42).

    Returns:
        (X_train, y_train): the 'sent0'/'sent1' columns and the 'answer'
        column of the sampled rows.
    """
    train = df_train_A.sample(n=n, random_state=random_state)
    X_train = train[['sent0', 'sent1']]
    y_train = train['answer']

    return X_train, y_train

# use the dev set for testing
# Evaluation inputs come from the dev split: the 'sent0'/'sent1' columns are
# the features and the 'answer' column is the label.
X_test = df_dev_A[['sent0', 'sent1']]
y_test = df_dev_A['answer']

Import the BERT model

In [6]:
from transformers import BertModel
from bert_sklearn import BertClassifier

In [7]:
# Classifier instance shared by the fit cells that call model.fit(...).
# The run logs of those cells show the bert-base-uncased checkpoint being loaded.
model = BertClassifier(max_seq_length=64, train_batch_size=16)
# Optional knob left disabled: number of MLP layers on top of BERT.
#model.num_mlp_layers = 3
# NOTE(review): repeats the max_seq_length already passed to the constructor;
# presumably redundant — confirm before removing.
model.max_seq_length = 64
model.epochs = 3
# Optional knob left disabled: learning rate.
#model.learning_rate = 4e-5

# Display the configured estimator.
model

Building sklearn text classifier...


In [8]:
X_train_sample, y_train = subsampleData()

Fit with different pre-processing types

In [9]:
X_train_sample.head()

Unnamed: 0,sent0,sent1
6252,a duck walks on three legs,a duck walks on two legs
4684,Jack's mom praised him because he broke the plate,Jack's mom condemned him because he broke the ...
1731,People use electricity to buy things,People use money to buy things
4742,"The speaker is damaged, thus I can't hear anyt...","The display is damaged, thus I can't hear anyt..."
4521,Santa Claus is the legend of the East,Santa Claus is the legend of the West


In [11]:
# Baseline: fit on the raw (un-preprocessed) training subsample.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# model_classic was never bound in that session; re-run to completion before
# scoring it.
model_classic = model.fit(X_train_sample, y_train)

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


Training  :   0%|                                                                               | 0/57 [00:28<?, ?it/s]


KeyboardInterrupt: 

With only lemma

In [19]:
# Lemmatized training subsample.
# NOTE(review): the generic name X_train is rebound by a later cell (stop-word
# removal), so any fit that reads X_train depends on which cell ran last.
X_train = pre_process(X_train_sample,lemmatizer)
X_train.head()

# Stemmed training subsample.
X_train_stem = pre_process(X_train_sample, stemmatizer)
X_train_stem.head()

# Stemmed dev set.
X_test_stem = pre_process(X_test, stemmatizer)
X_test_stem.head()

# Lemmatized dev set; being the last expression, its head is the cell output.
X_test_lemma = pre_process(X_test,lemmatizer)
X_test_lemma.head()

Unnamed: 0,sent0,sent1
0,"summer in North America be great for skiing ,...",summer in North America be great for swimming...
1,you can use detergent to dye your hair .,you can use bleach to dye your hair .
2,pass your drive license exam require study fo...,pass your university exam require study for y...
3,the hanger buy the closet,the closet get hanger
4,coffee take sleep,coffee depress people


In [20]:
# Fit on X_train, expected here to be the lemmatized training subsample.
# NOTE(review): every uncased experiment calls fit on the same `model`
# instance. If BertClassifier.fit returns that instance (the scikit-learn
# convention), all model_* names alias one object that only holds the most
# recent fit — confirm, and score each model right after fitting it.
model_lemma = model.fit(X_train, y_train)

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:26<00:00,  7.83s/it, loss=0.733]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:55<00:00,  4.24s/it]

Epoch 1, Train loss: 0.7330, Val loss: 0.7202, Val accy: 45.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:21<00:00,  7.74s/it, loss=0.696]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:30<00:00,  2.32s/it]

Epoch 2, Train loss: 0.6955, Val loss: 0.7025, Val accy: 46.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:20<00:00,  7.73s/it, loss=0.664]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:30<00:00,  2.33s/it]

Epoch 3, Train loss: 0.6645, Val loss: 0.6965, Val accy: 50.00%





Stemming only, then stop-word removal

In [12]:
# Fit on the stemmed training subsample.
model_stem = model.fit(X_train_stem, y_train)

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\utils\python_arg_parser.cpp:1485.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [08:03<00:00,  8.48s/it, loss=0.732]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:55<00:00,  4.26s/it]

Epoch 1, Train loss: 0.7324, Val loss: 0.7294, Val accy: 45.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [08:01<00:00,  8.45s/it, loss=0.694]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [01:02<00:00,  4.84s/it]

Epoch 2, Train loss: 0.6944, Val loss: 0.6955, Val accy: 48.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:42<00:00,  8.12s/it, loss=0.658]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:49<00:00,  3.83s/it]

Epoch 3, Train loss: 0.6579, Val loss: 0.6786, Val accy: 49.00%





In [9]:
from functools import partial

# NOTE(review): pipe_fn (ngrams with n=2) is built but never applied. Both
# "bigram" frames below are produced with removeStopWords, so they contain
# stop-word-removed text, not bigrams.
pipe_fn = partial(ngrams, n=2)
X_train_bigram = pre_process(X_train_sample,removeStopWords)
X_train_bigram.head()
X_test_bigram = pre_process(X_test,removeStopWords)
X_test_bigram.head()

Unnamed: 0,sent0,sent1
0,"Summer North America great skiing , snowsho...","Summer North America great swimming , boati..."
1,use detergent dye hair .,use bleach dye hair .
2,passing driving license exams requires studyi...,passing university exams requires studying cl...
3,hangers bought closet,closet got hangers
4,coffee takes sleep,coffee depresses people


In [14]:
# Fit on the "bigram" training frame, then score twice: on the matching
# pre-processed dev set and on the raw dev set.
# NOTE(review): test_performance is defined in a cell further down the
# notebook; that cell must have been run before this one.
model_bigram = model.fit(X_train_bigram, y_train)
f1binary = test_performance(model_bigram, X_test_bigram, y_test)
print(f"f1binary = {f1binary:.3f}")
f1binary = test_performance(model_bigram, X_test, y_test)
print(f"f1binary = {f1binary:.3f}")

Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:22<00:00,  1.14s/it]


              precision    recall  f1-score   support

           0       0.56      0.24      0.33       518
           1       0.49      0.80      0.61       479

    accuracy                           0.51       997
   macro avg       0.52      0.52      0.47       997
weighted avg       0.53      0.51      0.46       997

f1binary = 0.608


Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:24<00:00,  1.15s/it]

              precision    recall  f1-score   support

           0       0.64      0.01      0.03       518
           1       0.48      0.99      0.65       479

    accuracy                           0.48       997
   macro avg       0.56      0.50      0.34       997
weighted avg       0.56      0.48      0.33       997

f1binary = 0.648





In [15]:
# NOTE(review): despite the names, no trigrams are generated here. Both frames
# are produced with removeStopWords, the same transformation as the "bigram"
# frames.
X_train_trigram = pre_process(X_train_sample,removeStopWords)
X_train_trigram.head()
X_test_trigram = pre_process(X_test,removeStopWords)
X_test_trigram.head()

Unnamed: 0,sent0,sent1
0,"Summer North America great skiing , snowsho...","Summer North America great swimming , boati..."
1,use detergent dye hair .,use bleach dye hair .
2,passing driving license exams requires studyi...,passing university exams requires studying cl...
3,hangers bought closet,closet got hangers
4,coffee takes sleep,coffee depresses people


In [16]:
# Fit on the "trigram" training frame, then score twice: on the matching
# pre-processed dev set and on the raw dev set.
# NOTE(review): test_performance is defined in a cell further down the
# notebook; that cell must have been run before this one.
model_trigram = model.fit(X_train_trigram, y_train)
f1binary = test_performance(model_trigram, X_test_trigram, y_test)
print(f"f1binary = {f1binary:.3f}")
f1binary = test_performance(model_trigram, X_test, y_test)
print(f"f1binary = {f1binary:.3f}")

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:39<00:00,  8.05s/it, loss=0.731]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:52<00:00,  4.05s/it]

Epoch 1, Train loss: 0.7309, Val loss: 0.7355, Val accy: 45.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:16<00:00,  7.65s/it, loss=0.698]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:30<00:00,  2.32s/it]

Epoch 2, Train loss: 0.6975, Val loss: 0.7010, Val accy: 45.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:21<00:00,  7.74s/it, loss=0.675]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:31<00:00,  2.40s/it]

Epoch 3, Train loss: 0.6753, Val loss: 0.6932, Val accy: 54.00%



Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:33<00:00,  1.23s/it]


              precision    recall  f1-score   support

           0       0.56      0.24      0.33       518
           1       0.49      0.80      0.61       479

    accuracy                           0.51       997
   macro avg       0.52      0.52      0.47       997
weighted avg       0.53      0.51      0.46       997

f1binary = 0.608


Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:36<00:00,  1.25s/it]


              precision    recall  f1-score   support

           0       0.64      0.01      0.03       518
           1       0.48      0.99      0.65       479

    accuracy                           0.48       997
   macro avg       0.56      0.50      0.34       997
weighted avg       0.56      0.48      0.33       997

f1binary = 0.648


In [26]:
# Training subsample with stop words removed.
# NOTE(review): this rebinds X_train, which an earlier cell uses for the
# lemmatized frame.
X_train = pre_process(X_train_sample,removeStopWords)
X_train.head()
# Stop-word removal followed by stemming.
# NOTE(review): X_train_stop_stem is not used by any cell visible here.
X_train_stop_stem = pre_process(X_train, stemmatizer)
X_train_stop_stem.head()

# Dev set with stop words removed.
X_test_stopWord = pre_process(X_test,removeStopWords)
X_test_stopWord.head()

Unnamed: 0,sent0,sent1
0,"Summer North America great skiing , snowsho...","Summer North America great swimming , boati..."
1,use detergent dye hair .,use bleach dye hair .
2,passing driving license exams requires studyi...,passing university exams requires studying cl...
3,hangers bought closet,closet got hangers
4,coffee takes sleep,coffee depresses people


In [24]:
# Fit on X_train, expected here to be the stop-word-removed training subsample
# (the name X_train is also used for the lemmatized frame in an earlier cell).
model_stopWords = model.fit(X_train, y_train)

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:07<00:00,  7.50s/it, loss=0.731]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:31<00:00,  2.39s/it]

Epoch 1, Train loss: 0.7309, Val loss: 0.7355, Val accy: 45.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:16<00:00,  7.67s/it, loss=0.698]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:29<00:00,  2.24s/it]

Epoch 2, Train loss: 0.6975, Val loss: 0.7010, Val accy: 45.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:11<00:00,  7.58s/it, loss=0.675]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:29<00:00,  2.27s/it]

Epoch 3, Train loss: 0.6753, Val loss: 0.6932, Val accy: 54.00%





Score of models

In [12]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [13]:
def test_performance(model, x_test, y_test):
    y_pred = model.predict(x_test)
    print(classification_report(y_pred=y_pred, y_true=y_test))
    return f1_score(y_pred=y_pred, y_true=y_test, average="binary")

In [21]:
# Separate classifier instance configured to use the cased BERT checkpoint.
model_cased_base = BertClassifier(max_seq_length=64, train_batch_size=16)
model_cased_base.bert_model = 'bert-base-cased'
# NOTE(review): repeats the max_seq_length already passed to the constructor;
# presumably redundant — confirm before removing.
model_cased_base.max_seq_length = 64
model_cased_base.epochs = 3

# Display the configured estimator.
model_cased_base

Building sklearn text classifier...


In [22]:
# Fit the cased classifier on the stemmed training subsample.
model_cased = model_cased_base.fit(X_train_stem, y_train)


Loading bert-base-cased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [08:05<00:00,  8.52s/it, loss=0.704]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:49<00:00,  3.82s/it]


Epoch 1, Train loss: 0.7041, Val loss: 0.6993, Val accy: 47.00%


Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:18<00:00,  7.69s/it, loss=0.701]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:31<00:00,  2.46s/it]

Epoch 2, Train loss: 0.7009, Val loss: 0.6997, Val accy: 47.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:20<00:00,  7.73s/it, loss=0.699]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:30<00:00,  2.38s/it]

Epoch 3, Train loss: 0.6988, Val loss: 0.6980, Val accy: 47.00%





In [23]:
# Score the cased model on the dev set.
# NOTE(review): model_cased was fitted on stemmed text (X_train_stem) but is
# scored on the raw X_test; X_test_stem is the matching pre-processed dev set.
# Confirm the mismatch is intended.
f1binary = test_performance(model_cased, X_test, y_test)
print(f"f1binary = {f1binary:.3f}")

Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:34<00:00,  1.24s/it]


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       518
           1       0.48      1.00      0.65       479

    accuracy                           0.48       997
   macro avg       0.24      0.50      0.32       997
weighted avg       0.23      0.48      0.31       997

f1binary = 0.649


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Score the baseline model on the raw dev set. test_performance returns a
# single binary-F1 float, so the result is bound to one name (unpacking it
# into two names fails) and printed like the other score cells.
f1binary = test_performance(model_classic, X_test, y_test)
print(f"f1binary = {f1binary:.3f}")

NameError: name 'model_classic' is not defined

In [18]:
# Score the stemming experiment on the dev set.
# NOTE(review): model_stem was fitted on stemmed text (X_train_stem) but is
# scored on the raw X_test; X_test_stem is the matching pre-processed dev set.
# Confirm the mismatch is intended.
f1binary = test_performance(model_stem, X_test, y_test)
print(f"f1binary = {f1binary:.3f}")

Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:14<00:00,  1.08s/it]

              precision    recall  f1-score   support

           0       0.54      0.54      0.54       518
           1       0.51      0.51      0.51       479

    accuracy                           0.53       997
   macro avg       0.53      0.53      0.53       997
weighted avg       0.53      0.53      0.53       997

f1binary = 0.507





In [22]:
# Score the lemmatization experiment on the lemmatized dev set.
f1binary = test_performance(model_lemma, X_test_lemma, y_test)
print(f"f1binary = {f1binary:.3f}")

Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:25<00:00,  1.16s/it]

              precision    recall  f1-score   support

           0       0.50      0.37      0.43       518
           1       0.47      0.60      0.53       479

    accuracy                           0.48       997
   macro avg       0.49      0.49      0.48       997
weighted avg       0.49      0.48      0.48       997

f1binary = 0.528





In [27]:
# Score the stop-word-removal experiment on the stop-word-removed dev set.
f1binary = test_performance(model_stopWords, X_test_stopWord, y_test)
print(f"f1binary = {f1binary:.3f}")

Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:34<00:00,  1.24s/it]

              precision    recall  f1-score   support

           0       0.56      0.24      0.33       518
           1       0.49      0.80      0.61       479

    accuracy                           0.51       997
   macro avg       0.52      0.52      0.47       997
weighted avg       0.53      0.51      0.46       997

f1binary = 0.608





To save a model

In [None]:
# Save the model to disk.
# NOTE(review): `model` is the shared uncased classifier that several cells
# fit in turn; presumably it holds whichever fit ran last — confirm which
# experiment is meant to be saved.
savefile = 'BERT_TaskA.bin'
model.save(savefile)