Source code of every test for task B with a BERT model

In [1]:
import numpy as np
import pandas as pd

# Load CSV files.
# CSV task B
def getData():
    """
    Load the sub-task B train, dev and test splits from CSV.

    For each split the sentence file is left-joined with its answer file on
    the 'id' column, and the 'id' column is then dropped.

    Returns the tuple (df_train, df_dev, df_test) of merged DataFrames.
    """
    # (sentences CSV, answers CSV) for each split, in the order they are returned.
    split_files = (
        ("data/Training_Data/subtaskB_data_all.csv",
         "data/Training_Data/subtaskB_answers_all.csv"),
        ("data/Dev_Data/subtaskB_dev_data.csv",
         "data/Dev_Data/subtaskB_gold_answers.csv"),
        ("data/Test_Data/subtaskB_test_data.csv",
         "data/Test_Data/subtaskB_gold_answers.csv"),
    )

    merged_splits = []
    for sentences_path, answers_path in split_files:
        sentences = pd.read_csv(sentences_path)
        answers = pd.read_csv(answers_path)
        joined = sentences.merge(answers, on='id', how='left')
        merged_splits.append(joined.drop(columns=['id']))

    df_train, df_dev, df_test = merged_splits
    return df_train, df_dev, df_test

# Load the three sub-task B splits once; df_train_B and df_dev_B are read by later cells.
df_train_B, df_dev_B, df_test_B = getData()

# Preview the first rows of the merged training frame.
df_train_B.head()

Unnamed: 0,FalseSent,OptionA,OptionB,OptionC,answer
0,He poured orange juice on his cereal.,Orange juice is usually bright orange.,Orange juice doesn't taste good on cereal.,Orange juice is sticky if you spill it on the ...,B
1,He drinks apple.,Apple juice are very tasty and milk too,Apple can not be drunk,Apple cannot eat a human,B
2,"Jeff ran 100,000 miles today","100,000 miles is way to long for one person to...","Jeff is a four letter name and 100,000 has six...","100,000 miles is longer than 100,000 km.",A
3,I sting a mosquito,A human is a mammal,A human is omnivorous,A human has not stings,C
4,A giraffe is a person.,Giraffes can drink water from a lake.,A giraffe is not a human being.,.Giraffes usually eat leaves.,B


In [2]:
import spacy
# spaCy pipeline "en_core_web_sm"; the text helpers below call nlp(text) to tokenise.
nlp = spacy.load("en_core_web_sm")
import nltk
from nltk.stem.porter import *

# PorterStemmer is not imported by name; it presumably comes from the star import above.
# The instance is used by stemmatizer().
stemmer = PorterStemmer()

Methods to pre-process the dataframe

In [3]:
def lemmatizer(text):
    """
    Lemmatize a string.

    The text is tokenised with the module-level spaCy pipeline `nlp` and the
    lemma of every token is emitted preceded by a single space, so a
    non-empty result always starts with a space.
    """
    return "".join(" " + token.lemma_ for token in nlp(text))


def stemmatizer(text):
    """
    Stem a string.

    The text is tokenised with the module-level spaCy pipeline `nlp` and each
    token's text is passed through the module-level `stemmer`.

    Returns the stems joined into one string, each preceded by a single
    space (the same output convention as lemmatizer and removeStopWords).
    """
    # Fix: the stems used to be concatenated with an empty string, which fused
    # every token of the sentence into one unbroken word. A space separator
    # keeps the stems distinguishable.
    return "".join(" " + stemmer.stem(token.text) for token in nlp(text))

def removeStopWords(text):
    """
    Drop stop words from a string.

    The text is tokenised with the module-level spaCy pipeline `nlp`; tokens
    flagged `is_stop` are discarded and the remaining token texts are
    emitted, each preceded by a single space.
    """
    kept_words = [token.text for token in nlp(text) if not token.is_stop]
    return "".join(" " + word for word in kept_words)

def ngrams(text, n):
    """
    Build the n-grams of a text.

    The text is tokenised with the module-level spaCy pipeline `nlp`.
    Returns a list of tuples, one per window of `n` consecutive token texts,
    in the order the windows start.
    """
    tokens = [token.text for token in nlp(text)]
    window_count = len(tokens) - n + 1

    grams = []
    for start in range(window_count):
        # One window: the n tokens beginning at position `start`.
        grams.append(tuple(tokens[pos] for pos in range(start, start + n)))
    return grams


In [4]:
from nltk.corpus import wordnet as wn

In [5]:
def synsets(text, function):
    """
    Map every word of a text to the transformed WordNet synsets of that word.

    A space is inserted before every '.' so that a period following a word is
    split off as its own item, then the text is split on whitespace.

    Returns a list with one tuple per word; each tuple holds `function(s)`
    for every synset `s` that `wn.synsets(word)` yields.
    """
    words = text.replace(".", " .").split()
    return [tuple(map(function, wn.synsets(word))) for word in words]

def getLemmasNames(synset):
    """Return the name of every lemma of `synset`, converted with str(), as a list."""
    names = []
    for lemma in synset.lemmas():
        names.append(str(lemma.name()))
    return names

def getHypernyms(synset):
    """
    Return, for every hypernym of `synset`, the part of its name that
    precedes the first '.' (the whole name when it contains no '.').
    """
    heads = []
    for hypernym in synset.hypernyms():
        head, _, _ = hypernym.name().partition(".")
        heads.append(head)
    return heads

In [6]:
def pre_process(df, function):
    """
    Apply `function` to every cell of the four text columns of `df`.

    Parameters
    ----------
    df : DataFrame containing the columns 'FalseSent', 'OptionA', 'OptionB'
        and 'OptionC'. It is not modified.
    function : callable applied to each cell value of those columns.

    Returns
    -------
    A new DataFrame holding only the four text columns, with the same index
    as `df` and every cell replaced by `function(cell)`.
    """
    text_columns = ['FalseSent', 'OptionA', 'OptionB', 'OptionC']
    # Work on an explicit copy: assigning through .loc on a bare column slice
    # of `df` can trigger SettingWithCopyWarning and risks writing into the
    # caller's frame.
    newdf = df[text_columns].copy()
    for column in text_columns:
        newdf[column] = df[column].apply(function)
    return newdf

Process the data frame and create a subsample of it

In [7]:
def subsampleData(n=1000, random_state=42, data=None):
    """
    Draw a random subsample of training rows and split it into inputs and labels.

    Parameters
    ----------
    n : number of rows to sample (default 1000).
    random_state : seed passed to DataFrame.sample (default 42).
    data : DataFrame to sample from; when None, the module-level df_train_B
        is used.

    Returns
    -------
    (X_train, y_train): the four text columns of the sampled rows, and their
    'answer' column. Both keep the index of the sampled rows.
    """
    if data is None:
        data = df_train_B

    train = data.sample(n=n, random_state=random_state)

    X_train = train[['FalseSent', 'OptionA', 'OptionB', 'OptionC']]
    y_train = train['answer']
    return X_train, y_train

# Evaluation inputs and labels are taken from the dev split (df_dev_B), not from df_test_B.
X_test = df_dev_B[['FalseSent', 'OptionA', 'OptionB', 'OptionC']]
y_test = df_dev_B['answer']

In [8]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
def test_performance(model, x_test, y_test):
    """
    Evaluate a fitted model on a labelled set.

    Calls `model.predict(x_test)`, prints the classification report of the
    predictions against `y_test`, and returns the tuple
    (macro-averaged F1, micro-averaged F1).
    """
    predictions = model.predict(x_test)

    report = classification_report(y_true=y_test, y_pred=predictions)
    print(report)

    macro_f1 = f1_score(y_true=y_test, y_pred=predictions, average="macro")
    micro_f1 = f1_score(y_true=y_test, y_pred=predictions, average="micro")
    return macro_f1, micro_f1

Import the BERT model

In [9]:
from transformers import BertModel
from bert_sklearn import BertClassifier

In [10]:
# Single classifier instance shared by every experiment below; sequence length and
# batch size are passed to the constructor.
model = BertClassifier(max_seq_length=64, train_batch_size=16)
#model.num_mlp_layers = 3
# Repeats the max_seq_length value already given to the constructor.
model.max_seq_length = 64
# Number of training epochs (the training logs below show three epochs per fit).
model.epochs = 3
#model.learning_rate = 4e-5

# Bare expression: displays the configured estimator as the cell output.
model

Building sklearn text classifier...


In [11]:
# Draw the training subsample with subsampleData()'s defaults (1000 rows, random_state=42).
X_train_sample, y_train= subsampleData()

Fit with different pre-processing types

In [10]:
# Preview the first sampled training rows.
X_train_sample.head()

Unnamed: 0,FalseSent,OptionA,OptionB,OptionC
6252,a duck walks on three legs,the duck's legs are short,a duck has only two legs,a duck has two wings
4684,Jack's mom praised him because he broke the plate,"Breaking a plate is not a good thing, people w...","Plates are easy to break, people need to be ca...",Plates can be made of ceramic or plastic
1731,People use electricity to buy things,It is impossible to buy things with electricity,Electricity is essential to live,Many appliances in home works on electricity
4742,"The display is damaged, thus I can't hear anyt...",Display can also be used to create sound with ...,Display cannot be used to aid people's hearing,"Display is used to display things, people will..."
4521,Santa Claus is the legend of the East,Christmas is very grand in the West,The origin of Christmas is not in the East,Western countries are very respectful of Santa...


In [11]:
# Baseline: fit on the sampled sentences without any pre-processing, then score on the dev set.
# NOTE(review): model_classic is whatever model.fit() returns. If that is `model` itself
# (the scikit-learn convention; TODO confirm for bert_sklearn), any later fit on `model`
# also changes model_classic.
model_classic = model.fit(X_train_sample, y_train)
f1macro, f1micro = test_performance(model_classic, X_test, y_test)
print(f"f1micro = {f1micro:.3f} and "f"f1macro = {f1macro:.3f}")

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\utils\python_arg_parser.cpp:1485.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Training  : 100%|███████████████████████████████████████████████████████████| 57/57 [06:59<00:00,  7.37s/it, loss=1.07]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:36<00:00,  2.84s/it]

Epoch 1, Train loss: 1.0667, Val loss: 0.9957, Val accy: 53.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [07:02<00:00,  7.41s/it, loss=0.855]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:35<00:00,  2.73s/it]

Epoch 2, Train loss: 0.8551, Val loss: 0.9771, Val accy: 55.00%



Training  : 100%|██████████████████████████████████████████████████████████| 57/57 [06:57<00:00,  7.33s/it, loss=0.592]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:35<00:00,  2.71s/it]

Epoch 3, Train loss: 0.5921, Val loss: 1.0101, Val accy: 56.00%



Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:37<00:00,  1.26s/it]

              precision    recall  f1-score   support

           A       0.67      0.65      0.66       344
           B       0.38      0.28      0.32       327
           C       0.43      0.56      0.48       326

    accuracy                           0.50       997
   macro avg       0.49      0.49      0.49       997
weighted avg       0.49      0.50      0.49       997

f1micro = 0.496 and f1macro = 0.488





With lemma

In [14]:
from functools import partial

In [20]:
# Pre-process every text cell with synsets(..., function=getLemmasNames).
pipe_fn = partial(synsets, function=getLemmasNames)
X_train = pre_process(X_train_sample,pipe_fn)
X_test_lemma = pre_process(X_test,pipe_fn)

# NOTE(review): synsets() returns a list of tuples of lists, not a string, so the cells fed
# to fit() here are nested Python objects. Confirm the classifier handles such input.
model_lemma = model.fit(X_train, y_train)

# Score the fitted model twice: first on the raw dev sentences, then on the pre-processed ones.
f1macro, f1micro = test_performance(model_lemma, X_test, y_test)
print(f"f1micro = {f1micro:.3f} and "f"f1macro = {f1macro:.3f}")
f1macro, f1micro = test_performance(model_lemma, X_test_lemma, y_test)
print(f"f1micro = {f1micro:.3f} and "f"f1macro = {f1macro:.3f}")

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x00000266B0AF6E50>
Traceback (most recent call last):
  File "C:\Users\ellyn_vdxio7o\miniconda3\envs\tarProject\lib\site-packages\torch\utils\data\dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "C:\Users\ellyn_vdxio7o\miniconda3\envs\tarProject\lib\site-packages\torch\utils\data\dataloader.py", line 1436, in _shutdown_workers
    if self._persistent_workers or self._workers_status[worker_id]:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'


Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\utils\python_arg_parser.cpp:1485.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Training  : 100%|███████████████████████████████████████████████████████████| 57/57 [08:03<00:00,  8.48s/it, loss=1.12]
Validating: 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:41<00:00,  3.22s/it]

Epoch 1, Train loss: 1.1193, Val loss: 1.0992, Val accy: 31.00%



Training  :  74%|████████████████████████████████████████████▏               | 42/57 [06:30<02:03,  8.25s/it, loss=1.1]

With hypernyms

In [None]:
# Pre-process every text cell with synsets(..., function=getHypernyms).
# Note that `pipe_fn` and `X_train` are rebound here, replacing the values set in the lemma cell.
pipe_fn = partial(synsets, function=getHypernyms)
X_train = pre_process(X_train_sample,pipe_fn)
X_test_hypernyms = pre_process(X_test,pipe_fn)

# NOTE(review): synsets() returns a list of tuples of lists, not a string, so the cells fed
# to fit() here are nested Python objects. Confirm the classifier handles such input.
model_hypernyms = model.fit(X_train, y_train)

# Score the fitted model twice: first on the raw dev sentences, then on the pre-processed ones.
f1macro, f1micro = test_performance(model_hypernyms, X_test, y_test)
print(f"f1micro = {f1micro:.3f} and "f"f1macro = {f1macro:.3f}")
f1macro, f1micro = test_performance(model_hypernyms, X_test_hypernyms, y_test)
print(f"f1micro = {f1micro:.3f} and "f"f1macro = {f1macro:.3f}")

To save a model

In [None]:
#save model to disk
# NOTE(review): every fit() in this notebook is called on this same `model` object, so this
# presumably writes the state left by whichever fit ran last. Confirm that is the model
# intended to be saved.
savefile = 'BERT_TaskB.bin'
model.save(savefile)