In [2]:
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader


from transformers import AutoTokenizer, AutoModel

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [3]:
import pandas as pd

df = pd.read_csv('/content/CommitmentBank-items.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,uID,Verb,Embedding,Context,Target,Prompt,ModalType,MatTense,MatSubjLemma,MatSubjPer,MatSubjNum,Embedding.1,genre,factive,mean.noTarget,sd.noTarget,Mean,SD,Reponses
0,1,BNC-1,admit,conditional,Polly had to think quickly.,They were still close enough to shore for him ...,Polly was not an experienced ocean sailor,,future,she,third,singular,conditional,BNC,no,-0.111111,1.269296,2.0,0.866025,"2, 3, 3, 1, 3, 1, 2, 1, 2"
1,2,BNC-1002,say,modal,"Nevertheless, life went on as it always does. ...",Indeed it could be said that they had prospered.,the Kiwi Keith and the Mackenzie houses had pr...,CI,present,it,third,singular,modal,BNC,no,1.2,1.873796,1.875,0.834523,"2, 1, 3, 1, 2, 2, 3, 1"
2,3,BNC-1003,say,modal,"He patted her hand. If he had chosen to, Thoma...",He might have said to her that some time in th...,some time in the middle of the nineteenth cent...,AB,present,he,third,singular,modal,BNC,no,-0.25,0.46291,0.666667,1.154701,"2, 0, 1, 0, 0, 3, -1, 0, 1, 0, 0, 2"
3,4,BNC-1005,say,modal,She could see his distorted image in the slant...,Of course she could say it was for the childre...,it was for the children,AB,future,she,third,singular,modal,BNC,no,0.636364,1.120065,0.875,0.991031,"2, 1, 0, 1, -1, 1, 1, 2"
4,5,BNC-1006,say,modal,"She glanced around the room, laying her hot fl...",Robyn swallowed and took a deep breath trying ...,it was all right,AB,future,she,third,singular,modal,BNC,no,-1.090909,1.044466,0.0,2.309401,"3, 1, -2, -3, -3, 3, 2, 1, -1, -1"


In [5]:
df.loc[:, 'nli'] = 'neutral'
df.loc[df.Mean <= -1, 'nli'] = 'contradiction'
df.loc[df.Mean >= 1, 'nli'] = 'entailment'

In [6]:
df['Content'] = '[CLS] ' + df['Target'] + ' [SEP] ' + df['Prompt'] + ' [SEP]'

In [7]:
model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
device_cuda = torch.device("cuda")
model = model.to(device_cuda)

In [9]:
def vectorize_dataset_and_return(data):
    res = []
    for batch in tqdm(data):
        toks = tokenizer(batch[0], padding='max_length', truncation=True, return_tensors='pt',
          max_length=64)
        with torch.no_grad():
            model_output = model(**{k: v.to(model.device) for k, v in toks.items()})
        res.append(torch.mean(model_output.last_hidden_state, dim=1))
    res = torch.vstack(res)
    return res.detach().cpu().numpy()

In [10]:
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __getitem__(self, index):
        row = self.dataframe.iloc[index].to_numpy()
        features = row[1]
        label = row[0]
        return features, label

    def __len__(self):
        return len(self.dataframe)

In [11]:
data = CustomDataset(dataframe=df[['nli', 'Content']])

In [12]:
dl = DataLoader(data, batch_size=256, shuffle=False,
                drop_last=False)
x = vectorize_dataset_and_return(dl)

  0%|          | 0/5 [00:00<?, ?it/s]

In [14]:
le = LabelEncoder()
y = le.fit_transform(df['nli'])

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [16]:
clf = MLPClassifier(random_state=1, max_iter=300, verbose=1)
clf.fit(x_train, y_train)

Iteration 1, loss = 1.06894858
Iteration 2, loss = 0.93872063
Iteration 3, loss = 0.89908333
Iteration 4, loss = 0.88017192
Iteration 5, loss = 0.89069772
Iteration 6, loss = 0.82934504
Iteration 7, loss = 0.83720051
Iteration 8, loss = 0.79555826
Iteration 9, loss = 0.83834780
Iteration 10, loss = 0.76441054
Iteration 11, loss = 0.80113688
Iteration 12, loss = 0.77691009
Iteration 13, loss = 0.75188756
Iteration 14, loss = 0.72574576
Iteration 15, loss = 0.72135279
Iteration 16, loss = 0.70353423
Iteration 17, loss = 0.70872641
Iteration 18, loss = 0.69312706
Iteration 19, loss = 0.70438737
Iteration 20, loss = 0.66631260
Iteration 21, loss = 0.68399344
Iteration 22, loss = 0.65926952
Iteration 23, loss = 0.65028179
Iteration 24, loss = 0.63730560
Iteration 25, loss = 0.64620978
Iteration 26, loss = 0.63201051
Iteration 27, loss = 0.61276159
Iteration 28, loss = 0.61516901
Iteration 29, loss = 0.60885496
Iteration 30, loss = 0.59651635
Iteration 31, loss = 0.58317207
Iteration 32, los

In [17]:
train_pred = clf.predict(x_train)
test_pred = clf.predict(x_test)

In [18]:
crep = classification_report(y_train, train_pred, target_names=le.classes_)
print(crep)

               precision    recall  f1-score   support

contradiction       1.00      1.00      1.00       260
   entailment       1.00      1.00      1.00       272
      neutral       1.00      1.00      1.00       272

     accuracy                           1.00       804
    macro avg       1.00      1.00      1.00       804
 weighted avg       1.00      1.00      1.00       804



In [19]:
crep = classification_report(y_test, test_pred, target_names=le.classes_)
print(crep)

               precision    recall  f1-score   support

contradiction       0.69      0.77      0.73       126
   entailment       0.58      0.68      0.63       118
      neutral       0.64      0.49      0.56       152

     accuracy                           0.64       396
    macro avg       0.64      0.65      0.64       396
 weighted avg       0.64      0.64      0.63       396

