In [1]:
# Mount Google Drive at /content/drive/ (Colab only); the data and model paths
# used later in the notebook are given relative to this mount.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%%capture
# Install transformers and sentencepiece; %%capture hides pip's output.
# NOTE(review): versions are not pinned, so a later release may behave differently.
!pip install transformers
!pip install sentencepiece

In [3]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn

import transformers
from sklearn.metrics import *
from transformers import AdamW
from tqdm.notebook import tqdm
from scipy.special import softmax
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split as tts
from transformers import BertTokenizerFast, BertConfig, BertForSequenceClassification, AutoModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [4]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Download

In [33]:
# Dataset to evaluate; one of "mohx", "trofi" or "trofix".
data_source = "mohx"

# Project root on Google Drive. Each collaborator keeps the project in a
# different folder, so point `path` at whichever one applies.
giorgios_path = "drive/My Drive/Colab Notebooks/experiments/stockholm"
johns_path = "drive/MyDrive/Resources/stockholm"
path = johns_path  # alternative: giorgios_path

In [34]:
# Read the sentence-level dataset selected by `data_source`.
# Per the original author's notes, the "*_mixed" files are the original metaphor
# datasets plus new data (reconstructed and literal sentences from Wiki and
# Gutenberg); this cell reads the plain "<data_source>.csv" file.
data = pd.read_csv(f"{path}/updates/{data_source}.csv")

# Store the labels as plain Python ints.
data["label"] = data["label"].apply(int)

print("\nThere are", data.shape[0], "sentences")
data.head(2)


There are 647 sentences


Unnamed: 0,arg1,arg2,verb,sentence,verb_idx,label
0,knowledge,,absorb,He absorbed the knowledge or beliefs of his t...,1,1
1,cost,,absorb,He absorbed the costs for the accident .,1,1


#### Split to training, validation and test



In [35]:
# Hold out 10% of `data` as the test set, then take a validation set of the same
# size from the remainder. Both splits use a fixed seed so they are repeatable.
# `val` and `test` therefore contain only rows of `data`.
train, test = tts(
    data[["sentence", "label"]],
    test_size=0.1,
    random_state=42,
)
train, val = tts(train, test_size=len(test), random_state=42)

# The split-derived training set is only used for its size: it is replaced by the
# pre-built "mixed" training file, and the relative growth is reported.
before = len(train)
train = pd.read_csv(f"{path}/updates/{data_source}_train_mixed.csv")
after = len(train)
print(f"From {before} to {after} ({100*(after-before)/before:.2f}%)")

From 517 to 945 (82.79%)


In [36]:
# Preview the first two rows of the training set loaded from the mixed CSV.
train.head(2)

Unnamed: 0,sentence,label
0,I ca n't buy this story .,1
1,European children learn the breast stroke ; th...,0


#### Tokenize and encode with the XLM-RoBERTa tokenizer

In [37]:
# Works for any data source: the saved checkpoint is looked up under
# xlm_code/mixed_models/<data_source>.
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
import torch

output_dir = f'{path}/xlm_code/mixed_models/{data_source}'
print(output_dir)

# Load the XLM-RoBERTa tokenizer and the sequence classifier from the same
# checkpoint directory (the tokenizer variable keeps its historical "bert_" name).
print('Loading XLMRobertaTokenizer...')
bert_tokenizer = XLMRobertaTokenizer.from_pretrained(output_dir)
model_e = XLMRobertaForSequenceClassification.from_pretrained(
    output_dir,
    num_labels=2,
    output_attentions=True,     # model outputs will include attention maps
    output_hidden_states=True,  # model outputs will include hidden states
)

drive/MyDrive/Resources/stockholm/xlm_code/mixed_models/mohx
Loading XLMRobertaTokenizer...


Some weights of the model checkpoint at drive/MyDrive/Resources/stockholm/xlm_code/mixed_models/mohx were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:
# Fixed sequence length (in sub-word tokens) per data source. In this notebook it
# is only used for the single-sentence demonstration below.
# NOTE(review): presumably precomputed as the longest tokenized sentence of each
# original training set — confirm. The mixed training set can be longer than this.
max_lengths = {"mohx":21, "trofi":161, "trofix":161}
MAX_LEN = max_lengths[data_source]

# Encode the first training sentence to inspect what the tokenizer produces:
# special tokens added, truncated/padded to exactly MAX_LEN, returned as tensors.
encoded_instance = bert_tokenizer.encode_plus(
    train.iloc[0].sentence,
    add_special_tokens=True,
    truncation=True,
    max_length=MAX_LEN,
    padding="max_length",  # replaces the deprecated pad_to_max_length=True
    return_attention_mask=True,
    return_tensors="pt",
)

encoded_instance



{'input_ids': tensor([[    0,    87,   377,   653,    25,    18, 22113,   903, 13765,     6,
             5,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [39]:
# Show the raw sentence next to the sub-word tokens (with padding) it was encoded as.
print("Original text:", train.sentence.iloc[0])
print("BERT BPEs:", bert_tokenizer.convert_ids_to_tokens(encoded_instance["input_ids"][0]))

Original text: I ca n't buy this story .
BERT BPEs: ['<s>', '▁I', '▁ca', '▁n', "'", 't', '▁buy', '▁this', '▁story', '▁', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [40]:
# Longest training sentence, measured in tokenizer ids (special tokens included
# by `encode`); used as the truncation length when encoding the splits.
max_len = max(len(bert_tokenizer.encode(sentence)) for sentence in train.sentence)
print("The maximum sentence length in training based on BERT BPEs is", max_len)

The maximum sentence length in training based on BERT BPEs is 161


In [41]:
# Tokenize and encode train, validation and test with identical settings:
# sequences longer than max_len are truncated, and padding=True pads every
# sentence of a split to the longest sequence in that split.
x_train, x_val, x_test = (
    bert_tokenizer.batch_encode_plus(
        split.sentence.tolist(),
        max_length=max_len,
        padding=True,
        truncation=True,
    )
    for split in (train, val, test)
)

In [42]:
# Turn the encoded id/mask lists and the label columns into tensors, one
# (ids, mask, labels) triple per split.
train_seq, train_mask = (torch.tensor(x_train[key]) for key in ("input_ids", "attention_mask"))
train_y = torch.tensor(train["label"].tolist())

val_seq, val_mask = (torch.tensor(x_val[key]) for key in ("input_ids", "attention_mask"))
val_y = torch.tensor(val["label"].tolist())

test_seq, test_mask = (torch.tensor(x_test[key]) for key in ("input_ids", "attention_mask"))
test_y = torch.tensor(test["label"].tolist())

In [43]:
batch_size = 32

# One TensorDataset per split; each item is (input ids, attention mask, label).
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)
test_data = TensorDataset(test_seq, test_mask, test_y)

# Only the training data is shuffled; validation and test keep their order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)
# The test loader yields one sentence per batch; the inference loop reads the
# tokens of item 0 of each batch, so it depends on this batch size.
test_dataloader = DataLoader(test_data, batch_size=1, sampler=test_sampler)

## Inference

#### Move the loaded checkpoint to the device

In [44]:
# Move the loaded model to the selected device (GPU when available, else CPU).
model_e = model_e.to(device)

#### Get predictions for test

In [45]:
# Run the model over the test set, one sentence per batch, and collect per sentence:
#   test_targets     - gold label
#   test_inputs      - sub-word tokens (padding included) fed to the model
#   test_predictions - predicted class (argmax over the class probabilities)
#   test_probs       - probability assigned to class 1, usable as a ranking score
#   test_attentions  - attention tensors returned by the model, stored on the CPU
model_e.eval()
test_predictions = []
test_probs = []
test_targets = []
test_attentions = []
test_inputs = []

with torch.no_grad():  # inference only: no autograd graph is needed anywhere in the loop
    for batch in test_dataloader:
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch
        test_targets.extend(labels.detach().cpu().numpy())
        # test_dataloader uses batch_size=1, so row 0 is the whole batch.
        test_inputs.append(bert_tokenizer.convert_ids_to_tokens(sent_id.detach().cpu().numpy()[0]))

        # Get predictions
        outputs = model_e(sent_id, attention_mask=mask)
        output_probs = softmax(outputs.logits.detach().cpu().numpy(), axis=1)
        test_predictions.extend(np.argmax(output_probs, axis=1))
        test_probs.extend(output_probs[:, 1])
        # Copy the attention tensors to the CPU before storing them; keeping the
        # originals would hold every sentence's attention maps in device memory.
        test_attentions.append(tuple(a.detach().cpu() for a in outputs.attentions))

#### Evaluate

In [46]:
# Report the test-set metrics for the current data source.
# NOTE(review): every metric below, including AUPR and AUC, is computed from the
# hard 0/1 predictions in test_predictions rather than from class probabilities,
# so AUPR/AUC here are single-threshold values, not full-curve summaries.
print(data_source)
_scorers = (
    ("F1:", lambda y_true, y_pred: f1_score(y_true, y_pred, average="binary")),
    ("ACC:", accuracy_score),
    ("AUPR:", average_precision_score),
    ("PRECISION:", precision_score),
    ("RECALL:", recall_score),
    ("AUC:", roc_auc_score),
)
for _label, _scorer in _scorers:
    print(_label, _scorer(test_targets, test_predictions))

mohx
F1: 0.8493150684931507
ACC: 0.8307692307692308
AUPR: 0.7983945483945484
PRECISION: 0.8378378378378378
RECALL: 0.8611111111111112
AUC: 0.8271072796934866


In [None]:
# Test-set metrics recorded per data source, using the metric names printed by
# the evaluation cell. The "mohx" values equal the output shown above.
# NOTE(review): "trofi" and "trofix" were presumably copied from runs with
# data_source changed — confirm. The dict is not assigned to any name.
{
"mohx":{
    "F1": 0.8493150684931507,
    "ACC": 0.8307692307692308,
    "AUPR": 0.7983945483945484,
    "PRECISION": 0.8378378378378378,
    "RECALL": 0.8611111111111112,
    "AUC": 0.8271072796934866,
},
"trofi":{
    "F1": 0.9319727891156463,
    "ACC": 0.946524064171123,
    "AUPR": 0.8980252411960712,
    "PRECISION": 0.9383561643835616,
    "RECALL": 0.9256756756756757,
    "AUC": 0.9429263334130591
},
"trofix":{"F1": 0.9612403100775193,
          "ACC": 0.9655172413793104,
          "AUPR": 0.9447281167108753,
          "PRECISION": 0.96875,
          "RECALL": 0.9538461538461539,
          "AUC": 0.9644230769230769}
}