# Main imports and code

In [None]:
# check which gpu we're using
# !nvidia-smi

In [None]:
!pip install simpletransformers
!pip install tensorboardx
!pip install googletrans==4.0.0-rc1 
!pip install numpy requests nlpaug

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
from collections import Counter
from ast import literal_eval
from transformers import BertPreTrainedModel, BertModel, Trainer, TrainingArguments, BertTokenizer, RobertaModel, RobertaPreTrainedModel, RobertaTokenizer, DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertPreTrainedModel, DistilBertModel
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report
import pandas as pd
import logging
import torch
import torch.nn as nn
import googletrans
import nltk
from nltk.corpus import wordnet, stopwords
import random
import nlpaug.augmenter.word as naw
from nltk.corpus import wordnet, stopwords

# download words database for synonym replacement
nltk.download('stopwords')
nltk.download('wordnet')

# prepare logger
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

if not torch.cuda.is_available():
  print('WARNING: You may want to change the runtime to GPU for faster training!')
  DEVICE = 'cpu'
else:
  DEVICE = 'cuda:0'

# Fetch Don't Patronize Me! data manager module

In [20]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')

with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [21]:
from dont_patronize_me import DontPatronizeMe
dpm = DontPatronizeMe('.', './task4_test.tsv')
# load data
dpm.load_task1()
dpm.load_test()
# dpm.train_task1_df
# dpm.test_set_df

# Load paragraph IDs

In [22]:
# get labels info
train_ids = pd.read_csv('train_semeval_parids-labels.csv')
dev_ids = pd.read_csv('dev_semeval_parids-labels.csv')

# change value type
train_ids.par_id = train_ids.par_id.astype(str)
dev_ids.par_id = dev_ids.par_id.astype(str)

# train_ids.head()

## Data Preprocessing and Augmentation Methods

In [10]:
# Section 3.2 Data Processing
# Section 3.5 Data Augmentation
# Data Processing and Augmentation

# define stop words
set1 = set(stopwords.words('english'))
set2 = ["i", "am", "you", "are", "he", "she", "they", "'s", "'m", "'re", "n't", "did", "do", "done"]
stop_words = set.union(set1, set2)

# Define a class which contains all the function for data processing and augmentation
class Processor:
  def __init__(self):
    self.translator = googletrans.Translator()
  
  # Synonym replacement method 1
  def synonym_aug(self, text):
    l = len(text.split(' '))
    aug = naw.SynonymAug(aug_src='wordnet', stopwords=stop_words, aug_min=int(l * 0.2), aug_max=int(l * 0.3))
    augmented_text = aug.augment(text)
    return augmented_text

  # Synonym replacement method 2
  def synonym_replacement(self, words, rate):
    words = words.split()
    n = int(rate * len(words))
    
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word.lower() not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    
    for random_word in random_word_list:
        synonyms = self.get_synonyms(random_word)
        
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        
        if num_replaced >= n: #only replace up to n words
            break

    sentence = ' '.join(new_words)

    return sentence

  # Get a set of synonyms for a word
  def get_synonyms(self, word):
    synonyms = set()
    
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym) 
    
    if word in synonyms:
        synonyms.remove(word)
    
    return list(synonyms)

  # Data Augmentation - translate English to Espanol, and translate it back to English
  def translation_aug(self, text):
    tmp = self.Eng_to_Esp(text)
    res = self.Esp_to_Eng(tmp)
    return res

  # Data Processing
  # change words to lower case
  def process(self, text):
    return text.lower()

  def Eng_to_Esp(self, query) :
    translated_query = self.translator.translate(query, dest='es').text
    return translated_query

  def Esp_to_Eng(self, query) :
    re_translated_query = self.translator.translate(query, dest='en').text
    return re_translated_query

# Initialise the instance
processor = Processor()

# Rebuild training set & testing set

In [12]:
# Reconstruct train set
rows = []
index = 0
for idx in range(len(train_ids)):  
  # get values
  parid = train_ids.par_id[idx]
  # select row from original dataset to retrieve `text` and binary label
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]

  # Section 3.2 Data Processing
  # process
#   text = processor.process(text)

  rows.append({
    'par_id':str(index), # id
    'text':text, # sentence
    'labels':label # label
  })
  index += 1
train_df = pd.DataFrame(rows)

# Reconstruct dev set
rows = [] # will contain par_id, label and text
index = 0
for idx in range(len(dev_ids)):  
  # get values
  parid = dev_ids.par_id[idx]
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]

  # process
#   text = processor.process(text)
  rows.append({
    'par_id':str(index),
    'text':text,
    'labels':label
  })
  index += 1
dev_df = pd.DataFrame(rows)

# Reconstruct test set
rows = []
index = 0
for idx in range(len(dpm.test_set_df)):
  # get values
  parid = dpm.test_set_df.par_id[idx]
  text = dpm.test_set_df.loc[dpm.test_set_df.par_id == parid].text.values[0]
  # process
#   text = processor.process(text)

  rows.append({
    'par_id':str(index),
    'text':text
  })
  index += 1
test_df = pd.DataFrame(rows)


# Analysis of Data set for Section 2 in report

In [16]:
# Section 2
# Analysis the input length
# count = 0
# all_df = pd.concat([train_df, dev_df, test_df])
# for idx, row in all_df.iterrows():
#     l = len(row['text'].split(' '))
#     if l <= 128: # smaller than the max token length
#         count += 1

14047
14301
0.982239004265436


In [None]:
# Section 2.1 Input length analysis
# keywords are analysised using excel directly, thus, no code for this part

# Calculate the mean, std of the data set
# from statistics import mean, stdev

# train_length_list = []
# for idx, row in train_df.iterrows():
#   text = row['text']
#   train_length_list.append(len(text.split(' ')))

# dev_length_list = []
# for idx, row in dev_df.iterrows():
#   text = row['text']
#   dev_length_list.append(len(text.split(' ')))

# test_length_list = []
# for idx, row in test_df.iterrows():
#   text = row['text']
#   test_length_list.append(len(text.split(' ')))

# print('Train: ', mean(train_length_list), '-', stdev(train_length_list))
# print('Dev: ', mean(dev_length_list), '-', stdev(dev_length_list))
# print('Test: ', mean(test_length_list), '-', stdev(test_length_list))


# Data Augmentation

We tried three different methods for augmentation as Section 3.5 mentoned
But only oversampling is used for final model. We just keep the code and comment it to show we indeed experiments these methods.

In [None]:
# Section 3.5 Data Augmentation

# Data Augmentation - back translation
# Back translation is time consuming, due to the api request limit
# Thus, we save the augmented data

# import time

# pos_train_df = train_df[train_df.label == 1]
# rows = []
# print(len(pos_train_df))

# for index, row in pos_train_df.iterrows():
#   time.sleep(1)
#   if index % 50 == 0:
#     print(index)
#   parid = row['par_id']
#   og_text = row['text']
#   label = row['label']
  
#   tmp = processor.augmentation(og_text)
#   aug_text = processor.process(tmp)
  
#   rows.append({
#     'par_id':parid,
#     'text':aug_text,
#     'label':label
#   })
# aug_train_df = pd.DataFrame(rows)
# aug_train_df.to_csv('aug_pos.csv')


In [24]:
# # load augmented_data
# tmp_df = pd.read_csv('aug_pos.csv')

# # Reconstruct aug train set 1 (back translation)
# rows = []
# index = len(train_df)
# for idx, row in tmp_df.iterrows():
#   text = row['text']
#   label = row['labels']
#   # process
#   text = processor.process(text)
#   rows.append({
#     'par_id':str(index),
#     'text':text,
#     'labels':label
#   })
#   index += 1
# aug_train_df1 = pd.DataFrame(rows)

# # Reconstruct aug train set 2 (synonym replacement)
# train_pos_df = train_df[train_df.labels==1]
# rows = []
# index = len(train_df) + len(aug_train_df1)
# for idx, row in train_pos_df.iterrows():
#   text = row['text']
#   label = row['labels']
#   # process
#   text = processor.process(text)
#   aug_text = processor.synonym_replacement(text, 0.2)
#   rows.append({
#     'par_id':str(index),
#     'text':aug_text,
#     'labels':label
#   })
#   index += 1
# aug_train_df2 = pd.DataFrame(rows)

# get positive instances (original + augmented)
train_pos_df = train_df[train_df.labels==1]
new_train_pos_df = pd.concat([train_pos_df])#, aug_train_df2])#, aug_train_df2])

# oversampling positive instances
rows = []
index = len(new_train_pos_df)
# shuffle the positive train data frame
train_pos_df.sample(frac=1)
for idx, row in train_pos_df[:int(len(train_pos_df) * 1)].iterrows():
  text = row['text']
  label = row['labels']
  rows.append ({
    'par_id':str(index),
    'text':text,
    'labels':label
  })
  index += 1
tmp = pd.DataFrame(rows)
new_train_pos_df = pd.concat([new_train_pos_df, tmp])
n_pos = len(new_train_pos_df)
print('pos:', n_pos)

# get negative instances
train_neg_df = train_df[train_df.labels == 0]
train_neg_df.sample(frac=1)
# train_neg_df = train_neg_df[:int(n_pos * 2)]

# # get partial instances
# n_pos = int(len(train_neg_df) / 2)
# # half of the data will be augmented (synonym replacement)
# train_neg_sub_df1 = train_neg_df[:n_pos]
# train_neg_sub_df2 = train_neg_df[n_pos:]
# rows = []
# for idx, row in train_neg_sub_df1.iterrows():
#   par_id = row['par_id']
#   text = row['text']
#   label = row['labels']
#   new_text = processor.synonym_replacement(text, 0.5)
#   # print(row['labels'])
#   rows.append ({
#       'pair_id': par_id,
#       'text': new_text,
#       'labels': label
#       })

# train_neg_sub_df1 = pd.DataFrame(rows)
# train_neg_df = pd.concat([train_neg_sub_df1, train_neg_sub_df2])
print('neg:', len(train_neg_df))

# build train set data frame
train_df = pd.concat([new_train_pos_df, train_neg_df])
train_df
print('total:', len(train_df))

# set 'par_id' as index for each row
train_df.set_index('par_id', inplace=True)
dev_df.set_index('par_id', inplace=True)

pos: 1588
neg: 7581
total: 9169


# Dataset, Models and Loss function

In [25]:
import numpy
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Define Dataset
class DatasetPCL(torch.utils.data.Dataset):
  # Initialisation
  # input_set: dataset. train set and dev set contain 'text' and 'label'; test set contain 'text' only
  # tokenizer: tokenizer for tokenisation, which should be match with model
  # max_len: max length of token
  def __init__(self, input_set, tokenizer, max_len=256):
    self.tokenizer = tokenizer
    self.texts = input_set['text']
    self.max_len = max_len
    
    if 'labels' in input_set:
      self.labels = input_set['labels']
      self.has_labels = True # train set
    else:
      self.labels = None
      self.has_labels = False # test set

  # Encoding train set (only called during training)
  # batch: data mini-batch
  def collate_fn(self, batch):
    texts = []
    labels = []

    for b in batch:
        texts.append(b['text'])
        labels.append(b['labels'])

    #The maximum sequence size for BERT is 512 but here the tokenizer truncate sentences longer than 128 tokens.  
    # We also pad shorter sentences to a length of 128 tokens
    encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=self.max_len)
#     encodings = self.tokenizer(texts, padding=True, truncation=True, max_length=self.max_len)
    encodings['labels'] =  torch.tensor(labels)
    
    return encodings
  
  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    if self.has_labels:
      item = {'text': self.texts[idx],
              'labels': self.labels[idx]
      }
    else:
      item = {'text': self.texts[idx]}
    return item

# DistilBert Model inherient from Pre-trained Model
class DistilBertPCL(DistilBertPreTrainedModel):

  # Initialisation
  # config: pre-trained model config (model name)
  # num_labels: number of classes
  def __init__(self, config, dropout_rate=0.2):
    super(DistilBertPCL, self).__init__(config)
    self.num_labels = 2

    self.distilbert = DistilBertModel(config)
#     self.pre_classifier = nn.Linear(config.dim, config.dim)
    self.dropout = nn.Dropout(dropout_rate)
    self.classifier = nn.Linear(config.dim, 2)

    self.init_weights()

  def forward(self, input_ids=None, attention_mask=None, head_mask=None, labels=None):
    distilbert_output = self.distilbert(input_ids=input_ids,
                             attention_mask=attention_mask,
                             head_mask=head_mask)
    hidden_state = distilbert_output[0]                    # (bs, seq_len, dim)
    pooled_output = hidden_state[:, 0]                    # (bs, dim)
#     pooled_output = self.pre_classifier(pooled_output)   # (bs, dim)
#     pooled_output = nn.ReLU()(pooled_output)             # (bs, dim)
    pooled_output = self.dropout(pooled_output)         # (bs, dim)
    logits = self.classifier(pooled_output)              # (bs, dim)

    return logits

# Section 3.1 Implementation
# Roberta Model inherient from Pre-trained Model
class RobertaPCL(RobertaPreTrainedModel):

  # Initialisation
  # config: pre-trained model config (model name)
  # dropout_rate: dropout rate
  def __init__(self, config, dropout_rate=0.2):
    super().__init__(config)

    # load BERT Model
    self.roberta = RobertaModel(config)
    
    # add dropout layer and linear layer for final classification
    self.projection = torch.nn.Sequential(
      torch.nn.Dropout(dropout_rate),
      torch.nn.Linear(config.hidden_size, 2)
    )
    
    self.init_weights()

  def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None):

    outputs = self.roberta(
      input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids,
      position_ids=position_ids,
      head_mask=head_mask,
      inputs_embeds=inputs_embeds,
      output_attentions=output_attentions,
      output_hidden_states=output_hidden_states,
      return_dict=return_dict,
    )

    logits = self.projection(outputs[1])        
    return logits

# Section 3.6 Weighted Loss Function
# Loss function 
class TrainerPCL(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):
    # get labels
    labels = inputs.get('labels')
    outputs = model(**inputs)

    # cross entropy loss
    weights = [0.5, 1]
    class_weights = torch.FloatTensor(weights).cuda()
    loss_cel = nn.CrossEntropyLoss(weight=class_weights)
#     loss_cel = nn.CrossEntropyLoss()
    loss = loss_cel(outputs.view(-1, 2), labels.view(-1))

    return (loss, outputs) if return_outputs else loss

In [26]:
from transformers import EarlyStoppingCallback
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
global_batch_size = 32

# Function to compute metrics for evaluation
def compute_metrics(p):
    # p is calculated insider trainer, which will output one less prediction for
    # each batch. Thus, ignore 1 label in each batch 
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    tmp = []
    count = 0
    for label in labels:
        if count == 0:
            count += 1
            continue
        else:
            tmp.append(label)
            count += 1
        if count == global_batch_size:
            count = 0
                
    accuracy = accuracy_score(y_true=tmp, y_pred=pred)
    recall = recall_score(y_true=tmp, y_pred=pred)
    precision = precision_score(y_true=tmp, y_pred=pred)
    f1 = f1_score(y_true=tmp, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


# Main program
class Program:
  def __init__(self, model, tokenizer):
    self.model = model
    self.tokenizer = tokenizer
    self.set_params
    
  # Set hyperparameters
  def set_params(self, lr=0.0001, batch_size=32, num_epochs=10, max_len=256, early_stop=20, warmup_steps=4000, eval_steps=100, 
                 log_steps=100):
    self.lr = lr # learning rate
    self.batch_size = batch_size # batch size
    self.num_epochs = num_epochs # number of epoches
    self.max_len = max_len # max length of token
    self.log_steps = log_steps # log printing interval
    # Section 3.4 Early-Stopping
    self.eval_steps = eval_steps # evaluation interval
    self.warmup_steps = warmup_steps # starting steps for early stopping
    self.early_stopping = early_stop # early stopping patient
        
  # Train
  # dataset: train set
  def train(self, train_dataset, eval_dataset):

    # learning parameters
    training_args = TrainingArguments(
#         f"training_with_callbacks",
        output_dir='./experiment/pcl',
        learning_rate=self.lr,
        per_device_train_batch_size=self.batch_size,
        per_device_eval_batch_size=self.batch_size,
        num_train_epochs=self.num_epochs,
        logging_steps=self.log_steps,
        weight_decay=0.01,
        # Section 3.4 Early-Stopping
        evaluation_strategy='steps', # evaluated on "steps" instead of "epochs"
        eval_steps=self.eval_steps,
        warmup_steps=self.warmup_steps,
#         save_total_limit = 5, # last 5 models are saved.
        metric_for_best_model='f1', # evaluation criteria
        load_best_model_at_end=True
    )

    # load trainier (loss function)
    trainer = TrainerPCL(
        model=self.model,                         
        args=training_args,                 
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=train_dataset.collate_fn,
        compute_metrics=compute_metrics,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=self.early_stopping)]   
    )

    # training
    trainer.train()
    trainer.save_model('./models/ht_bert_finetuned/')

  # Prediction
  # text: single paragraph
  def predict(self, text):
    encodings = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=self.max_len)
    encodings.to(DEVICE)
    
    output = self.model(**encodings)
    preds = torch.max(output, 1)

    return {'prediction': preds[1], 'confidence': preds[0]}

  # Test
  # dataset: test set
  def test(self, dataset):
    self.model.eval()
    data_loader = DataLoader(dataset)
    predictions = []
    
    with torch.no_grad():
      for data in tqdm(data_loader): 
        text = data['text']
        pred = self.predict(text)
        predictions.append(pred['prediction'].tolist())
        
    return predictions
    
  # Evaluation
  # dataset: dev set
  def eval(self, dataset):
    self.model.eval()
    data_loader = DataLoader(dataset)
    predictions = []
    labels = []

    with torch.no_grad():
      for data in tqdm(data_loader): 
        text = data['text']
        label = data['labels']
        pred = self.predict(text)
        predictions.append(pred['prediction'].tolist())
        labels.append(label.tolist())

      # with the saved predictions and labels we can compute accuracy, precision, recall and f1-score
      report = classification_report(labels, predictions, target_names=["negative", "positive"],
                                            output_dict=True)

      return predictions, labels, report
  
  # Save predictions as file
  # p: predictions list
  # outf_path: file path
  def labels2file(self, p, outf_path):
    with open(outf_path,'w') as outf:
      for pi in p:
        outf.write(','.join([str(k) for k in pi])+'\n')




In [30]:
### Roberta
# # load token
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# # load pre-trained model
model = RobertaPCL.from_pretrained('roberta-base', 0.2)
# # load training set & testing set

# tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")

# model = RobertaPCL.from_pretrained("siebert/sentiment-roberta-large-english")

### DistilBert
# load token
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# load pre-trained model
# model = DistilBertPCL.from_pretrained("distilbert-base-uncased", 0.5)

# Section 3.3 Hyperparameter
max_len = 128
global_batch_size = 32

# define data set
train_dataset = DatasetPCL(train_df, tokenizer, max_len)
dev_dataset = DatasetPCL(dev_df, tokenizer, max_len)
test_dataset = DatasetPCL(test_df, tokenizer, max_len)

# training
program = Program(model, tokenizer)

# We use a large early stopping patient value (15). Some times it could finish all the epochs.
# But the final mode returned is the best one (highest F1-score) during evaluation. Thus, it can be still 
# considered as the training step is already stopped at that steps.
# This RUN TIME stopped at checkpoint-3500 (score: 0.5739514348785872).
program.set_params(lr=1e-5, batch_size=global_batch_size, num_epochs=20, max_len=max_len, early_stop=15, warmup_steps=8000)
program.train(train_dataset, dev_dataset)

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,0.7383,0.759093,0.094675,0.094675,1.0,0.172973
200,0.7276,0.733979,0.094675,0.094675,1.0,0.172973
300,0.7021,0.69382,0.439349,0.092321,0.557292,0.158401
400,0.6756,0.619743,0.905325,0.0,0.0,0.0
500,0.6232,0.479187,0.905325,0.0,0.0,0.0
600,0.5996,0.392269,0.905325,0.0,0.0,0.0
700,0.5588,0.340883,0.905325,0.0,0.0,0.0
800,0.4864,0.293755,0.886095,0.405797,0.4375,0.421053
900,0.4293,0.291698,0.864398,0.37386,0.640625,0.472169
1000,0.3814,0.264948,0.875247,0.39661,0.609375,0.480493


***** Running Evaluation *****
  Num examples = 2094
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./experiment/pcl/checkpoint-500
Configuration saved in ./experiment/pcl/checkpoint-500/config.json
Model weights saved in ./experiment/pcl/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 32
***** Running

In [31]:
predictions = program.test(test_dataset)
program.labels2file(predictions, 'task1.txt')

  0%|          | 0/3832 [00:00<?, ?it/s]

In [32]:
# Section 3.7 Result
_, _, report = program.eval(train_dataset)
report

  0%|          | 0/9169 [00:00<?, ?it/s]

{'negative': {'precision': 0.9989232839838492,
  'recall': 0.9790265136525524,
  'f1-score': 0.9888748251282393,
  'support': 7581},
 'positive': {'precision': 0.9085681426106959,
  'recall': 0.9949622166246851,
  'f1-score': 0.94980462879471,
  'support': 1588},
 'accuracy': 0.9817864543570728,
 'macro avg': {'precision': 0.9537457132972725,
  'recall': 0.9869943651386188,
  'f1-score': 0.9693397269614746,
  'support': 9169},
 'weighted avg': {'precision': 0.9832744711906801,
  'recall': 0.9817864543570728,
  'f1-score': 0.982108168810468,
  'support': 9169}}

In [33]:
dev_dataset = DatasetPCL(dev_df, tokenizer, 128)
predictions, labels, report = program.eval(dev_dataset)
report

  0%|          | 0/2094 [00:00<?, ?it/s]

{'negative': {'precision': 0.964873765093304,
  'recall': 0.9277044854881267,
  'f1-score': 0.9459241323648103,
  'support': 1895},
 'positive': {'precision': 0.4963235294117647,
  'recall': 0.678391959798995,
  'f1-score': 0.5732484076433121,
  'support': 199},
 'accuracy': 0.9040114613180515,
 'macro avg': {'precision': 0.7305986472525343,
  'recall': 0.8030482226435609,
  'f1-score': 0.7595862700040612,
  'support': 2094},
 'weighted avg': {'precision': 0.9203458296106745,
  'recall': 0.9040114613180515,
  'f1-score': 0.9105074803974855,
  'support': 2094}}

# Analysis of Input Features

In [None]:
# Section 4 Analysis
from itertools import chain

# Reconstruct dev set for section 4
rows = [] # will contain par_id, label and text
for idx in range(len(dev_ids)):  
  # get values
  parid = dev_ids.par_id[idx]
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
  orig_label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].orig_label.values[0]
  keyword = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].keyword.values[0]
  country = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].country.values[0]
  # process
#   text = processor.process(text)
  rows.append({
    'par_id':parid,
    'keyword':keyword,
    'country':country,
    'text':text,
    'labels':label,
    'orig_labels':orig_label
  })
  index += 1
dev_df = pd.DataFrame(rows)

predictions = list(chain.from_iterable(predictions))
dev_df['predictions'] = predictions

In [None]:
# Section 4.1 Level of Patronising
c_names = ['T', 'F']
r_names = ['0', '1', '2', '3', '4']
res = pd.DataFrame(columns = c_names, index=r_names)
for c in c_names:
    for r in r_names:
        res.loc[r, c] = 0


for idx, row in dev_df.iterrows():
    orig_label = row['orig_labels']
    pred = row['predictions']
    label = row['labels']
    
    if label == pred:
        res.loc[orig_label, 'T'] += 1
    else:
        res.loc[orig_label, 'F'] += 1

for idx, row in res.iterrows():
#     print(row)
    t = row['T']
    f = row['F']
    a = t + f
    res.loc[idx, 'Accuracy'] = t / a
res

# forgot to delete print info :(

# row names: original label
# column names: T - correct predicton, F - wrong prediction, Accuracy - Accuracy

T    1637
F      67
Name: 0, dtype: object
T    140
F     51
Name: 1, dtype: object
T     3
F    15
Name: 2, dtype: object
T    50
F    39
Name: 3, dtype: object
T    70
F    22
Name: 4, dtype: object


Unnamed: 0,T,F,Accuracy
0,1637,67,0.960681
1,140,51,0.732984
2,3,15,0.166667
3,50,39,0.561798
4,70,22,0.76087


In [None]:
# Section 4.2 Input Length
# Negative Examples
interval = 20
c_names = ['T', 'F']
r_names = [str(n) for n in range(272 // interval + 1)]
res = pd.DataFrame(columns = c_names, index = r_names)
for c in c_names:
    for r in r_names:
        res.loc[r, c] = 0
# print(res)

sub_dev_df = dev_df[dev_df.labels == 0]
for idx, row in sub_dev_df.iterrows():
    l = len(row['text'].split(' '))
    n = l // interval
    orig_label = row['orig_labels']
    pred = row['predictions']
    label = row['labels']
    
    if label == pred:
        res.loc[str(n), 'T'] += 1
    else:
        res.loc[str(n), 'F'] += 1

for idx, row in res.iterrows():
    t = row['T']
    f = row['F']
    a = t + f
    res.loc[idx, 'Accuracy'] = t / a if a > 0 else 'NaN'
res
# row names: input length interval. e.g. 0 means [1, 20], 1 means [21, 40], 2 means [41, 60] and so on
# column names: T - correct predicton, F - wrong prediction, Accuracy - Accuracy

Unnamed: 0,T,F,Accuracy
0,146,9,0.941935
1,700,39,0.947226
2,536,30,0.946996
3,222,20,0.917355
4,86,13,0.868687
5,52,4,0.928571
6,19,0,1.0
7,12,2,0.857143
8,2,1,0.666667
9,1,0,1.0


In [None]:
# Section 4.2 Input Length
# Positive Examples
c_names = ['T', 'F']
r_names = [str(n) for n in range(272 // interval + 1)]
res = pd.DataFrame(columns = c_names, index = r_names)
for c in c_names:
    for r in r_names:
        res.loc[r, c] = 0
# print(res)

sub_dev_df = dev_df[dev_df.labels == 1]
for idx, row in sub_dev_df.iterrows():
    l = len(row['text'].split(' '))
    n = l // interval
    orig_label = row['orig_labels']
    pred = row['predictions']
    label = row['labels']
    
    if label == pred:
        res.loc[str(n), 'T'] += 1
    else:
        res.loc[str(n), 'F'] += 1

for idx, row in res.iterrows():
    t = row['T']
    f = row['F']
    a = t + f
    res.loc[idx, 'Accuracy'] = t / a if a > 0 else 'NaN'
res
# row names: input length interval. e.g. 0 means [1, 20], 1 means [21, 40], 2 means [41, 60] and so on
# column names: T - correct predicton, F - wrong prediction, Accuracy - Accuracy

Unnamed: 0,T,F,Accuracy
0,6,5,0.545455
1,39,27,0.590909
2,36,18,0.666667
3,20,16,0.555556
4,10,8,0.555556
5,6,0,1.0
6,3,2,0.6
7,1,0,1.0
8,2,0,1.0
9,0,0,


In [None]:
# Section 4.3 Keyword
# Negative Examples
dev_df
c_names = ['T', 'F']
r_names = list(set(dev_df['keyword']))
res = pd.DataFrame(columns = c_names, index=r_names)
for c in c_names:
    for r in r_names:
        res.loc[r, c] = 0

sub_dev_df = dev_df[dev_df.labels == 0]        
for idx, row in sub_dev_df.iterrows():
    orig_label = row['orig_labels']
    pred = row['predictions']
    label = row['labels']
    keyword = row['keyword']
    
    if label == pred:
        res.loc[keyword, 'T'] += 1
    else:
        res.loc[keyword, 'F'] += 1

for idx, row in res.iterrows():
    t = row['T']
    f = row['F']
    a = t + f
    res.loc[idx, 'All'] = int(a)
    res.loc[idx, 'Accuracy'] = t / a
res

# row names: keywords
# column names: T - correct predicton, F - wrong prediction, All - all numbers in this category, Accuracy - Accuracy

Unnamed: 0,T,F,All,Accuracy
immigrant,209,2,211.0,0.990521
women,207,12,219.0,0.945205
migrant,200,2,202.0,0.990099
poor-families,133,19,152.0,0.875
vulnerable,178,11,189.0,0.941799
in-need,174,19,193.0,0.901554
hopeless,174,17,191.0,0.910995
homeless,167,16,183.0,0.912568
disabled,169,11,180.0,0.938889
refugee,166,9,175.0,0.948571


In [None]:
# Section 4.3 Keyword
# Positive Examples
dev_df
c_names = ['T', 'F']
r_names = list(set(dev_df['keyword']))
res = pd.DataFrame(columns = c_names, index=r_names)
for c in c_names:
    for r in r_names:
        res.loc[r, c] = 0

sub_dev_df = dev_df[dev_df.labels == 1]        
for idx, row in sub_dev_df.iterrows():
    orig_label = row['orig_labels']
    pred = row['predictions']
    label = row['labels']
    keyword = row['keyword']
    
    if label == pred:
        res.loc[keyword, 'T'] += 1
    else:
        res.loc[keyword, 'F'] += 1

for idx, row in res.iterrows():
    t = row['T']
    f = row['F']
    a = t + f
    res.loc[idx, 'All'] = int(a)
    res.loc[idx, 'Accuracy'] = t / a
res
# row names: keywords
# column names: T - correct predicton, F - wrong prediction, All - all numbers in this category, Accuracy - Accuracy

Unnamed: 0,T,F,All,Accuracy
immigrant,1,6,7.0,0.142857
women,8,6,14.0,0.571429
migrant,3,2,5.0,0.6
poor-families,24,14,38.0,0.631579
vulnerable,11,9,20.0,0.55
in-need,29,4,33.0,0.878788
hopeless,16,10,26.0,0.615385
homeless,17,12,29.0,0.586207
disabled,7,7,14.0,0.5
refugee,7,6,13.0,0.538462


In [None]:
# Section 4.3 Country
# Negative Examples
dev_df
c_names = ['T', 'F']
r_names = list(set(dev_df['country']))
res = pd.DataFrame(columns = c_names, index=r_names)
for c in c_names:
    for r in r_names:
        res.loc[r, c] = 0

sub_dev_df = dev_df[dev_df.labels == 0]        
for idx, row in sub_dev_df.iterrows():
    orig_label = row['orig_labels']
    pred = row['predictions']
    label = row['labels']
    country = row['country']
    
    if label == pred:
        res.loc[country, 'T'] += 1
    else:
        res.loc[country, 'F'] += 1

for idx, row in res.iterrows():
    t = row['T']
    f = row['F']
    a = t + f
    res.loc[idx, 'All'] = int(a)
    res.loc[idx, 'Accuracy'] = t / a
res
# row names: countries
# column names: T - correct predicton, F - wrong prediction, All - all numbers in this category, Accuracy - Accuracy

Unnamed: 0,T,F,All,Accuracy
ke,94,7,101.0,0.930693
lk,70,6,76.0,0.921053
sg,89,2,91.0,0.978022
tz,80,3,83.0,0.963855
ph,82,8,90.0,0.911111
pk,87,8,95.0,0.915789
in,89,8,97.0,0.917526
us,95,9,104.0,0.913462
ng,85,8,93.0,0.913978
my,103,5,108.0,0.953704


In [None]:
# Section 4.3 Country
# Positive Examples
dev_df
c_names = ['T', 'F']
r_names = list(set(dev_df['country']))
res = pd.DataFrame(columns = c_names, index=r_names)
for c in c_names:
    for r in r_names:
        res.loc[r, c] = 0

sub_dev_df = dev_df[dev_df.labels == 1]        
for idx, row in sub_dev_df.iterrows():
    orig_label = row['orig_labels']
    pred = row['predictions']
    label = row['labels']
    country = row['country']
    
    if label == pred:
        res.loc[country, 'T'] += 1
    else:
        res.loc[country, 'F'] += 1

for idx, row in res.iterrows():
    t = row['T']
    f = row['F']
    a = t + f
    res.loc[idx, 'All'] = int(a)
    res.loc[idx, 'Accuracy'] = t / a
res

# row names: countries
# column names: T - correct predicton, F - wrong prediction, All - all numbers in this category, Accuracy - Accuracy

Unnamed: 0,T,F,All,Accuracy
ke,10,3,13.0,0.769231
lk,6,3,9.0,0.666667
sg,3,1,4.0,0.75
tz,6,5,11.0,0.545455
ph,8,7,15.0,0.533333
pk,8,6,14.0,0.571429
in,4,3,7.0,0.571429
us,6,4,10.0,0.6
ng,10,5,15.0,0.666667
my,6,2,8.0,0.75


In [None]:
# import gc
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
# # from transformers import RobertaTokenizer, RobertaModel
# # tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# text1 = "\" It 's been a bit of a chaotic week , a very stressful situation . "
# text2 = "\"It's been a bit of a chaotic week, a very stressful situation."
# a = tokenizer.tokenize(text1)
# b = tokenizer.tokenize(text2)
# print(a)
# print(b)