In [None]:
# Install the packages this notebook imports (PyTorch, Hugging Face
# transformers, sentencepiece, ipywidgets) and enable the notebook widgets
# extension.
# NOTE(review): none of the versions are pinned, so a re-run may pull newer
# releases than the ones the recorded outputs were produced with.
! pip install torch
!pip install transformers
!pip install sentencepiece
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 30.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [None]:
import torch
import transformers
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt 

from transformers import Trainer, TrainingArguments
from transformers import DebertaTokenizer, DebertaForSequenceClassification

import pandas as pd
import numpy as np
import os
from urllib import request

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.utils import class_weight

torch.cuda.empty_cache()

In [None]:
# Pick the compute device once; later cells place tensors on DEVICE.
cuda_available = torch.cuda.is_available()
DEVICE = 'cuda:0' if cuda_available else 'cpu'
if not cuda_available:
    print('WARNING: You may want to change the runtime to GPU for faster training!')

In [None]:
# Mount Google Drive and make the coursework folder the working directory

from google.colab import drive
from pathlib import Path
# Load Colab's data_table notebook extension
%load_ext google.colab.data_table
content_path = '/content/drive/MyDrive/NLP_cw'
data_path = './data/'  # NOTE(review): not referenced by any later cell in this notebook
drive.mount('/content/drive/')
content_path = Path(content_path)

# The relative file names used by later cells (CSV files, './' model
# directory) resolve against this folder.
%cd '/content/drive/MyDrive/NLP_cw/'

Mounted at /content/drive/
/content/drive/.shortcut-targets-by-id/1lHnrZzxZhhAys320px7phwzEyWp5SX75/NLP_cw


In [None]:
# wandb hyperparameter optimisation toggle: set to True to run a wandb sweep,
# leave False for a single training run with the default hyperparameters
wandb_tog = False
if wandb_tog:
    # The wandb client is only needed when sweeping
    !pip install wandb --upgrade
else:
    # Switch wandb reporting off through the WANDB_DISABLED environment variable
    os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Load the training split (change the file name/path when testing)
train_df = pd.read_csv(
    "dontpatronizeme_pcl_train_augmented_clean_preprocess_lemma_reorg.csv",
    encoding="utf8",
    index_col=0,
)
train_labels = train_df.label.to_numpy()

# "Balanced" per-class weights, used to re-weight the loss and counter the
# class imbalance. They are placed on DEVICE so the loss can use them directly.
class_weights = torch.tensor(
    class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels,
    ),
    dtype=torch.float,
).to(DEVICE)
print(class_weights)

tensor([0.5524, 5.2690], device='cuda:0')


In [None]:
# Extracting the original dataset from our larger, back-translated dataset
# train_df_nobet = train_df[train_df["lang"] == "en"]
# train_df_nobet

In [None]:
def reader_train(data):
    '''Prepare input data as parallel lists of sentences and labels.

    Args:
        data (pandas dataframe): input data to process; it must contain the
            columns ``text`` and ``label``.
    Returns:
        dictionary with the keys ``texts`` and ``labels``. Both values are
        lists in row order, so ``labels[i]`` is the label of ``texts[i]``.
    '''
    # Reading the two columns directly yields the same lists as walking the
    # frame with iterrows(), without building a Series object for every row.
    return {'texts': data['text'].tolist(),
            'labels': data['label'].tolist()}

In [None]:
# Control iteration and batches
class PatronizeDataset(torch.utils.data.Dataset):
    ''' Dataset of raw sentences and labels for a hugging face model.

    Items are stored and returned as plain text/label pairs; tokenisation is
    done one batch at a time in ``collate_fn``.
    '''

    def __init__(self, tokenizer, input_set):
        '''
        Args:
          tokenizer: callable tokenizer (the notebook passes a DeBERTa
              tokenizer)
          input_set (dict): mapping with the keys 'texts' and 'labels', as
              returned by ``reader_train``
        '''
        super().__init__()

        self.tokenizer = tokenizer
        self.texts, self.labels = input_set['texts'], input_set['labels']

    def __len__(self):
        # One item per stored sentence
        return len(self.texts)

    def __getitem__(self, idx):
        return {'texts': self.texts[idx], 'labels': self.labels[idx]}

    def collate_fn(self, batch):
        ''' Tokenise a batch of items and attach the labels.

        Args:
            batch (list): items as produced by ``__getitem__``, i.e. dicts
                with the keys 'texts' and 'labels'

        Returns:
            encodings: the tokenizer output for the batch, with an extra
            'labels' entry holding the labels as a tensor
        '''
        texts = [sample['texts'] for sample in batch]
        labels = [sample['labels'] for sample in batch]

        # Sentences longer than 128 tokens are truncated (the model itself
        # allows up to 512). padding=True pads the shorter sentences of the
        # batch so that a rectangular tensor can be returned.
        encodings = self.tokenizer(
            texts,
            max_length=128,
            truncation=True,
            padding=True,
            return_tensors='pt',
        )
        encodings['labels'] = torch.tensor(labels)
        return encodings


In [None]:
# Initialise the DeBERTa tokenizer (same checkpoint name as the model trained below)
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/474 [00:00<?, ?B/s]

In [None]:
# Prepare the data for training: text/label lists wrapped in the dataset class
train_set = reader_train(train_df)
train_dataset = PatronizeDataset(tokenizer, train_set)

# Preparing the original dataset for training from the larger back-translated dataset
# train_set_nobet = reader_train(train_df_nobet)
# train_dataset_nobet = PatronizeDataset(tokenizer, train_set_nobet)

In [None]:
# Prepare the val/test set to evaluate the model on
val_df = pd.read_csv("dontpatronizeme_pcl_test.csv", 
                     index_col=0, 
                     encoding="utf8") ## Change name and path for testing

# We added a fake column to the official test set so we do not have to change 
# the implementations above when only calculating predictions.
# labels_list = [0]*3828         # Ignore if using a val/ test set with gold labels
# val_df.insert(loc = 1,column='label', value=0)           # Ignore if using a val/ test set with gold labels

# Rows with missing values are dropped before the text/label lists are built
val_df = val_df.dropna()
val_set = reader_train(val_df)
val_dataset = PatronizeDataset(tokenizer, val_set)
# No batch_size or collate_fn is given, so the DataLoader defaults apply:
# one raw (untokenised) sample per batch.
val_loader = DataLoader(val_dataset)

In [None]:
def predict_patronize(input, tokenizer, model): 
    ''' Function to predict labels using trained model.

    Args:
        input (str or list of str): sentence(s) for which we need to predict
            a label
        tokenizer : DeBERTa tokenizer that we use
        model (HuggingFace Model): trained model used for predictions

    Returns:
        dict with the keys prediction and confidence. ``prediction`` holds the
        index of the highest-scoring class for every input sentence and
        ``confidence`` the corresponding raw logit (not a probability).
    '''

    # Switch off dropout etc. for inference
    model.eval()
    encodings = tokenizer(input, 
                          return_tensors='pt', 
                          padding=True, 
                          truncation=True, 
                          max_length=128)

    # The encodings are not created on the model's device; move every tensor
    # there, otherwise a model that lives on the GPU fails in the forward pass.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        target_device = first_param.device
        encodings = {
            name: value.to(target_device) if torch.is_tensor(value) else value
            for name, value in encodings.items()
        }

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        output = model(**encodings)
    logits = output.get('logits')

    # torch.max over the class dimension gives (largest logit, its index)
    confidence, prediction = torch.max(logits, 1)

    return {'prediction': prediction, 'confidence': confidence}

In [None]:
def evaluate(model, tokenizer, data_loader):
    ''' Evaluate a trained model against the gold labels of a data loader.

    Every batch of the loader is passed through ``predict_patronize`` and the
    predicted labels are compared with the gold labels in a classification
    report.

    Args:
        model (HuggingFace model): trained model used for predictions
        tokenizer : DeBERTa tokenizer
        data_loader (DataLoader object): yields batches with the keys
            'texts' (raw sentences) and 'labels' (tensor of gold labels)

    Returns:
        report (dict): classification report with the key statistics,
        including F1 for the positive class
        preds (list): predicted labels, one list entry per batch
        tot_labels (list): true labels, one list entry per batch
        datas (list): the sentences of each batch, in the same order

    NOTE(review): predictions and labels are collected per batch (nested
    lists), which looks like it assumes one sample per batch -- confirm
    before using a loader with a larger batch size.
    '''

    preds, tot_labels, datas = [], [], []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            gold = batch['labels']
            sentences = batch['texts']
            datas.append(sentences)

            outcome = predict_patronize(sentences, tokenizer, model)

            preds.append(outcome['prediction'].tolist())
            tot_labels.append(gold.tolist())

    report = classification_report(
        tot_labels,
        preds,
        target_names=["Not Patronizing", "Patronizing"],
        output_dict=True,
    )

    return report, preds, tot_labels, datas

In [None]:
class Trainer_patronizing(Trainer):
    ''' Trainer that replaces the default loss with a class-weighted
    cross-entropy, using the notebook-level ``class_weights`` tensor.
    '''

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        ''' Function to compute the weighted loss.

        Args:
            model (Huggingface Model): model being used
            inputs (dict): batch from the data collator; must contain the
                key 'labels', which is removed from the dict here
            return_outputs (bool): when True, return ``(loss, outputs)``
                instead of only the loss. The Trainer passes this flag
                during evaluation and prediction.
            **kwargs: accepted and ignored, so that additional keyword
                arguments passed by newer Trainer versions (for example
                ``num_items_in_batch``) do not break the override.

        Returns:
            loss (torch.Tensor): the weighted cross-entropy loss, or the
            tuple ``(loss, outputs)`` when ``return_outputs`` is True
        '''

        labels = inputs.pop('labels')

        outputs = model(**inputs, labels=labels)
        logits = outputs.get("logits")

        # Keep the weights on the same device as the logits so the loss also
        # works when the model is not on the device chosen at notebook start.
        criterion = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = criterion(logits.view(-1, self.model.config.num_labels), 
                         labels.view(-1))        
        return (loss, outputs) if return_outputs else loss

In [None]:
# Sweep set-up; only executed when hyperparameter optimisation is switched on
if wandb_tog:
    import wandb
    import logging  # NOTE(review): not used anywhere in this notebook

    # Log in to wandb
    wandb.login()

    # Hyperparameters to vary in the sweep. The optimised metric "f1_score"
    # is the value logged by main_patronize() after each run.
    sweep_config = {
        "name": "vanilla-sweep",
        "method": "bayes",
        "metric": {"name": "f1_score", "goal": "maximize"},
        "parameters": {
            "num_train_epochs": {"min": 1, "max": 10},
            "learning_rate": {
                "values":[1.5e-5, 2e-5, 3e-5, 4e-5]
            },
            "train_batch_size": {
                "values":[16, 32]
            },
        },
        "early_terminate": {"type": "hyperband", "min_iter": 3,},
    }

    # Register the sweep; the returned id is handed to wandb.agent() later
    sweep_id = wandb.sweep(sweep_config, 
                           project="DeBERTa (HF) - Hyperparameter Optimization")

In [None]:
def main_patronize():
    ''' Function to train the model and save it. Evaluates against val set if
    tuning hyperparameters with wandb.

    Reads the notebook-level objects ``wandb_tog``, ``train_dataset``,
    ``tokenizer`` and ``val_loader``. The trained model is written to the
    current working directory ('./'). In a wandb run the F1 score of the
    "Patronizing" class on the validation loader is logged as ``f1_score``.
    '''

    # If optimising set hyperparameters according to sweep, else use defaults
    if wandb_tog:
        wandb.init()
        learning_rate = wandb.config.learning_rate
        per_device_train_batch_size = wandb.config.train_batch_size
        num_train_epochs = wandb.config.num_train_epochs
    else:
        learning_rate = 0.00004
        per_device_train_batch_size = 32
        num_train_epochs = 9

    # Specify the model. We use DeBERTa-base
    model = DebertaForSequenceClassification.from_pretrained(
        "microsoft/deberta-base"
        )
    
    # Initialise the training args. With WandB optimization, optimal are 9 
    # epochs, lr = 4e-5 and batch_size = 32. Rest are default args.
    training_args = TrainingArguments(
        output_dir='./',
        learning_rate=learning_rate,
        warmup_steps=100,
        lr_scheduler_type="linear",
        logging_steps=100,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-6,
        max_grad_norm=1.0, 
        save_steps=2500
    )

    # Prepare trainer; the dataset's collate_fn tokenises each batch
    trainer = Trainer_patronizing(
        model=model,                         
        args=training_args,                 
        train_dataset=train_dataset,                   
        data_collator=train_dataset.collate_fn
    )

    # Train and save model
    trainer.train()
    trainer.save_model('./')

    # If optimising, evaluate and log f1_score
    if wandb_tog:
        model_name = './' 
        model = DebertaForSequenceClassification.from_pretrained(model_name)
        # evaluate() returns (report, preds, labels, texts); only the report
        # is needed here. Indexing the whole tuple with a string key raised a
        # TypeError before.
        report = evaluate(model, tokenizer, val_loader)[0]
        f1 = report['Patronizing']['f1-score']

        wandb.log({"f1_score": f1})  
        # wandb.finish() is the current name of the deprecated wandb.join()
        wandb.finish()

In [None]:
# Release cached CUDA memory before training starts
torch.cuda.empty_cache()

# Train once with the default hyperparameters, or let the wandb agent drive
# the sweep runs when optimising
if not wandb_tog:
    main_patronize()
else:
    wandb.agent(sweep_id, function=main_patronize, count=10)

Downloading:   0%|          | 0.00/533M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

Step,Training Loss


KeyboardInterrupt: ignored

In [None]:
# Load the fine-tuned model that was saved to the working directory
model_name = './' 
model = DebertaForSequenceClassification.from_pretrained(model_name)

# Evaluate model / make predictions
report, preds, trues, datas = evaluate(model, tokenizer, val_loader)

# Full report first, then accuracy and the F1 score of each class on
# separate lines
print(report)
print(report['accuracy'],
      report['Not Patronizing']['f1-score'],
      report['Patronizing']['f1-score'],
      sep='\n')

loading configuration file ./config.json
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "architectures": [
    "DebertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

loading weights file ./pytorch_model.bin
All model checkpoint weights were used when initializing DebertaForSequenceClassification.

All the wei

  0%|          | 0/2093 [00:00<?, ?it/s]

{'Not Patronizing': {'precision': 0.9447599380485286, 'recall': 0.9662090813093981, 'f1-score': 0.9553641346906814, 'support': 1894}, 'Patronizing': {'precision': 0.5897435897435898, 'recall': 0.4623115577889447, 'f1-score': 0.5183098591549297, 'support': 199}, 'accuracy': 0.9182990922121357, 'macro avg': {'precision': 0.7672517638960592, 'recall': 0.7142603195491715, 'f1-score': 0.7368369969228055, 'support': 2093}, 'weighted avg': {'precision': 0.9110053975264633, 'recall': 0.9182990922121357, 'f1-score': 0.9138095236865655, 'support': 2093}}
0.9182990922121357
0.9553641346906814
0.5183098591549297


In [None]:
# Collect the predictions in a one-column frame so they can be exported
new_df = pd.DataFrame([(pred,) for pred in preds], columns=['Preds'])

In [None]:
# # Exporting saved outputs
# new_df.to_csv('/content/drive/MyDrive/NLP_cw/task1.csv')