#**DeBERTa Model**

### Installing required libraries

In [None]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [3]:
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [4]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [5]:
import torch
from torch.utils.checkpoint import checkpoint
import torch.nn as nn

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [25]:
torch.cuda.get_device_name(0)

'Tesla T4'

### Define configuration

[Models](https://huggingface.co/models)

In [7]:
model_name = "microsoft/deberta-v3-base"

### Prepare data

Get data and apply simple normalisation if necessary

In [9]:
df = pd.read_csv("../data/sample/news_headlines_test_submission.csv")
df

Unnamed: 0,text,sentiment
0,"In addition , a further 29 employees can be la...",-1
1,The authorisation is in force until the end of...,0
2,The value of the deal was not disclosed .,0
3,You need to be ready when the window opens up ...,0
4,Major Order in India Comptel Corporation has r...,1
...,...,...
3188,The Insolvency Act regulates the amount of deb...,0
3189,We have also cut our price projections for pap...,-1
3190,"Tyrvaan Sanomat , published twice a week by Ty...",0
3191,"pct lower at 4,442.10 .",0


In [10]:
def normalise(text):
    text = text.lower()
    return text

df['text'] = df['text'].apply(normalise)

Get tokeniser (for NSP)

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 512

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Split into train-valid

In [12]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["sentiment"])


### Create tokenised dataset

In [13]:
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

In [14]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.shuffle(seed=42).map(tokenize_function, batched=True)
tokenized_test_dataset = valid_dataset.shuffle(seed=42).map(tokenize_function, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
tokenized_train_dataset

Dataset({
    features: ['text', 'sentiment', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2554
})

### Define Model

Dynamic Padding

In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Model

In [17]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Training arguments

In [18]:
MODEL_PATH = "./results" 

# training_args = TrainingArguments(
#     MODEL_PATH,
#     num_train_epochs=2,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     warmup_ratio=0.1, 
#     lr_scheduler_type='cosine',
#     # Optimising
#     auto_find_batch_size=True,
#     # The num of workers may vary for different machines, if you are not sure, just comment this line out
#     dataloader_num_workers=2,
#     gradient_accumulation_steps=4,
#     fp16=True,
# )

train_args = TrainingArguments(
    MODEL_PATH,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio = 0.1,
    evaluation_strategy='epoch',
)

Combating class imbalance with class weights

In [20]:
# Calculating the weights
# Weightage = 1 - (num_of_samples_of_class)/(total_num_of_samples)
# less samples, more weightage

w_neg = 1-len(df[df['sentiment'] == -1])/len(train_df)
w_neu = 1-len(df[df['sentiment'] == 0])/len(train_df)
w_pos = 1-len(df[df['sentiment'] == 1])/len(train_df)

class_weights = torch.tensor(
    [w_neg, w_neu, w_pos]
).cuda()

class_weights

tensor([0.8485, 0.2569, 0.6445], device='cuda:0')

Define Trainer

In [21]:
# override the compute_loss function of the Trainer and introduce our class weighgts
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # compute custom loss
        # Class weighting
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [22]:
from sklearn.metrics import accuracy_score

def evaluation(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, preds)}

In [23]:
trainer = CustomTrainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=evaluation,
)

### Train Model

In [24]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: sentiment, text, __index_level_0__. If sentiment, text, __index_level_0__ are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2554
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 800


RuntimeError: ignored