<a href="https://colab.research.google.com/github/danieltannn/ITP_24/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd

# Loading and Cleaning the dataset
# The cleaning process involves converting all the "sarc" and "nosarc" to 1 and 0 respectively
# The cleaning process also included a new header "label" and dropping the "class" header
df = pd.read_csv('GEN-sarc-notsarc.csv', usecols=['text', 'class'])
df['label'] = df['class'].apply(lambda x: 1 if x == 'sarc' else 0)
df = df.drop('class', axis=1)
df

Unnamed: 0,text,label
0,"If that's true, then Freedom of Speech is doom...",0
1,Neener neener - is it time to go in from the p...,0
2,"Just like the plastic gun fear, the armour pie...",0
3,So geology is a religion because we weren't he...,0
4,Well done Monty. Mark that up as your first ev...,0
...,...,...
6515,depends on when the baby bird died. run alon...,1
6516,"ok, sheesh, to clarify, women who arent aborti...",1
6517,so.. eh?? hows this sound? will it fly w...,1
6518,"I think we should put to a vote, the right of ...",1


In [4]:
# This code is to show that the amount of sarc and nosarc is equal at 3260 each 
df['label'].value_counts()

0    3260
1    3260
Name: label, dtype: int64

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
     

In [6]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')  
# Use GPU if available else use CPU
# Note: GPU usuage is only available for NVDIA gpu and not Radeon 

In [7]:
# Loading in the tokenizer instance that is pre-trained on bert-based-uncased model 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [8]:
# Splitting the dataset into train and validation set. 
# A random state is set to ensure that each time the function is called, the same splits are generated, this is useful for reproducibility and fairness in models comparing. 
x = list(df["text"])
y = list(df["label"])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

# Using the tokenizer instance to encode the text data into numerical representation so that it can be used as input to the BERT model
# The truncation parameter is set to True, meaning that the tokenizer will truncate any input that exceeds the maximum sequence length which is 512 char.
# The padding parameter is set to True, meaning that the tokenizer will add padding tokens to the end of any inputs that are shorter than the 512 char. 
train_encodings = tokenizer(x_train, padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(x_test, padding=True, truncation=True, max_length=512)

In [9]:
# Create a PyTorch 'TensorDataset' instances for the training and testing data
# A TensorDataset is a PyTorch class that takes one or more PyTorch tensors as input, and returns a dataset that can be used for training or evaluation.
# In this code, I used the 'input_ids', 'attention_mask' and 'label values' for the training and testing data. 
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(train_encodings, y_train)
test_dataset = Dataset(test_encodings, y_test)

In [10]:
# Create a PyTorch 'DataLoader' instances for the training and testing data
# A DataLoader is a PyTorch class that provides an iterable over a dataset, allowing us to efficiently batch and load data during training or evaluation. 
# The 'shuffle' argument determines whether the data will be shuffled at the beginning of each epoch. 
# For training data, data will be shuffled however for testing, it is kept the same to ensure the same order each time it is loaded which is necessary for consistent evaluation 
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False) 

In [11]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
# The reason I am using BertForSequenceClassification as it is a pre-trained BERT model that has been specifically fine-tuned for sequence classification tasks. 
# In the case of sarcasm detection, we want to classify each input text sequence as either sarcastic or not, so we will set num_labels to 2.
# Therefore, using it we can leverage the power of the pre-trained BERT model while fine-tuning it specifically for sarcasm detection. 

optimizer = AdamW(model.parameters(), lr=5e-5)
# The AdamW is a variant of the Adam optimizer that includes an additional term for weight decay, which helps to prevent overfitting

loss_fn = torch.nn.CrossEntropyLoss()
# During training, the function is used to calculate the loss between the predicted and target lables for each batch of data, and the backpropgate the loss through the model to update the model parameters. 
# The goal is to minimize the loss function over the entire training set, which will result in a model that can accurately classify sarcasm and non. 

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
# Utilising the Trainer class to simplify the training and evaluation process. 

# Define training arguments
training_args = TrainingArguments(
    output_dir='results',            # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy='epoch',     # evaluation strategy to adopt during training
    save_strategy='epoch',           # checkpoint save strategy
    load_best_model_at_end=True,     # load the best model when finished training
    metric_for_best_model='accuracy',# metric to use for best model selection
    greater_is_better=True           # direction of the metric for best model selection
)

# Defining metrices
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'accuracy': (preds == labels).mean(),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Define the trainer
trainer = Trainer(
    model=model,                     # the instantiated Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    eval_dataset=test_dataset,       # evaluation dataset
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss


In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [None]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [None]:

def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# Define Trainer with 10 epoch
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=10,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()



Step,Training Loss
500,0.547
1000,0.4071
1500,0.3275
2000,0.2336
2500,0.1144
3000,0.0612
3500,0.0493
4000,0.0205
4500,0.0063
5000,0.0066


<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 2.042825698852539,
 'eval_accuracy': 0.7891104294478528,
 'eval_precision': 0.8095238095238095,
 'eval_recall': 0.7561349693251533,
 'eval_f1': 0.781919111816019,
 'eval_runtime': 20.9335,
 'eval_samples_per_second': 62.292,
 'eval_steps_per_second': 7.787,
 'epoch': 10.0}

In [None]:
# Define Trainer with 5 epoch 
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=5,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()




Step,Training Loss
500,0.1198
1000,0.0551
1500,0.0305
2000,0.0221
2500,0.0145
3000,0.0017


<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 1.9916456937789917,
 'eval_accuracy': 0.7883435582822086,
 'eval_precision': 0.7965299684542587,
 'eval_recall': 0.7745398773006135,
 'eval_f1': 0.7853810264385691,
 'eval_runtime': 20.7549,
 'eval_samples_per_second': 62.828,
 'eval_steps_per_second': 7.854,
 'epoch': 5.0}

In [None]:
# Define Trainer with 3 epoch 
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()




Step,Training Loss
500,0.0596
1000,0.0342
1500,0.0069


<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 2.187504291534424,
 'eval_accuracy': 0.7776073619631901,
 'eval_precision': 0.7882165605095541,
 'eval_recall': 0.75920245398773,
 'eval_f1': 0.7734375,
 'eval_runtime': 20.5039,
 'eval_samples_per_second': 63.598,
 'eval_steps_per_second': 7.95,
 'epoch': 3.0}