# <font color = 'indianred'> **1. Setting up the Environment** </font>



In [1]:
# If in Colab, then import the drive module from google.colab
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  # Mount the Google Drive to access files stored there
  drive.mount('/content/drive')

  # Install the latest version of torchtext library quietly without showing output
  !pip install torchtext -qq
  !pip install transformers evaluate wandb datasets accelerate -U -qq ## NEW LINES ##
  basepath = '/content/drive/MyDrive/data/'
else:
  basepath = '/home/harpreet/Insync/google_drive_shaannorr/data'

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m


<font color = 'indianred'> *Load Libraries* </font>

In [2]:
# Importing PyTorch library for tensor computations and neural network modules
import torch
import torch.nn as nn

# For working with textual data vocabularies and for displaying model summaries
from torchtext.vocab import vocab

# General-purpose Python libraries for random number generation and numerical operations
import random
import numpy as np

# Utilities for efficient serialization/deserialization of Python objects and for element tallying
import joblib
from collections import Counter

# For creating lightweight attribute classes and for partial function application
from functools import partial

# For filesystem path handling, generating and displaying confusion matrices, and date-time manipulations
from pathlib import Path
from sklearn.metrics import confusion_matrix
from datetime import datetime

# For plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
#!pip install datasets

# imports from Huggingface ecosystem
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PreTrainedModel, PretrainedConfig
from transformers import TrainingArguments, Trainer
from datasets import Dataset
#!pip install evaluate
import evaluate

# wandb library
#!pip install wandb
import wandb

# <font color = 'indianred'> **2. Load Data set**
    


In [3]:
base_folder = Path(basepath)
data_folder = base_folder / 'datasets/my_project'
model_folder = base_folder / 'models/nlp_spring_2024/my_project'

In [4]:
model_folder.mkdir(exist_ok=True, parents=True)
data_folder.mkdir(exist_ok=True, parents=True)

In [5]:
# Load train data
import pandas as pd
train_data = pd.read_csv("/content/drive/MyDrive/data/datasets/train.csv")


# <font color = 'indianred'> **3. Accessing and Manuplating Splits**</font>



In [6]:
# Split features and labels
y = train_data[['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love','optimism', 'pessimism', 'sadness', 'surprise', 'trust']].values.astype(float)
X = train_data.drop(columns=['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love','optimism', 'pessimism', 'sadness', 'surprise', 'trust'])

# Split data into train, test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Further split train data into train and validation
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2


<font color = 'indianred'>*Extract Splits*

In [7]:
# Convert X_train, X_valid to lists
X_train_list = list(X_train['Tweet'])
X_valid_list = list(X_valid['Tweet'])
X_test_list = list(X_test['Tweet'])

<font color = 'indianred'>*Create futher subdivions of the splits*</font>

In [8]:
# Create Datasets
train_set = Dataset.from_dict({
    'texts': X_train_list,
    'labels': y_train
})

valid_set = Dataset.from_dict({
    'texts': X_valid_list,
    'labels': y_valid
})

test_set = Dataset.from_dict({
    'texts': X_test_list,
    'labels': y_test
})

In [9]:
train_set

Dataset({
    features: ['texts', 'labels'],
    num_rows: 4634
})

In [10]:
# Print the shapes of train, validation, and test sets
print("Train set shape:", len(train_set))
print("Validation set shape:", len(valid_set))
print("Test set shape:", len(test_set))


Train set shape: 4634
Validation set shape: 1545
Test set shape: 1545


In [11]:
# Create DatasetDict
from datasets import DatasetDict

data_dict = DatasetDict({
    "train": train_set,
    "valid": valid_set
})

In [13]:

# New libraries introduced in this notebook
import evaluate
from datasets import load_dataset, DatasetDict
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoConfig
from transformers import pipeline
import wandb

# <font color = 'indianred'>**4. Load pre-trained Tokenizer**</font>



In [14]:
checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

#<font color = 'indianred'> **5. Create function for Tokenizer**



In [15]:
# Define a function to tokenize a batch

def tokenize_batch(batch):
    return tokenizer(text = batch["texts"], truncation=True, padding=True, return_tensors="pt")

<font color = 'indianred'> *Use map function to apply tokenization to all splits*

In [16]:
tokenized_dataset= data_dict.map(tokenize_batch, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(
    ['texts']
)
tokenized_dataset.set_format(type='torch')

Map:   0%|          | 0/4634 [00:00<?, ? examples/s]

Map:   0%|          | 0/1545 [00:00<?, ? examples/s]

#  <font color = 'indianred'> **6. Model Training**

##  <font color = 'indianred'> **6.1 Download pre-trained model**

##  <font color = 'indianred'> **6.2 Downaload and Modify Model Config File**

In [17]:
from transformers import AutoModelForSequenceClassification, AutoConfig

# Instantiate the model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=11)  # Adjust num_labels according to your task
# Load the configuration
config = AutoConfig.from_pretrained(checkpoint)

# Get class names from the dataset
class_names = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Create id to label and label to id mappings
id2label = {id_: label_ for id_, label_ in enumerate(class_names)}
label2id = {label_: id_ for id_, label_ in enumerate(class_names)}

# Update the configuration with mappings
config.id2label = id2label
config.label2id = label2id

# Associate the updated configuration with the model
model.config = config


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
config.problem_type='multi_label_classification'

##  <font color = 'indianred'> **6.3 compute_metrics function** </font>



In [19]:
def compute_metrics(eval_pred):
    combined_metrics = evaluate.combine([evaluate.load("accuracy"),
                                        evaluate.load("f1", average="macro")])

    logits, labels = eval_pred
    predictions = (logits >= 0).astype(int)
    predictions = predictions.reshape(-1)
    labels = labels.reshape(-1)
    evaluations = combined_metrics.compute(
        predictions=predictions, references=labels,)
    return evaluations

In [20]:
config

RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "anger",
    "1": "anticipation",
    "2": "disgust",
    "3": "fear",
    "4": "joy",
    "5": "love",
    "6": "optimism",
    "7": "pessimism",
    "8": "sadness",
    "9": "surprise",
    "10": "trust"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "anger": 0,
    "anticipation": 1,
    "disgust": 2,
    "fear": 3,
    "joy": 4,
    "love": 5,
    "optimism": 6,
    "pessimism": 7,
    "sadness": 8,
    "surprise": 9,
    "trust": 10
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "abs

In [21]:
#!pip install accelerate>=0.21.0
#!pip install transformers[torch]

## <font color = 'indianred'> **6.4 Training Arguments**</font>







In [22]:
from pathlib import Path
from transformers import TrainingArguments, Trainer

# Define the directory where model checkpoints will be saved
run_name = "distilroberta_base_final"
base_folder = Path("/content/drive/MyDrive/data")
model_folder = base_folder / "models" / run_name
model_folder.mkdir(exist_ok=True, parents=True)

# Configure training parameters
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    learning_rate=2e-5,
    optim='adamw_torch',
    output_dir=str(model_folder),
    evaluation_strategy='steps',
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_strategy='steps',
    logging_steps=100,
    report_to='wandb',
    run_name=run_name,
    fp16=True
)




##  <font color = 'indianred'> **6.5 Initialize Trainer**</font>



In [23]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## <font color = 'indianred'> **6.6 Setup WandB**</font>

In [24]:
wandb.login()
%env WANDB_PROJECT = homework_6_sentiment_analysis

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


env: WANDB_PROJECT=homework_6_sentiment_analysis


##  <font color = 'indianred'> **6.7 Training and Validation**

In [25]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mlikith-gv[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.4832,0.410498,0.823477,0.418154
200,0.4067,0.37821,0.842012,0.580927
300,0.3611,0.361677,0.852486,0.599968
400,0.3362,0.353776,0.856546,0.614241
500,0.3282,0.345139,0.859841,0.626997
600,0.3332,0.337257,0.864195,0.641615
700,0.3064,0.33344,0.867549,0.649322
800,0.2996,0.335433,0.86496,0.648707


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

TrainOutput(global_step=870, training_loss=0.3525390252299692, metrics={'train_runtime': 215.7717, 'train_samples_per_second': 64.429, 'train_steps_per_second': 4.032, 'total_flos': 276749661921516.0, 'train_loss': 0.3525390252299692, 'epoch': 3.0})

<font color = 'indianred'> *Evaluate model on Validation Set* </font>


In [26]:
eval_results = trainer.evaluate(tokenized_dataset["valid"])

In [27]:
eval_results

{'eval_loss': 0.3334404230117798,
 'eval_accuracy': 0.8675492791997647,
 'eval_f1': 0.6493223243495871,
 'eval_runtime': 5.3189,
 'eval_samples_per_second': 290.474,
 'eval_steps_per_second': 18.237,
 'epoch': 3.0}

In [28]:
wandb.log({"eval_accuracy": eval_results["eval_accuracy"], "eval_loss": eval_results["eval_loss"], "eval_f1": eval_results["eval_f1"]})

In [29]:
valid_output = trainer.predict(tokenized_dataset["valid"])

In [30]:
valid_preds = np.argmax(valid_output.predictions, axis=1)
valid_labels = np.array(valid_output.label_ids)


<font color = 'indianred'> *Get best checkpoint*</font>


In [31]:
# After training, let us check the best checkpoint
# We need this for Inference
best_model_checkpoint_step = trainer.state.best_model_checkpoint.split('-')[-1]
print(f"The best model was saved at step {best_model_checkpoint_step}.")


The best model was saved at step 700.


#  <font color = 'indianred'> **7. Performance on Test Set** </font>
For test set evaluation, we will perfrom following steps:



In [32]:
checkpoint = str(model_folder/f'checkpoint-{best_model_checkpoint_step}')
checkpoint

'/content/drive/MyDrive/data/models/distilroberta_base_final/checkpoint-700'

In [33]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [34]:
test_set_tokenized = test_set.map(tokenize_batch, batched=True)


Map:   0%|          | 0/1545 [00:00<?, ? examples/s]

<font color = 'indianred'>*Training Arguments*</font>

In [35]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=128,
    do_train=False,
    do_eval=True,
    report_to=[],
    run_name=run_name
)

<font color = 'indianred'>*Instantiate Trainer*</font>

In [36]:
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_set_tokenized,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


<font color = 'indianred'>*Evaluate using Trainer*</font>

In [37]:
test_results= trainer.evaluate()

In [38]:
test_results

{'eval_loss': 0.3295323848724365,
 'eval_accuracy': 0.8657252132980289,
 'eval_f1': 0.6394944707740916,
 'eval_runtime': 7.621,
 'eval_samples_per_second': 202.73,
 'eval_steps_per_second': 1.706}

In [39]:
wandb.log({"test_accuracy": test_results["eval_accuracy"], "test_loss": test_results["eval_loss"], "test_f1": test_results["eval_f1"]})


In [40]:
test_output = trainer.predict(test_set_tokenized)

In [41]:
test_preds = np.argmax(test_output.predictions, axis=1)
test_labels = np.array(test_output.label_ids)


In [42]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▄▆▆▇▇███
eval/f1,▁▆▇▇▇████
eval/loss,█▅▄▃▂▁▁▁▁
eval/runtime,█▁▄▄▂▂▄▁▄
eval/samples_per_second,▁█▅▅▇▆▅█▅
eval/steps_per_second,▁█▅▅▇▆▅█▅
eval_accuracy,▁
eval_f1,▁
eval_loss,▁
test_accuracy,▁

0,1
eval/accuracy,0.86755
eval/f1,0.64932
eval/loss,0.33344
eval/runtime,5.3189
eval/samples_per_second,290.474
eval/steps_per_second,18.237
eval_accuracy,0.86755
eval_f1,0.64932
eval_loss,0.33344
test_accuracy,0.86573


# <Font color = 'indianred'> **8. Model Inference**



In [43]:
checkpoint = str(model_folder/f'checkpoint-{best_model_checkpoint_step}')
custom_pipeline = pipeline(
    task="text-classification",
    model=checkpoint,
    tokenizer=checkpoint,
    device=0)

<font color = 'indianred'> **Test the pipeline on a single example**

In [44]:
sample = test_set['texts'][0]
preds = custom_pipeline(sample)
preds


[{'label': 'joy', 'score': 0.8089097738265991}]

<font color = 'indianred'> **Test the pipeline on a large dataset**

In [45]:
predictions = custom_pipeline(test_set['texts'], truncation=True)

In [46]:
predictions[0:10]


[{'label': 'joy', 'score': 0.8089097738265991},
 {'label': 'anger', 'score': 0.7798399329185486},
 {'label': 'optimism', 'score': 0.6998541951179504},
 {'label': 'disgust', 'score': 0.7801336050033569},
 {'label': 'sadness', 'score': 0.9194594025611877},
 {'label': 'disgust', 'score': 0.7292709350585938},
 {'label': 'anger', 'score': 0.8774198889732361},
 {'label': 'joy', 'score': 0.956801176071167},
 {'label': 'joy', 'score': 0.564007043838501},
 {'label': 'optimism', 'score': 0.7397385239601135}]