# <font color = 'indianred'> **1. Setting up the Environment** </font>



In [None]:
# Environment setup: on Colab, mount Google Drive and install dependencies;
# in both branches define `basepath`, the root data directory used by later cells.
# If in Colab, then import the drive module from google.colab
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  # Mount the Google Drive to access files stored there
  drive.mount('/content/drive')

  # Install the latest version of torchtext library quietly without showing output
  !pip install torchtext -qq
  # Install/upgrade transformers, evaluate, wandb, datasets and accelerate (-U upgrades, -qq silences output)
  !pip install transformers evaluate wandb datasets accelerate -U -qq ## NEW LINES ##
  basepath = '/content/drive/MyDrive/data/'
else:
  # Outside Colab: use a local directory instead of the mounted Drive.
  # NOTE(review): this is a machine-specific absolute path — adjust before running elsewhere.
  basepath = '/home/harpreet/Insync/google_drive_shaannorr/data'

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m


<font color = 'indianred'> *Load Libraries* </font>

In [None]:
# Importing PyTorch library for tensor computations and neural network modules
import torch
import torch.nn as nn

# For working with textual data vocabularies and for displaying model summaries
from torchtext.vocab import vocab

# General-purpose Python libraries for random number generation and numerical operations
import random
import numpy as np

# Utilities for efficient serialization/deserialization of Python objects and for element tallying
import joblib
from collections import Counter

# For creating lightweight attribute classes and for partial function application
from functools import partial

# For filesystem path handling, generating and displaying confusion matrices, and date-time manipulations
from pathlib import Path
from sklearn.metrics import confusion_matrix
from datetime import datetime

# For plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

# imports from Huggingface ecosystem
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PreTrainedModel, PretrainedConfig
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate

# wandb library
import wandb

# <font color = 'indianred'> **2. Load Dataset** </font>
    


In [None]:
# Resolve the dataset and model directories underneath the configured base path
base_folder = Path(basepath)
data_folder = base_folder.joinpath('datasets', 'my_project')
model_folder = base_folder.joinpath('models', 'nlp_spring_2024', 'my_project')

In [None]:
# Create both working directories (including parents); existing ones are left untouched
for _folder in (model_folder, data_folder):
    _folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Load train data
import pandas as pd

# Read the CSV relative to `base_folder` (derived from `basepath`) instead of a
# hard-coded Colab path, so the cell also works in the non-Colab branch.
train_data = pd.read_csv(base_folder / 'datasets' / 'train.csv')


# <font color = 'indianred'> **3. Accessing and Manipulating Splits**</font>



In [None]:
# Columns holding the targets; every remaining column stays in the feature frame
label_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
                 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
y = train_data[label_columns].to_numpy().astype(float)
X = train_data.drop(columns=label_columns)

# Hold out 20% of the rows as the test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

# Carve the validation split out of the remaining rows (0.25 x 0.8 = 0.2 of all rows)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, random_state=1, test_size=0.25
)


<font color = 'indianred'>*Extract Splits*

In [None]:
# Pull the 'Tweet' column of every split out as a plain Python list
X_train_list = X_train['Tweet'].tolist()
X_valid_list = X_valid['Tweet'].tolist()
X_test_list = X_test['Tweet'].tolist()

<font color = 'indianred'>*Create further subdivisions of the splits*</font>

In [None]:
# Create Datasets
def _build_split(texts, labels):
    """Wrap one split's texts and label matrix in a `Dataset` with 'texts'/'labels' columns."""
    return Dataset.from_dict({'texts': texts, 'labels': labels})


train_set = _build_split(X_train_list, y_train)
valid_set = _build_split(X_valid_list, y_valid)
test_set = _build_split(X_test_list, y_test)

In [None]:
train_set

Dataset({
    features: ['texts', 'labels'],
    num_rows: 4634
})

In [None]:


# Report the number of examples in each split
for _caption, _split in (
    ("Train set shape:", train_set),
    ("Validation set shape:", valid_set),
    ("Test set shape:", test_set),
):
    print(_caption, len(_split))


Train set shape: 4634
Validation set shape: 1545
Test set shape: 1545


In [None]:

# Bundle the train and validation splits under the keys "train" and "valid"
from datasets import DatasetDict

data_dict = DatasetDict(train=train_set, valid=valid_set)

In [None]:

# New libraries introduced in this notebook
import evaluate
from datasets import load_dataset, DatasetDict
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoConfig
from transformers import pipeline
import wandb

# <font color = 'indianred'>**4. Load pre-trained Tokenizer**</font>



In [None]:
# Name of the pre-trained checkpoint; reused below to load the matching model
checkpoint = "bert-base-uncased"
# Load the tokenizer that belongs to this checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

#<font color = 'indianred'> **5. Create function for Tokenizer**



In [None]:
# Define a function to tokenize a batch

def tokenize_batch(batch):
    """Tokenize the ``"texts"`` entries of a batch with the global ``tokenizer``.

    Args:
        batch: Mapping with a ``"texts"`` key holding a list of strings, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.

    Padding is intentionally not applied here. Padding inside ``map`` pads
    every example to the longest text of the whole map batch, and any tensor
    type requested here is lost once the result is stored in the dataset.
    The ``Trainer`` instances in this notebook receive ``tokenizer=tokenizer``,
    so padding is expected to happen per training/eval batch in the data
    collator instead (dynamic padding) — see the Transformers documentation.
    """
    return tokenizer(text=batch["texts"], truncation=True)

<font color = 'indianred'> *Use map function to apply tokenization to all splits*

In [None]:
# Tokenize every split in batches, drop the raw text column,
# then make the dataset return torch tensors
tokenized_dataset = (
    data_dict
    .map(tokenize_batch, batched=True)
    .remove_columns(['texts'])
)
tokenized_dataset.set_format(type='torch')

Map:   0%|          | 0/4634 [00:00<?, ? examples/s]

Map:   0%|          | 0/1545 [00:00<?, ? examples/s]

#  <font color = 'indianred'> **6. Model Training**

##  <font color = 'indianred'> **6.1 Download pre-trained model**

##  <font color = 'indianred'> **6.2 Download and Modify Model Config File** </font>

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig

# Get class names from the dataset
class_names = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Create id to label and label to id mappings
id2label = dict(enumerate(class_names))
label2id = {label_: id_ for id_, label_ in id2label.items()}

# Build the complete configuration FIRST (label count, label mappings and the
# multi-label problem type) ...
config = AutoConfig.from_pretrained(
    checkpoint,
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id,
    problem_type='multi_label_classification',
)

# ... and instantiate the model from it, so the model is constructed with the
# same configuration it carries afterwards (instead of swapping `model.config`
# for a freshly loaded config after the model has been built).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Declare the task as multi-label classification on the config object
config.problem_type='multi_label_classification'

##  <font color = 'indianred'> **6.3 compute_metrics function** </font>



In [None]:
def compute_metrics(eval_pred):
    """Compute multi-label accuracy and macro-averaged F1 from raw logits.

    Args:
        eval_pred: Pair ``(logits, labels)`` of array-likes with shape
            ``(num_examples, num_labels)``; labels are 0/1 indicators.

    Returns:
        dict with
        - ``"accuracy"``: fraction of individual (example, label) decisions
          that are correct, i.e. accuracy over the flattened label matrix.
        - ``"f1"``: F1 computed separately for every label column and then
          averaged (macro average). A label with no true and no predicted
          positives contributes an F1 of 0.

    The metrics are computed directly with numpy, so nothing has to be loaded
    each time the function is called during training.
    """
    logits, labels = eval_pred

    # A logit >= 0 corresponds to a sigmoid probability >= 0.5
    predictions = (np.asarray(logits) >= 0).astype(int)
    references = np.asarray(labels).astype(int)

    if predictions.size == 0:
        return {"accuracy": 0.0, "f1": 0.0}

    # View both as (num_examples, num_labels) so counts can be taken per label
    num_labels = predictions.shape[-1]
    predictions = predictions.reshape(-1, num_labels)
    references = references.reshape(-1, num_labels)

    accuracy = float((predictions == references).mean())

    # Per-label confusion counts
    true_pos = ((predictions == 1) & (references == 1)).sum(axis=0)
    false_pos = ((predictions == 1) & (references == 0)).sum(axis=0)
    false_neg = ((predictions == 0) & (references == 1)).sum(axis=0)

    # F1 = 2TP / (2TP + FP + FN); defined as 0 where the denominator is 0
    denominator = 2 * true_pos + false_pos + false_neg
    per_label_f1 = np.divide(
        2 * true_pos,
        denominator,
        out=np.zeros(num_labels, dtype=float),
        where=denominator > 0,
    )

    return {"accuracy": accuracy, "f1": float(per_label_f1.mean())}

In [None]:
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "anger",
    "1": "anticipation",
    "2": "disgust",
    "3": "fear",
    "4": "joy",
    "5": "love",
    "6": "optimism",
    "7": "pessimism",
    "8": "sadness",
    "9": "surprise",
    "10": "trust"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "anger": 0,
    "anticipation": 1,
    "disgust": 2,
    "fear": 3,
    "joy": 4,
    "love": 5,
    "optimism": 6,
    "pessimism": 7,
    "sadness": 8,
    "surprise": 9,
    "trust": 10
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "probl

In [None]:
#!pip install accelerate>=0.21.0
#!pip install transformers[torch]

## <font color = 'indianred'> **6.4 Training Arguments**</font>







In [None]:
from pathlib import Path
from transformers import TrainingArguments, Trainer

# Define the directory where model checkpoints will be saved.
# The folder is derived from `basepath` (set in the environment cell) rather
# than a hard-coded Colab path, so the cell also works outside Colab.
run_name = "bert_base_uncased_final"
base_folder = Path(basepath)
model_folder = base_folder / "models" / run_name
model_folder.mkdir(exist_ok=True, parents=True)

# Configure training parameters
training_args = TrainingArguments(
    # Optimisation
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    learning_rate=2e-5,
    optim='adamw_torch',
    # Checkpointing / evaluation: evaluate and save every 100 steps so that the
    # best checkpoint (by validation accuracy) can be restored at the end
    output_dir=str(model_folder),
    evaluation_strategy='steps',
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Logging
    logging_strategy='steps',
    logging_steps=100,
    report_to='wandb',
    run_name=run_name,
    # Mixed-precision training
    fp16=True
)




##  <font color = 'indianred'> **6.5 Initialize Trainer**</font>



In [None]:
# Build the Trainer for fine-tuning: model + arguments, the tokenized
# train/validation splits, the metric function and the tokenizer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["valid"],
    train_dataset=tokenized_dataset["train"],
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## <font color = 'indianred'> **6.6 Setup WandB**</font>

In [None]:
# Authenticate with Weights & Biases (prompts for an API key if none is stored)
wandb.login()
# Set the WANDB_PROJECT environment variable for this session
%env WANDB_PROJECT = homework_6_sentiment_analysis

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 69


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 69


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


env: WANDB_PROJECT=homework_6_sentiment_analysis


##  <font color = 'indianred'> **6.7 Training and Validation**

In [None]:
# Run fine-tuning with the arguments configured above
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mlikith-gv[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.4856,0.414834,0.833657,0.468709
200,0.4054,0.379332,0.844719,0.588364
300,0.3563,0.357289,0.854192,0.604532
400,0.326,0.341522,0.862136,0.6253
500,0.3114,0.334535,0.86702,0.646544
600,0.3154,0.329136,0.868726,0.653087
700,0.285,0.32613,0.870197,0.65205
800,0.2768,0.32625,0.869256,0.65731


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

TrainOutput(global_step=870, training_loss=0.34044425131260664, metrics={'train_runtime': 328.1318, 'train_samples_per_second': 42.367, 'train_steps_per_second': 2.651, 'total_flos': 506762006091912.0, 'train_loss': 0.34044425131260664, 'epoch': 3.0})

<font color = 'indianred'> *Evaluate model on Validation Set* </font>


In [None]:
# Evaluate the trainer's current model on the validation split
eval_results = trainer.evaluate(tokenized_dataset["valid"])

In [None]:
eval_results

{'eval_loss': 0.3261297345161438,
 'eval_accuracy': 0.8701971167990585,
 'eval_f1': 0.6520504731861199,
 'eval_runtime': 7.6515,
 'eval_samples_per_second': 201.922,
 'eval_steps_per_second': 12.677,
 'epoch': 3.0}

In [None]:
# Send the validation accuracy, loss and F1 to Weights & Biases
validation_metrics = {
    "eval_accuracy": eval_results["eval_accuracy"],
    "eval_loss": eval_results["eval_loss"],
    "eval_f1": eval_results["eval_f1"],
}
wandb.log(validation_metrics)

In [None]:
# Get raw predictions (and label ids) for the validation split
valid_output = trainer.predict(tokenized_dataset["valid"])

In [None]:
# Multi-label decoding: a label counts as predicted when its logit is >= 0
# (the same rule compute_metrics applies). argmax would keep only a single
# class per example and would not line up with the multi-hot label matrix.
valid_preds = (valid_output.predictions >= 0).astype(int)
valid_labels = np.array(valid_output.label_ids)


<font color = 'indianred'> *Get best checkpoint*</font>


In [None]:
# Look up the best checkpoint recorded by the trainer and take the step
# number after the last '-' in its path (needed later for inference)
best_checkpoint_path = trainer.state.best_model_checkpoint
best_model_checkpoint_step = best_checkpoint_path.rsplit('-', 1)[-1]
print(f"The best model was saved at step {best_model_checkpoint_step}.")


The best model was saved at step 700.


#  <font color = 'indianred'> **7. Performance on Test Set** </font>
For test set evaluation, we will perform the following steps:



In [None]:
# Path of the best checkpoint directory inside the model folder
checkpoint = str(model_folder.joinpath(f'checkpoint-{best_model_checkpoint_step}'))
checkpoint

'/content/drive/MyDrive/data/models/bert_base_uncased_final/checkpoint-700'

In [None]:
# Reload the model and tokenizer from the checkpoint directory selected above
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Tokenize the test split in batches with the same tokenize_batch function
test_set_tokenized = test_set.map(tokenize_batch, batched=True)


Map:   0%|          | 0/1545 [00:00<?, ? examples/s]

<font color = 'indianred'>*Training Arguments*</font>

In [None]:
# Evaluation-only arguments: no training and no experiment-tracker reporting
training_args = TrainingArguments(
    run_name=run_name,
    output_dir="./results",
    do_eval=True,
    do_train=False,
    per_device_eval_batch_size=128,
    report_to=[],
)

<font color = 'indianred'>*Instantiate Trainer*</font>

In [None]:
# Trainer used only for scoring the reloaded model on the tokenized test split
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=test_set_tokenized,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


<font color = 'indianred'>*Evaluate using Trainer*</font>

In [None]:
# Evaluate on the trainer's eval_dataset (the tokenized test split)
test_results= trainer.evaluate()

In [None]:
test_results

{'eval_loss': 0.32572656869888306,
 'eval_accuracy': 0.8697263901147396,
 'eval_f1': 0.6470025510204082,
 'eval_runtime': 7.0195,
 'eval_samples_per_second': 220.102,
 'eval_steps_per_second': 1.852}

In [None]:
# Send the test accuracy, loss and F1 to Weights & Biases under test_* names
test_metrics = {
    "test_accuracy": test_results["eval_accuracy"],
    "test_loss": test_results["eval_loss"],
    "test_f1": test_results["eval_f1"],
}
wandb.log(test_metrics)


In [None]:
# Get raw predictions (and label ids) for the tokenized test split
test_output = trainer.predict(test_set_tokenized)

In [None]:
# Multi-label decoding: a label counts as predicted when its logit is >= 0
# (the same rule compute_metrics applies). argmax would keep only a single
# class per example and would not line up with the multi-hot label matrix.
test_preds = (test_output.predictions >= 0).astype(int)
test_labels = np.array(test_output.label_ids)


In [None]:
# Close the Weights & Biases run
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▃▅▆▇████
eval/f1,▁▅▆▇█████
eval/loss,█▅▃▂▂▁▁▁▁
eval/runtime,▃▁▂▆▄█▅▄▄
eval/samples_per_second,▆█▆▃▅▁▄▅▅
eval/steps_per_second,▆█▆▃▅▁▄▅▅
eval_accuracy,▁
eval_f1,▁
eval_loss,▁
test_accuracy,▁

0,1
eval/accuracy,0.8702
eval/f1,0.65205
eval/loss,0.32613
eval/runtime,7.6515
eval/samples_per_second,201.922
eval/steps_per_second,12.677
eval_accuracy,0.8702
eval_f1,0.65205
eval_loss,0.32613
test_accuracy,0.86973


# <Font color = 'indianred'> **8. Model Inference**



In [None]:
# Build an inference pipeline from the best checkpoint (model and tokenizer
# are both loaded from the same directory).
checkpoint = str(model_folder/f'checkpoint-{best_model_checkpoint_step}')
custom_pipeline = pipeline(
    task="text-classification",
    model=checkpoint,
    tokenizer=checkpoint,
    # Use the first GPU when one is available, otherwise fall back to CPU (-1)
    # instead of failing on machines without CUDA.
    # NOTE(review): the pipeline returns only the top-scoring label per text;
    # for multi-label output consider passing top_k=None — confirm desired format.
    device=0 if torch.cuda.is_available() else -1)

<font color = 'indianred'> **Test the pipeline on a single example**

In [None]:
# Run the pipeline on the text of the first test example and display the result
sample = test_set[0]['texts']
preds = custom_pipeline(sample)
preds


[{'label': 'joy', 'score': 0.903987467288971}]

<font color = 'indianred'> **Test the pipeline on a large dataset**

In [None]:
# Run the pipeline over every test text, with truncation enabled
predictions = custom_pipeline(test_set['texts'], truncation=True)

In [None]:
predictions[0:10]


[{'label': 'joy', 'score': 0.903987467288971},
 {'label': 'anger', 'score': 0.9121595621109009},
 {'label': 'joy', 'score': 0.7417725920677185},
 {'label': 'disgust', 'score': 0.785866379737854},
 {'label': 'sadness', 'score': 0.8577322959899902},
 {'label': 'disgust', 'score': 0.822230875492096},
 {'label': 'disgust', 'score': 0.8129914402961731},
 {'label': 'joy', 'score': 0.9575889110565186},
 {'label': 'joy', 'score': 0.4090001881122589},
 {'label': 'optimism', 'score': 0.7771042585372925}]