# Dependencies

In [None]:
# This script is intended to be executed in a Google Colab environment.

# Import the necessary library to mount Google Drive to this Colab instance.
from google.colab import drive

# Mount the Google Drive to access files stored there.
drive.mount('/content/drive')

# Import the sys library to manipulate the Python path.
import sys

# Define the folder name where the file is stored.
foldername = '/WandB/week_3/'

# Append the specific directory to the Python path.
sys.path.append(f'/content/drive/My Drive/{foldername}')

# Change the working directory to the specified folder.
%cd /content/drive/My\ Drive/$foldername

In [None]:
# Install necessary packages
!pip3 install wandb transformers[sentencepiece] datasets evaluate 
!pip install ml_collections

# Import libraries
import wandb, transformers, torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Set device (CPU or GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Enable autoreloading of modules
%load_ext autoreload
%autoreload 2

# Import required functions and classes
from datasets import load_from_disk, load_dataset, Dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

import evaluate
from transformers.trainer_callback import (EarlyStoppingCallback, 
                                           TrainerCallback)
import params
from huggingface_hub import notebook_login

# Import scikit-learn function for splitting data
from sklearn.model_selection import GroupShuffleSplit

# Import os library
import os

from params import default_cfg as cfg

# Log in to the Hugging Face Hub
from huggingface_hub import notebook_login
notebook_login()

# Data Analysis

In [None]:
# Open a W&B run dedicated to analysing the results of the trained model.
run = wandb.init(
    job_type='Analysis_V3',
    project="mlops-course-assgn3",
)

# Declare the trained-model artifact as an input of this run ...
artifact = run.use_artifact(
    'dmeltzer/mlops-course-assgn3/model-mv0ofc09:v0',
    type='model',
)

# ... and fetch its files; `artifact_dir` is whatever location the download returns.
artifact_dir = artifact.download()

In [None]:
# Restore the sequence-classification model from the downloaded artifact directory,
# with the number of output classes taken from the project configuration.
model = AutoModelForSequenceClassification.from_pretrained(artifact_dir, num_labels=cfg.NUM_CLASSES)

# Restore the matching tokenizer from the same directory.
tokenizer = AutoTokenizer.from_pretrained(artifact_dir)

# Publish both objects to the Hugging Face Hub under the configured repository id.
hub_repo_id = cfg.HUB_MODEL_ID
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

In [None]:
# Columns the model consumes; 'rating' is renamed to 'labels' once the extras are dropped.
_MODEL_COLUMNS = ['input_ids', 'attention_mask', 'rating']


def _load_split(artifact_name, data_folder):
    """Fetch the latest version of a dataset artifact and prepare it for the model.

    Args:
        artifact_name: W&B artifact name without the version alias.
        data_folder: Local directory the artifact is downloaded to and loaded from.

    Returns:
        The dataset loaded from ``data_folder`` with every column other than
        ``input_ids``/``attention_mask``/``rating`` removed, ``rating`` renamed to
        ``labels``, and its output format set to torch tensors.
    """
    # Keep the artifact handle in its own name instead of temporarily binding it to the
    # variable that later holds the dataset.
    split_artifact = run.use_artifact(f"{artifact_name}:latest")
    split_artifact.download(root=data_folder)
    split = load_from_disk(data_folder)

    # Derive the columns to drop from this split's own schema: reusing the list computed
    # from another split makes `remove_columns` fail as soon as the schemas differ.
    unused_cols = [col for col in list(split.features) if col not in _MODEL_COLUMNS]
    split = split.remove_columns(unused_cols)

    # The model expects the target column to be called 'labels'.
    split = split.rename_column('rating', 'labels')
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return split


# Download and prepare the three splits (same order as before: test, valid, train).
test_dataset = _load_split(cfg.TEST_DATA_ARTIFACT, cfg.TEST_DATA_FOLDER)
valid_dataset = _load_split(cfg.VALID_DATA_ARTIFACT, cfg.VALID_DATA_FOLDER)
train_dataset = _load_split(cfg.TRAIN_DATA_ARTIFACT, cfg.TRAIN_DATA_FOLDER)

In [None]:
# DataLoader comes from torch.utils.data.
from torch.utils.data import DataLoader

# Collator that pads each batch using the loaded tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Settings shared by every split: batches of 32 examples, padded by the collator above.
loader_options = {'batch_size': 32, 'collate_fn': data_collator}

# One loader per split, all built with the same options.
train_dataloader = DataLoader(train_dataset, **loader_options)
valid_dataloader = DataLoader(valid_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

In [None]:
# Evaluate the model on the training split: accuracy plus the raw predictions/labels
# that later cells use for the confusion matrix.

# Put the model on the selected device and switch it to evaluation mode.
model.to(device)
model.eval()

# Fresh accuracy accumulator for this split.
metric = evaluate.load("accuracy")

# Collect per-batch results and concatenate once after the loop. Seeding the accumulators
# with `torch.tensor([])` (a float32 tensor) made every `torch.cat` promote the integer
# class ids to floats and re-copy the whole growing tensor on each batch.
train_prediction_chunks = []
train_label_chunks = []

for batch in train_dataloader:
    # Move the batch tensors to the same device as the model.
    batch = {k: v.to(device) for k, v in batch.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**batch).logits

    # Predicted class = index of the largest logit.
    predictions = torch.argmax(logits, dim=-1)

    metric.add_batch(predictions=predictions, references=batch['labels'])

    # Keep the results on the CPU so they do not accumulate in device memory.
    train_prediction_chunks.append(predictions.cpu())
    train_label_chunks.append(batch['labels'].cpu())

# Integer tensors holding every prediction / reference label of the split.
train_predictions = torch.cat(train_prediction_chunks)
train_labels = torch.cat(train_label_chunks)

# Accuracy over the whole split, as a dict with an 'accuracy' entry.
train_acc = metric.compute()

# NOTE: this logs under the metric's own generic key; the validation and test cells log
# the same key, and the split-specific keys are logged in a later cell.
wandb.log(train_acc)

# Display the training accuracy.
train_acc

In [None]:
# Evaluate the model on the validation split: accuracy plus the raw predictions/labels
# that later cells use for the confusion matrix and the error analysis.

# Put the model on the selected device and switch it to evaluation mode.
model.to(device)
model.eval()

# Fresh accuracy accumulator for this split.
metric = evaluate.load("accuracy")

# Collect per-batch results and concatenate once after the loop. Seeding the accumulators
# with `torch.tensor([])` (a float32 tensor) made every `torch.cat` promote the integer
# class ids to floats and re-copy the whole growing tensor on each batch.
valid_prediction_chunks = []
valid_label_chunks = []

for batch in valid_dataloader:
    # Move the batch tensors to the same device as the model.
    batch = {k: v.to(device) for k, v in batch.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**batch).logits

    # Predicted class = index of the largest logit.
    predictions = torch.argmax(logits, dim=-1)

    metric.add_batch(predictions=predictions, references=batch['labels'])

    # Keep the results on the CPU so they do not accumulate in device memory.
    valid_prediction_chunks.append(predictions.cpu())
    valid_label_chunks.append(batch['labels'].cpu())

# Integer tensors holding every prediction / reference label of the split.
valid_predictions = torch.cat(valid_prediction_chunks)
valid_labels = torch.cat(valid_label_chunks)

# Accuracy over the whole split, as a dict with an 'accuracy' entry.
validation_acc = metric.compute()

# NOTE: this logs under the metric's own generic key; the training and test cells log
# the same key, and the split-specific keys are logged in a later cell.
wandb.log(validation_acc)

# Display the validation accuracy.
validation_acc

In [None]:
# Evaluate the model on the test split: accuracy plus the raw predictions/labels
# that later cells use for the confusion matrix, the error analysis and the F1 scores.

# Put the model on the selected device and switch it to evaluation mode.
model.to(device)
model.eval()

# Fresh accuracy accumulator for this split.
metric = evaluate.load("accuracy")

# Collect per-batch results and concatenate once after the loop. Seeding the accumulators
# with `torch.tensor([])` (a float32 tensor) made every `torch.cat` promote the integer
# class ids to floats and re-copy the whole growing tensor on each batch.
test_prediction_chunks = []
test_label_chunks = []

for batch in test_dataloader:
    # Move the batch tensors to the same device as the model.
    batch = {k: v.to(device) for k, v in batch.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**batch).logits

    # Predicted class = index of the largest logit.
    predictions = torch.argmax(logits, dim=-1)

    metric.add_batch(predictions=predictions, references=batch['labels'])

    # Keep the results on the CPU so they do not accumulate in device memory.
    test_prediction_chunks.append(predictions.cpu())
    test_label_chunks.append(batch['labels'].cpu())

# Integer tensors holding every prediction / reference label of the split.
test_predictions = torch.cat(test_prediction_chunks)
test_labels = torch.cat(test_label_chunks)

# Accuracy over the whole split, as a dict with an 'accuracy' entry.
test_acc = metric.compute()

# NOTE: this logs under the metric's own generic key; the training and validation cells
# log the same key, and the split-specific keys are logged in a later cell.
wandb.log(test_acc)

# Display the test accuracy.
test_acc

In [None]:
# Record each split's accuracy in W&B under its own key, one log call per split.
split_accuracies = (
    ('train_accuracy', train_acc),
    ('valid_accuracy', validation_acc),
    ('test_accuracy', test_acc),
)

for metric_key, split_result in split_accuracies:
    wandb.log({metric_key: split_result['accuracy']})

In [None]:
# download functions for forming and plotting confusion matrices.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
def _log_confusion_matrix(split, y_true, y_pred):
    """Plot, save and log the confusion matrix of one data split.

    Args:
        split: Short split name ('train', 'valid' or 'test'); used in the figure title,
            the image file name ``conf_mat_<split>.png`` and the W&B key ``<split>_cm``.
        y_true: Tensor of reference labels.
        y_pred: Tensor of predicted labels, aligned with ``y_true``.

    Returns:
        A ``(matrix, display)`` tuple: the confusion matrix array and the plotted
        ``ConfusionMatrixDisplay``.
    """
    matrix = confusion_matrix(y_true.cpu().numpy(), y_pred.cpu().numpy())
    display = ConfusionMatrixDisplay(matrix).plot()

    # Title the figure so the three logged images can be told apart on their own.
    display.ax_.set_title(f'Confusion matrix ({split})')

    # Save the plot as an image file, then upload that file to Weights and Biases.
    image_path = f'conf_mat_{split}.png'
    display.figure_.savefig(image_path, dpi=300)
    wandb.log({f'{split}_cm': wandb.Image(image_path)})
    return matrix, display


# Confusion matrices for the training, validation and test sets.
cm_train, cm_display_train = _log_confusion_matrix('train', train_labels, train_predictions)
cm_valid, cm_display_valid = _log_confusion_matrix('valid', valid_labels, valid_predictions)
cm_test, cm_display_test = _log_confusion_matrix('test', test_labels, test_predictions)

In [None]:
# Indices of validation examples that the model assigned to class 5 although their
# reference label is class 0.
stack = [
    i
    for i, (pred, label) in enumerate(
        zip(valid_predictions.cpu().numpy(), valid_labels.cpu().numpy())
    )
    if pred == 5 and label == 0
]

# Collect one record per selected example and build the DataFrame once at the end,
# instead of growing it row by row.
confused_records = []
for i in stack:
    # Fetch the example a single time and read both fields from it.
    example = valid_dataset[i]
    label = example["labels"]
    text = tokenizer.decode(example["input_ids"], skip_special_tokens=True)

    # Print the label and the decoded review text.
    print(f'label is {label}')
    print(f'Text:\n-----\n{text}')
    print('\n')

    confused_records.append({'review_text': text, 'label': label.item()})

# Explicit columns keep the frame well-formed even when no example matched.
max_confused = pd.DataFrame(confused_records, columns=['review_text', 'label'])

# Log the DataFrame as a Weights and Biases table.
wandb.log({'valid labels are off': wandb.Table(dataframe=max_confused)})

In [None]:
# Indices of test examples that the model assigned to class 5 although their
# reference label is class 0.
stack_test = [
    i
    for i, (pred, label) in enumerate(
        zip(test_predictions.cpu().numpy(), test_labels.cpu().numpy())
    )
    if pred == 5 and label == 0
]

# Collect one record per selected example and build the DataFrame once at the end,
# instead of growing it row by row.
confused_records_test = []
for i in stack_test:
    # Fetch the example a single time and read both fields from it.
    example = test_dataset[i]
    label = example["labels"]
    text = tokenizer.decode(example["input_ids"], skip_special_tokens=True)

    # Print the label and the decoded review text.
    print(f'label is {label}')
    print(f'Text:\n-----\n{text}')
    print('\n')

    confused_records_test.append({'review_text': text, 'label': label.item()})

# Explicit columns keep the frame well-formed even when no example matched.
max_confused_test = pd.DataFrame(confused_records_test, columns=['review_text', 'label'])

# Log the DataFrame as a Weights and Biases table.
wandb.log({'test labels are off': wandb.Table(dataframe=max_confused_test)})

In [None]:
from sklearn.metrics import f1_score

# Class ids the scores are reported for (0..5).
class_ids = np.arange(6)

# Per-class F1 on the test set. `f1_score` takes the reference labels first and the
# predictions second; the previous call passed them the other way round.
f1_scores = f1_score(test_labels.cpu().numpy(),
                     test_predictions.cpu().numpy(),
                     average=None,
                     labels=class_ids)

# Map each class id to its F1 score. The previous version zipped over an undefined
# name (`labels`), which raised a NameError before anything was logged.
f1_scores_with_labels = {int(label): score for label, score in zip(class_ids, f1_scores)}

# Log one F1 value per class to Weights and Biases.
for i in range(6):
    wandb.log({f'f1_score_{i}': f1_scores_with_labels[i]})