<a href="https://colab.research.google.com/github/daywatch/LLM_and_ChatGPT/blob/main/SLM_fine_tuning/part3_bigbird_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import BigBirdTokenizer, BigBirdForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from datasets import Dataset

In [None]:
# Select the compute device: the first CUDA GPU when PyTorch can see one, otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda', index=0)

In [None]:
# Mount Google Drive; the CSV files and saved model used below live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## data preparation

In [None]:
# Load the labeled corpus from Drive and preview the first rows.
labeled_csv_path = "/content/drive/MyDrive/BB2024/labeled_data.csv"
df = pd.read_csv(labeled_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,from (george pavlic) subject re pens playoff ...,3
1,1,from (michael lurie) the liberalizer subject r...,3
2,2,from subject the law of retribution organizat...,1
3,3,from (joseph h buehler) subject the ancient c...,1
4,4,from (peter j menchetti) subject microsoft pr...,2


In [None]:
# Two-stage split with a shared seed: first hold out 20% of all rows as the test set,
# then carve 10% of the remaining rows off as the validation set.
split_seed = 42
train_df, test_df = train_test_split(df, test_size=0.2, random_state=split_seed)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=split_seed)

In [None]:
# Wrap each pandas split in a Hugging Face Dataset (same order: train, validation, test)
train_data, val_data, test_data = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

In [None]:
# Load the pretrained BigBird tokenizer and a sequence-classification model built on the
# same checkpoint, configured with a 5-way classification head.
checkpoint_name = 'google/bigbird-roberta-base'

tokenizer = BigBirdTokenizer.from_pretrained(checkpoint_name)

model = BigBirdForSequenceClassification.from_pretrained(
    checkpoint_name,
    gradient_checkpointing=False,
    num_labels=5,
    return_dict=True,
)
model = model.to(device)  # move the weights onto the selected device

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Batch tokenization callback for Dataset.map
def tokenization(batched_text):
    """Tokenize the 'text' field of a batch with the notebook-level `tokenizer`.

    Args:
        batched_text: mapping whose 'text' entry holds the raw texts of the batch.

    Returns:
        Whatever `tokenizer` returns for those texts, with every sequence
        truncated and padded to exactly 2048 tokens.
    """
    texts = batched_text['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True, max_length=2048)
    return encoded

In [None]:
# Tokenize every split; batch_size equals the split length, so each split is
# processed as one single batch.
train_data = train_data.map(tokenization, batched=True, batch_size=len(train_data))
val_data = val_data.map(tokenization, batched=True, batch_size=len(val_data))
test_data = test_data.map(tokenization, batched=True, batch_size=len(test_data))

# Expose only the columns the Trainer consumes, returned as PyTorch tensors
tensor_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (train_data, val_data, test_data):
    tokenized_split.set_format(type='torch', columns=list(tensor_columns))

Map:   0%|          | 0/3541 [00:00<?, ? examples/s]

Map:   0%|          | 0/394 [00:00<?, ? examples/s]

Map:   0%|          | 0/984 [00:00<?, ? examples/s]

## metrics

In [None]:
# Accuracy-only metric hook passed to the Trainer (used for the validation runs)
def compute_metrics(pred):
    """Return the accuracy of one evaluation pass.

    Args:
        pred: object exposing `label_ids` (reference labels) and `predictions`
            (per-class scores); the predicted class is the argmax over the
            last axis of `predictions`.

    Returns:
        dict: ``{'accuracy': ...}`` holding the result of `accuracy_score`.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

def compute_test_metrics(labels, preds):
    """Summarize test-set quality as accuracy plus weighted precision/recall/F1.

    Args:
        labels: reference class ids.
        preds: predicted class ids, aligned with `labels`.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1'; the last
        three come from `precision_recall_fscore_support(..., average='weighted')`.
    """
    weighted_scores = precision_recall_fscore_support(labels, preds, average='weighted')
    # The fourth element (support) is not reported.
    precision, recall, f1 = weighted_scores[:3]
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


## training / fine-tuning parameters

In [None]:
# Training arguments
training_args = TrainingArguments(
    # Absolute path, consistent with logging_dir below. The previous value
    # 'content/path/to/output' had no leading slash, so checkpoints were written
    # relative to the current working directory instead.
    output_dir='/content/path/to/output',
    num_train_epochs=4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",   # evaluate on the validation split once per epoch
    save_strategy="epoch",   # checkpoint on the same schedule as evaluation
    gradient_accumulation_steps=32,  # 2 samples x 32 accumulation steps per optimizer update
    learning_rate=1e-5,
    fp16=True,  # Enable mixed precision for faster training
    logging_dir='/content/path/to/logs',
    logging_steps=4,
    load_best_model_at_end=True,  # reload the best checkpoint once training finishes
)


# Rewrite every parameter tensor in contiguous memory layout before training starts.
for weight in model.parameters():
    weight.data = weight.data.contiguous()

# Trainer that fine-tunes on the training split and evaluates on the validation split,
# reporting accuracy through compute_metrics.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Re-display the active device. It is built the same way as at the top of the notebook so
# that `device` stays a torch.device instead of being rebound to a plain string.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


'cuda'

In [None]:
# Run fine-tuning; keep the value returned by train() so it is displayed as the
# cell output and remains available afterwards.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy
0,0.8154,0.74642,0.832487
1,0.502,0.489482,0.852792
2,0.3861,0.446299,0.873096
3,0.3613,0.432184,0.870558


TrainOutput(global_step=220, training_loss=0.6342567465522072, metrics={'train_runtime': 2716.4018, 'train_samples_per_second': 5.214, 'train_steps_per_second': 0.081, 'total_flos': 1.4917815892783104e+16, 'train_loss': 0.6342567465522072, 'epoch': 3.9751552795031055})

In [None]:
# Run the fine-tuned model over the held-out test split
test_preds = trainer.predict(test_data)

# Take the reference labels from the prediction output itself rather than re-reading
# them from the torch-formatted dataset: they come back next to the scores, in the
# same row order, so sklearn is not handed a torch tensor.
labels = test_preds.label_ids
preds = test_preds.predictions.argmax(-1)  # predicted class = highest-scoring logit

# Accuracy plus weighted precision / recall / F1 on the test split
test_metrics = compute_test_metrics(labels, preds)
print(test_metrics)

{'accuracy': 0.8932926829268293, 'precision': 0.8876199353523544, 'recall': 0.8932926829268293, 'f1': 0.8859254676520414}


## saving the model and inference

In [None]:
# Save the fine-tuned model to Google Drive (not the ephemeral /content disk), together
# with the tokenizer files, so the directory holds everything needed to reload both
# with from_pretrained().
model_dir = '/content/drive/MyDrive/BB2024'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

In [None]:
# Load the inference texts from Drive
df_infer = pd.read_csv("/content/drive/MyDrive/BB2024/inference_data.csv")

# Convert the frame to a Hugging Face Dataset and tokenize it in one batch,
# reusing the same tokenization callback as for the training splits
infer_dataset = Dataset.from_pandas(df_infer)
infer_data = infer_dataset.map(
    tokenization,
    batched=True,
    batch_size=len(infer_dataset),
)

# Only the model inputs are exposed as tensors; no label column is requested here
infer_data.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Score every row and keep the highest-scoring class per row
predictions = trainer.predict(infer_data)
predicted_classes = np.argmax(predictions.predictions, axis=-1)

# Attach the predicted class ids to the inference frame
df_infer = df_infer.assign(predicted_label=predicted_classes)

Map:   0%|          | 0/142 [00:00<?, ? examples/s]

In [None]:
# Keep only the text and its prediction, exposing the prediction as 'predicted_topic'
topic_column_names = {'predicted_label': 'predicted_topic'}
selected_columns = df_infer[['text', 'predicted_label']]
df_final = selected_columns.rename(columns=topic_column_names)

Unnamed: 0,text,predicted_topic
0,from (rajiev gupta) subject re windows nt faq...,2
1,from (joachim lous) subject re tiff philosoph...,2
2,from (robert weiss) subject 17 apr 93 god's...,1
3,from (dale leonard) subject re wise to remove...,2


In [None]:
# Fold predicted class 0 into class 1 before exporting.
# NOTE(review): the reason for this remap is not visible in the notebook — confirm it is intended.
df_final.loc[df_final['predicted_topic'] == 0, 'predicted_topic'] = 1

# index=False keeps the positional index out of the file, so reloading the CSV does not
# produce a stray "Unnamed: 0" column like the ones present in labeled_data.csv.
df_final.to_csv("/content/drive/MyDrive/BB2024/test_data_with_predictions.csv", index=False)