## Packages & Libraries

In [1]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
# %pip install -U transformers accelerate
%pip install -U peft
%pip install -U trl
%pip install GPUtil
# %pip install evaluate

In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, 
                             recall_score,
                             precision_score,
                             f1_score,
                             classification_report, 
                             confusion_matrix)

import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import DatasetDict, Dataset, load_dataset
from peft import LoraConfig, PeftConfig, get_peft_model, TaskType
from trl import setup_chat_format, SFTTrainer
import transformers
from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM,
                          AutoModelForSequenceClassification,
                          BitsAndBytesConfig,
                          Trainer,
                          TrainingArguments, 
                          pipeline, 
                          logging,
                          DataCollatorWithPadding)

from huggingface_hub import login as hf_login
from kaggle_secrets import UserSecretsClient
import wandb

## Functions & Configurations

In [3]:
data_loc = "/kaggle/input/"
model_path = "meta-llama/Llama-3.2-3B-Instruct"
output_dir="llama-3.2-fine-tuned-model"

# Tokens
user_secrets = UserSecretsClient()
access_token = user_secrets.get_secret("hf_read_token_access")
wb_access_token = user_secrets.get_secret("wanda_token")
hf_login(access_token)

wandb.login(key=wb_access_token)
run = wandb.init(
    project='Fine-tune Llama-3.2-3B-Instruct for Short Answer Grading', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgabriel-ferreira[0m ([33mgabriel-ferreira-iowa-state-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
# Define the prompt generation functions
def generate_prompt(data_point):
    return f"""
            You are a professor and need to grade the student response as Correct, Incorrect, or Partially Correct. Then return your decision with the corresponding grade label.
Question: {data_point['Question']}
Response: {data_point['Response']}
Correct Answer: {data_point['CorrectAnswer']}
label: {data_point['label_text']}
            """.strip()

def generate_test_prompt(data_point):
    return f"""
            You are a professor and need to grade the student response as Correct, Incorrect, or Partially Correct. Then return your decision with the corresponding grade label.
Question: {data_point['Question']}
Response: {data_point['Response']}
Correct Answer: {data_point['CorrectAnswer']}
label:
            """.strip()

# Define text preprocessing
def preprocess_function(example):
    tokens = tokenizer(example['text'], truncation=True, padding='max_length', max_length=256)
    return tokens
    
def predict(test, model, tokenizer):
    y_pred = []
    categories = [ "Correct", "Partially Correct", "Incorrect"]
    
    
    
    pipe = pipeline(task="text-generation", 
                    model=model, 
                    tokenizer=tokenizer, 
                    max_new_tokens=2, 
                    temperature=0.1)

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["input_text"]
        result = pipe(prompt)
        # print(result[0]['generated_text'])
        answer = result[0]['generated_text'].split("label:")[-1].strip()
        # print(answer)
        # Determine the predicted category
        for category in categories:
            if answer.lower() in category.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")
    
    return y_pred

def evaluate(y_true, y_pred):
    labels = [ "Correct", "Partially Correct", "Incorrect"]
    mapping = {label: idx for idx, label in enumerate(labels)}
    
    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data
    
    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

## Preparing Dataset

In [5]:
# Load data
df_train = pd.read_csv(os.path.join(data_loc, "train.csv"), encoding='windows-1252')
df_test = pd.read_csv(os.path.join(data_loc, "test.csv"), encoding='windows-1252')

# Label Mapping
label_map = {-1: "Incorrect", 0: "Partially Correct", 1: "Correct"}

# Replace numeric labels with string labels
df_train['label_text'] = df_train['label'].map(label_map)
# df_train = df_train.iloc[0:1000]
df_test['label_text'] = df_test['label'].map(label_map)

# Prepare Input Text
df_train['input_text'] = df_train.apply(lambda x: generate_prompt(x), axis=1).tolist()
df_test['input_text'] = df_test.apply(lambda x: generate_test_prompt(x), axis=1).tolist()

# Define predictor and target features
X = df_train['input_text']
y = df_train['label_text']

# Split train and validation
X_train, X_val = train_test_split(df_train, test_size=.2, random_state=42)

# Set test
X_test = pd.DataFrame(df_test['input_text'], columns=["input_text"])
y_test = df_test['label_text']

df_train.head()

Unnamed: 0,Experiment,Topic,ID,Question,Response,CorrectAnswer,label,label_text,input_text
0,1,Physics,104,How thin can a fiber optic be?,a strand of hair,As thin as a human hair,1,Correct,You are a professor and need to grade the stud...
1,1,Physics,126,How thin can a fiber optic be?,Really thin and small,As thin as a human hair,-1,Incorrect,You are a professor and need to grade the stud...
2,1,Physics,130,How thin can a fiber optic be?,as thin as a human hair,As thin as a human hair,1,Correct,You are a professor and need to grade the stud...
3,1,Physics,131,How thin can a fiber optic be?,Very thin smaller than a pice of hair,As thin as a human hair,1,Correct,You are a professor and need to grade the stud...
4,1,Physics,156,How thin can a fiber optic be?,Less than the width of a human hair,As thin as a human hair,1,Correct,You are a professor and need to grade the stud...


In [6]:
# Label distribution
X_train.label_text.value_counts()

label_text
Correct              951
Incorrect            800
Partially Correct     49
Name: count, dtype: int64

In [7]:
# Training Data
train_data = {"text": X_train['input_text']}
train_dataset = Dataset.from_dict(train_data)

# Validation Data
val_data = {"text": X_val['input_text']}
val_dataset = Dataset.from_dict(val_data)

# # Test Data
# test_data = {"text": X_test[['input_text']], "labels": y_test.astype(str).tolist()}
# test_dataset = Dataset.from_dict(test_data)

# Dataset Dictionary 
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    # "test": test_dataset
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1800
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 450
    })
})

In [8]:
print(dataset_dict['train']['text'][0])

You are a professor and need to grade the student response as Correct, Incorrect, or Partially Correct. Then return your decision with the corresponding grade label.
Question: What is the name for light of a very precise wavelength and color?
Response: laser
Correct Answer: Monochromatic
label: Incorrect


## Model Demonstration - Generation

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForCausalLM.from_pretrained(
        model_path,
        token=access_token,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        
)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    token=access_token,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [10]:
n=2
messages = [{"role": "user", "content": f"""
You are a professor and need to grade the student response as Correct, Incorrect, or Partially Correct. Then return your decision with the corresponding grade label.
Question: {df_train['Question'][n]}
Response: {df_train['Response'][n]}
Correct Answer: {df_train['CorrectAnswer'][n]}
label: {df_train['label_text'][n]}
            """}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=120, do_sample=True)
print(outputs[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 12 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a professor and need to grade the student response as Correct, Incorrect, or Partially Correct. Then return your decision with the corresponding grade label.
Question: How thin can a fiber optic be?
Response: as thin as a human hair
Correct Answer: As thin as a human hair
label: Correct<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Correct


## Model Fine-Tunning

In [11]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_path)

tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Quick Inference Test

In [12]:
y_pred = predict(X_test[0:1], model, tokenizer)
y_pred

Device set to use cuda:0
100%|██████████| 1/1 [00:00<00:00,  5.46it/s]


['Correct']

In [13]:
y_test[0:1]

0    Correct
Name: label_text, dtype: object

In [14]:
evaluate(y_test[0:1], y_pred)

Accuracy: 1.000
Accuracy for label Correct: 1.000

Classification Report:
                   precision    recall  f1-score   support

          Correct       1.00      1.00      1.00         1
Partially Correct       0.00      0.00      0.00         0
        Incorrect       0.00      0.00      0.00         0

        micro avg       1.00      1.00      1.00         1
        macro avg       0.33      0.33      0.33         1
     weighted avg       1.00      1.00      1.00         1


Confusion Matrix:
[[1 0 0]
 [0 0 0]
 [0 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Training Config Args

In [15]:
# Preprocess all datasets
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

print(tokenized_data['train']['text'][0])

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

You are a professor and need to grade the student response as Correct, Incorrect, or Partially Correct. Then return your decision with the corresponding grade label.
Question: What is the name for light of a very precise wavelength and color?
Response: laser
Correct Answer: Monochromatic
label: Incorrect


In [16]:
modules = find_all_linear_names(model)
modules

['v_proj', 'q_proj', 'gate_proj', 'k_proj', 'up_proj', 'o_proj', 'down_proj']

In [17]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=5,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    logging_steps=1,                         
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="wandb",                  # report metrics to w&b
    eval_strategy="steps",              # save checkpoint every epoch
    eval_steps = 0.2
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    peft_config=peft_config,
    # dataset_text_field="text",
    # tokenizer=tokenizer,
    # max_seq_length=512,
    # packing=False,
    # dataset_kwargs={
    # "add_special_tokens": False,
    # "append_concat_token": False,
    # }
)

Truncating train dataset:   0%|          | 0/1800 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/450 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


### Fine-Tune

In [18]:
trainer.train()



Step,Training Loss,Validation Loss
225,0.0494,0.066209
450,0.049,0.065376
675,0.0525,0.066548
900,0.0298,0.070733
1125,0.0307,0.075519


TrainOutput(global_step=1125, training_loss=0.07372449576192432, metrics={'train_runtime': 10962.8084, 'train_samples_per_second': 0.821, 'train_steps_per_second': 0.103, 'total_flos': 4.0310822928384e+16, 'train_loss': 0.07372449576192432})

## Model Evaluation

In [19]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_test, y_pred)

Device set to use cuda:0
  0%|          | 10/30466 [00:02<2:06:30,  4.01it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 30466/30466 [2:05:52<00:00,  4.03it/s]  

Accuracy: 0.948
Accuracy for label Correct: 0.940
Accuracy for label Partially Correct: 0.353
Accuracy for label Incorrect: 0.965

Classification Report:
                   precision    recall  f1-score   support

          Correct       0.97      0.94      0.95     13532
Partially Correct       0.16      0.35      0.22       320
        Incorrect       0.97      0.97      0.97     16614

         accuracy                           0.95     30466
        macro avg       0.70      0.75      0.71     30466
     weighted avg       0.96      0.95      0.95     30466


Confusion Matrix:
[[12719   340   473]
 [   99   113   108]
 [  317   262 16035]]





In [23]:
wandb.finish(exit_code=0)
model.config.use_cache = True

0,1
eval/loss,▂▁▂▅█
eval/mean_token_accuracy,▁█▆▅▃
eval/num_tokens,▁▃▅▆█
eval/runtime,█▁▂▄▅
eval/samples_per_second,▁██▁▁
eval/steps_per_second,▁▁▁▁▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇███████
train/global_step,▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇███
train/grad_norm,█▄▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▁▂▂▁
train/learning_rate,▂▄▄███████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁

0,1
eval/loss,0.07552
eval/mean_token_accuracy,0.98407
eval/num_tokens,2304000.0
eval/runtime,128.2001
eval/samples_per_second,3.51
eval/steps_per_second,0.445
total_flos,4.0310822928384e+16
train/epoch,5.0
train/global_step,1125.0
train/grad_norm,0.04046


In [21]:
# # Save trained model and tokenizer
# trainer.save_model(output_dir)
# tokenizer.save_pretrained(output_dir)

In [24]:
import shutil
from IPython.display import FileLink, display

checkpoint = 'checkpoint-1125'

# Compress the checkpoint folder into a zip file.
shutil.make_archive(f'/kaggle/working/{output_dir}/{checkpoint}', 'zip', f'/kaggle/working/{output_dir}/{checkpoint}')

FileLink(f"{output_dir}/{checkpoint}.zip")