# QLoRA & Quantization Fine Tuning Demo Notebook

In [None]:
import os
import pandas as pd
import time
import torch

from datasets import Dataset, DatasetDict, load_dataset
from dotenv import dotenv_values
from huggingface_hub import HfApi, HfFolder
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from pympler import asizeof
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DataCollatorForLanguageModeling, GPTQConfig, TrainingArguments, Trainer, QuantoConfig

# Read the Hugging Face access token from the local .env file and store it
# through HfFolder so later Hub calls can use it.
secrets = dotenv_values(".env")
HUGGINGFACE_TOKEN = secrets.get('HUGGINGFACE_TOKEN')
if not HUGGINGFACE_TOKEN:
    # Fail early with an actionable message instead of a bare KeyError (or
    # silently saving an empty token when the key exists without a value).
    raise KeyError("HUGGINGFACE_TOKEN is missing or empty in .env")
HfFolder.save_token(HUGGINGFACE_TOKEN)
print("saved")

### Load training data

In [None]:
# Upper bound on the number of questions used by this demo.
MAX_ROWS = 1000
# Let the parser stop after MAX_ROWS rows instead of reading the whole CSV
# and slicing the result afterwards.
stackoverflow_df = pd.read_csv('train.csv', nrows=MAX_ROWS)
stackoverflow_df

In [None]:
# Split configuration: SEED fixes the random state of both train_test_split
# calls below, TEST_SIZE is the share of rows held out for testing, and
# VALIDATION_SIZE is the share of the remaining rows used for validation.
SEED = 999
TEST_SIZE = 0.2
VALIDATION_SIZE = 0.25  # This is 0.25 of the 80% after the initial split

# Stack Overflow prompt template.
# Instruction text wrapped in [INST] ... [/INST]; the {title}, {body} and
# {tags} placeholders are filled per row with str.format by the prompt
# builder functions below.
stackoverflow_prompt_template = """[INST]
Consider the following stackoverflow question:

Title: {title}

Body: {body}

Tags: {tags}

Choose between one of these three tags: HQ, LQ_EDIT, and LQ_CLOSE.

HQ: High-quality posts without a single edit.
LQ_EDIT: Low-quality posts with a negative score, and multiple community edits. However, they remain open.
LQ_CLOSE: Low-quality posts that were closed by the community without a single edit.

Only respond with either HQ, LQ_EDIT, or LQ_CLOSE. [/INST]
"""


# Maximum number of characters of the question body kept for the prompt.
MAX_BODY_CHAR_LEN = 1000

# Derived column: the question body cut to its first MAX_BODY_CHAR_LEN
# characters; this is the text substituted for {body} in the template.
stackoverflow_df['Body_short'] = stackoverflow_df['Body'].str.slice(0, MAX_BODY_CHAR_LEN)

def create_mistral_prompt(row):
    """Build the inference prompt for one Stack Overflow question.

    Args:
        row: Mapping-like row providing 'Title', 'Body_short' and 'Tags'.

    Returns:
        The prompt template with title, body and tags filled in. The label
        is not part of this prompt, so the row's 'Y' value is not read.
    """
    # The template has no {correct_label} placeholder, so the label is
    # deliberately not passed to format().
    return stackoverflow_prompt_template.format(
        title=row['Title'],
        body=row['Body_short'],
        tags=row['Tags'],
    )
def create_mistral_training_prompt(row):
    """Build the supervised training text for one Stack Overflow question.

    Args:
        row: Mapping-like row providing 'Title', 'Body_short', 'Tags' and
            the label 'Y'.

    Returns:
        The filled-in prompt template wrapped as ``<s>...</s>``, with the
        label on its own line after the instruction block.
    """
    instruction = stackoverflow_prompt_template.format(
        title=row['Title'],
        body=row['Body_short'],
        tags=row['Tags'],
    )
    return f"<s>{instruction}\n{row['Y']}</s>"

# Render both prompt variants (inference and training) for every question.
stackoverflow_df['mistral_prompt'] = stackoverflow_df.apply(create_mistral_prompt, axis=1)
stackoverflow_df['mistral_training_prompt'] = stackoverflow_df.apply(create_mistral_training_prompt, axis=1)

train_cols = ['mistral_prompt', 'mistral_training_prompt']
y_col = ['Y']

# Step 1: carve off the test portion (TEST_SIZE of all rows).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    stackoverflow_df[train_cols],
    stackoverflow_df[y_col],
    test_size=TEST_SIZE,
    random_state=SEED,
)

# Step 2: divide what is left into training and validation portions.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=VALIDATION_SIZE,
    random_state=SEED,
)

# Sanity check: print the shape of each feature frame, then preview the data.
print(X_train.shape, X_val.shape, X_test.shape)
stackoverflow_df.head(3)

### Prepare quantized model for fine tuning process

In [None]:
# Release cached GPU memory before loading the model (useful when this cell
# is re-run in the same session).
torch.cuda.empty_cache()

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# bitsandbytes configuration: 4-bit NF4 weights, double quantization
# disabled, bfloat16 as the compute dtype.
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=False,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model with the quantization config onto the first GPU.
# NOTE(review): torch_dtype is float16 while bnb_4bit_compute_dtype above is
# bfloat16 — confirm that mixing the two dtypes is intentional.
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    torch_dtype=torch.float16, 
    quantization_config=nf4_config,
    device_map="cuda:0"
)

# Put the model into training mode.
model.train()

# enable gradient checkpointing
model.gradient_checkpointing_enable()

# enable quantized training
# NOTE(review): prepare_model_for_kbit_training may enable gradient
# checkpointing on its own; verify whether the explicit call above is needed.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 16, alpha 32, dropout 0.5, adapters attached
# to the q_proj and v_proj modules, no bias training, causal-LM task.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.5,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

# LoRA trainable version of model
model = get_peft_model(model, lora_config)

# trainable parameter count
model.print_trainable_parameters()

### Tokenize training data

In [None]:
# Token budget handed to the tokenizer as ``max_length``.
MAX_TOKEN_LEN = 512


def split_dataframe(df, train_size=0.6, test_size=0.2, random_state=None):
    """Split ``df`` into train, validation and test frames.

    Args:
        df: The frame to split.
        train_size: Fraction of ``df`` assigned to the training frame.
        test_size: Fraction of ``df`` assigned to the test frame; whatever
            is left after train and test becomes the validation frame.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        A ``(train, validation, test)`` tuple of frames.
    """
    train_part, holdout = train_test_split(
        df,
        train_size=train_size,
        random_state=random_state,
    )

    # ``test_size`` is expressed relative to the full frame, so rescale it to
    # a fraction of the rows that remain after removing the training part.
    test_share_of_holdout = test_size / (1 - train_size)
    val_part, test_part = train_test_split(
        holdout,
        test_size=test_share_of_holdout,
        random_state=random_state,
    )
    return train_part, val_part, test_part


# Three-way split of the full frame with the function's default proportions.
train_df, val_df, test_df = split_dataframe(stackoverflow_df, random_state=42)

# Create datasets: write the first 200 training prompts of each split to CSV.
for csv_path, split_frame in (
    ('train_set.csv', train_df),
    ('val_set.csv', val_df),
    ('test_set.csv', test_df),
):
    split_frame['mistral_training_prompt'].head(200).to_csv(csv_path, index=False)

# Drop any dataset left over from a previous run before loading the
# train/validation CSVs written above.
dataset = None
dataset = load_dataset(
    'csv',
    data_files={'train': "train_set.csv", 'validation': "val_set.csv"},
)

print(f"Size of training dataset: {len(dataset['train'])}")
print(f"Size of validation dataset: {len(dataset['validation'])}")

def generate_and_tokenize_prompt(examples):
    """Tokenize a batch of training prompts to fixed-length sequences.

    Args:
        examples: Batch mapping with a "mistral_training_prompt" entry
            holding the training texts.

    Returns:
        The tokenizer output for the batch, truncated and padded to
        MAX_TOKEN_LEN tokens.
    """
    result = tokenizer(
        examples["mistral_training_prompt"],
        truncation=True,
        max_length=MAX_TOKEN_LEN,
        padding="max_length",
        # The training text already carries literal "<s>" / "</s>" markers,
        # so stop the tokenizer from prepending a second BOS token; this
        # keeps training sequences aligned with the single-BOS prompts
        # produced at inference time.
        add_special_tokens=False,
    )
    return result

# Tokenize every split in batches with generate_and_tokenize_prompt.
tokenized_data = dataset.map(generate_and_tokenize_prompt, batched=True)
# Pad with the EOS token (the same assignment is made when the tokenizer is
# first loaded).
tokenizer.pad_token = tokenizer.eos_token
# Collator for causal language modelling (mlm=False disables masked-LM).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

### Fine Tune The Mistral Model

In [None]:
# Training hyperparameters
BATCH_SIZE = 8
NUM_EPOCHS = 4
LEARNING_RATE = 0.0001
WEIGHT_DECAY = 0.02
WARMUP_STEPS = 2

# Prevent any existing instance from conflicting
training_args = None
trainer = None

# define training arguments: log, evaluate and checkpoint once per epoch,
# and reload the best checkpoint when training finishes
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    warmup_steps=WARMUP_STEPS
)
# configure trainer with the tokenized train/validation splits
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    args=training_args,
    data_collator=data_collator
)

# train model (the cache is switched off for the duration of training)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

# re-enable the cache for inference
model.config.use_cache = True


### Evaluate the performance on the holdout test set

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Prepare the evaluation function
def evaluate_model(model, tokenizer, _X_test, _y_test):
    """Generate a label for every test prompt and score the predictions.

    Args:
        model: Causal language model used for generation.
        tokenizer: Tokenizer matching ``model``.
        _X_test: DataFrame with a 'mistral_prompt' column.
        _y_test: DataFrame with a 'Y' column, indexed like ``_X_test``.

    Returns:
        Dict with 'accuracy' plus macro- and micro-averaged 'precision_*',
        'recall_*' and 'f1_*' scores. Responses containing none of the known
        labels are counted as the prediction "UNKNOWN".
    """
    print(f"evaluating {len(_X_test)} rows of test data")
    model.eval()
    predictions = []
    true_labels = []

    for _, row in _X_test.iterrows():
        # Place the inputs on whatever device the model lives on rather than
        # assuming a fixed GPU index.
        inputs = tokenizer(
            row['mistral_prompt'],
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=MAX_TOKEN_LEN
        ).to(model.device)

        outputs = model.generate(**inputs, max_new_tokens=6)

        # The generated sequence starts with the prompt tokens. Drop them by
        # token count: slicing the decoded text by the prompt's character
        # length is wrong whenever the prompt was truncated to MAX_TOKEN_LEN
        # or decoding does not reproduce the prompt string exactly.
        prompt_token_count = inputs["input_ids"].shape[1]
        text_response = tokenizer.decode(
            outputs[0][prompt_token_count:],
            skip_special_tokens=True,
        ).strip()

        # First known label found in the response wins (same order as before).
        for label in ("HQ", "LQ_EDIT", "LQ_CLOSE"):
            if label in text_response:
                predictions.append(label)
                break
        else:
            # Character window of the prompt used to identify the question
            # in the warning.
            title_50 = row['mistral_prompt'][55:105]
            print(f"WARNING: unknown found for {title_50}")
            predictions.append("UNKNOWN")

        true_labels.append(_y_test.loc[row.name, 'Y'])

    accuracy = accuracy_score(true_labels, predictions)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(true_labels, predictions, average='macro', zero_division=0)
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(true_labels, predictions, average='micro', zero_division=0)

    return {
        'accuracy': accuracy,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_micro': f1_micro
    }

# Evaluate the model on the holdout rows that belong to the same split as the
# training data. `test_df` comes from the split_dataframe(..., random_state=42)
# call that also produced the training and validation sets, whereas
# X_test/y_test were drawn with a different seed (SEED) from the same frame
# and can therefore contain rows the model was trained on.
metrics = evaluate_model(
    model,
    tokenizer,
    test_df[['mistral_prompt']],
    test_df[['Y']]
)

print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"Precision Macro: {metrics['precision_macro']:.4f}")
print(f"Recall Macro: {metrics['recall_macro']:.4f}")
print(f"F1 Score Macro: {metrics['f1_macro']:.4f}")

### Performance Metrics Over Test Set

##### **Accuracy:** the ratio of correctly predicted observations to the total observations, representing the overall effectiveness of a classification model across multiple classes by measuring the proportion of true results (both true positives and true negatives) among the total number of cases examined.

##### **Precision Macro:** the average precision (the ratio of correctly predicted positive observations to all predicted positives) across all classes, ensuring each class is given equal importance regardless of its size or frequency in the data.

##### **Recall Macro:** the average recall (the ratio of correctly predicted positive observations to all actual positives) across all classes, treating each class equally.

##### **F1 Score Macro:** the harmonic mean of precision and recall, computed for each class independently and then averaged so that each class contributes equally to the overall metric.

##### **F1 Score Micro:** aggregates the contributions of all classes to compute the overall precision and recall, and then calculates their harmonic mean, effectively giving equal weight to each individual instance rather than each class.


In [None]:
# Report every headline metric, including micro-averaged F1, to four decimals.
for metric_title, metric_key in (
    ("Accuracy", 'accuracy'),
    ("Precision Macro", 'precision_macro'),
    ("Recall Macro", 'recall_macro'),
    ("F1 Score Macro", 'f1_macro'),
    ("F1 Score Micro", 'f1_micro'),
):
    print(f"{metric_title}: {metrics[metric_key]:.4f}")