# Overview
In this homework, we will explore finetuning two separate models:
1. DistilBERT for a sentiment classification task.
2. The recent OpenLlama-2-3b model to turn it into a chatbot.


In [None]:
# Install the dependencies
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git

In [None]:
!pip install -q -U datasets bitsandbytes einops

In [None]:
!pip install -U fsspec==2023.9.2

# DistilBERT for Sentiment Classification


**Deliverables:**

Explore 2 different ways of tuning the model (number of epochs, learning rate, weight decay, etc.) in order to improve the classification performance. Detail the methodology that you followed to improve the performance of the model. A reasonable discussion of the approaches that you have taken is expected (points will be deducted for randomly changing the hyperparameters of the model).

You will have to include in your report your accuracy, precision, recall and f1 scores. Also, you have to include the image of your confusion matrix in heatmap form.

In [None]:
import torch
torch.cuda.is_available()

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification

from sklearn.model_selection import train_test_split
import pandas as pd

from datasets import load_dataset, Dataset, DatasetDict

## DataSet

In [None]:
# Load the cleaned IMDB dataset from a CSV path relative to the working directory.
imdb_df = pd.read_csv("data/IMDB_dataset_clean.csv")

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the split reproducible.
X_train, X_test = train_test_split(imdb_df, test_size=0.2, random_state=42)

In [None]:
# Wrap both pandas splits as datasets (DataFrame index not preserved) under
# the "train" and "test" keys.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (("train", X_train), ("test", X_test))
})

In [None]:
# Tokenizer matching the "distilbert-base-uncased" checkpoint loaded below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of a batch with truncation enabled.

    Args:
        examples: Mapping with a ``'text'`` entry, as passed in by
            ``Dataset.map``.

    Returns:
        The tokenizer's output for ``examples['text']``.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize the training and test sets (in that order) with batched mapping
train_tokenized, test_tokenized = (
    dataset[split].map(tokenize_function, batched=True)
    for split in ("train", "test")
)

In [None]:
train_tokenized, test_tokenized

In [None]:
# Collator built from the tokenizer; handed to the Trainer below to pad each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Loading the model

In [None]:
# Load the pretrained DistilBERT checkpoint configured with two output labels.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

def compute_metrics(pred):
    """Compute binary-classification metrics from a Trainer prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores, one column per class).

    Returns:
        dict with the true labels, predicted labels, accuracy, precision,
        recall, F1 score and the 2x2 confusion matrix (rows = true class,
        columns = predicted class, in label order ``[0, 1]``).
    """
    labels = pred.label_ids

    scores = pred.predictions
    # Some models return a tuple whose first element holds the class scores.
    if isinstance(scores, tuple):
        scores = scores[0]

    # The scores are raw logits, not probabilities, so comparing the class-1
    # column against 0.5 is meaningless. The predicted class is the index of
    # the largest logit in each row.
    preds = scores.argmax(axis=-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    cm = confusion_matrix(labels, preds, labels=[0, 1])
    acc = accuracy_score(labels, preds)

    # The non-scalar entries (y_true, y_pred, confusion_matrix) are kept so the
    # caller can build reports and plots from the returned dictionary.
    return {
        "y_true": labels,
        "y_pred": preds,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1-score": f1,
        "confusion_matrix": cm
    }

In [None]:
# Training configuration for the DistilBERT classifier: one epoch at a learning
# rate of 2e-5, batch size 16 per device for both training and evaluation, and
# checkpoints written under "test_model" with the "epoch" save strategy.
# NOTE(review): no evaluation strategy is passed here — confirm that metrics are
# only expected from the explicit trainer.evaluate() call.
training_args = TrainingArguments(
    output_dir="test_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=False,
    push_to_hub=False,

)

# Trainer wiring: the tokenized train split is used for training, the tokenized
# test split for evaluation, and compute_metrics produces the reported metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning of the classifier; the value returned by train() is kept in train_result.
train_result = trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the tokenized test split).
eval_results = trainer.evaluate()

In [None]:
# Your evaluation code here

# Finetune OpenLlama-2-3b
The following shows how to fine-tune the recent OpenLlama-2-3b model on a single Google Colab instance and turn it into a chatbot.

We will leverage the PEFT library from the Hugging Face ecosystem, as well as QLoRA for more memory-efficient finetuning.

**Deliverables**
1. Experiment with 3 different settings for LoRA and create a line plot with the `r` hyperparameter on the x-axis. Include a discussion of the effects of changing the hyperparameter.

2. Write code to add one example to the dataset.

# Dataset

In [None]:
from datasets import load_dataset

# Load the "train" split of the dataset identified by dataset_name.
dataset_name = 'gberseth/IFT6758-comments'
dataset = load_dataset(dataset_name, split="train")

In [None]:
dataset

In [None]:
# Add a 'text' field formed by concatenating each example's 'input' and 'output'
# values directly, with no separator inserted between them.
# NOTE(review): confirm the dataset's fields already carry any delimiter the
# prompt format needs.
dataset = dataset.map(lambda example: {'text': example['input'] + example['output']})

In [None]:
dataset

# Loading the Model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

# Checkpoint identifier used for both the model and the tokenizer loaded below.
model_name = "openlm-research/open_llama_3b_v2"

In [None]:
# Quantization settings for loading the model: 4-bit weights using the "nf4"
# quantization type, with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Load the causal language model with the quantization config defined above.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run — confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
# Turn off the generation cache on the model config.
# NOTE(review): presumably because the cache is not useful during training — confirm.
model.config.use_cache = False

In [None]:
# Load the tokenizer for the same checkpoint and reuse its EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyperparameters passed to LoraConfig below; lora_r is the "r" value the
# deliverable asks to vary.
lora_alpha = 8
lora_dropout = 0.1
lora_r = 8

In [None]:
# LoRA adapter configuration for causal language modelling, with bias set to "none".
# NOTE(review): no target_modules is given — confirm which layers the library
# adapts by default for this architecture.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
from transformers import TrainingArguments

In [None]:
# Hyperparameters forwarded to TrainingArguments in the next cell.
output_dir = "./results"
per_device_train_batch_size = 1
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
# NOTE(review): if saving is step-based, save_steps = 1 writes a checkpoint on
# every step — confirm this is intended given disk usage.
save_steps = 1
num_train_epochs = 4
logging_steps = 1
learning_rate = 2e-4
max_grad_norm = 0.3
# NOTE(review): both max_steps and num_train_epochs are set; per the Hugging
# Face TrainingArguments documentation a positive max_steps takes precedence —
# confirm which limit is intended.
max_steps = 200
warmup_ratio = 0.03
lr_scheduler_type = "linear"

In [None]:
# Assemble the fine-tuning configuration from the hyperparameters defined above.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,  # matches the float16 compute dtype chosen in bnb_config
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    report_to="none",  # no external experiment-tracking integration
)

In [None]:
from trl import SFTTrainer

In [None]:
# Maximum sequence length passed to SFTTrainer below.
max_seq_length = 512

In [None]:
# Supervised fine-tuning trainer: trains on the dataset's "text" field with the
# LoRA configuration from peft_config and the training arguments defined above.
# NOTE(review): which of these keyword arguments SFTTrainer accepts depends on
# the installed trl version — confirm against the version in use.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [None]:
# Cast every submodule whose qualified name contains "norm" to float32.
# Module.to() converts the module in place, so the result is not rebound.
for name, module in trainer.model.named_modules():
    if "norm" not in name:
        continue
    module.to(torch.float32)

In [None]:
# Run the LoRA fine-tuning; the value returned by train() is kept in train_result.
train_result = trainer.train()

In [None]:
# Use the wrapped ``.module`` when the trainer's model exposes one, otherwise
# the model itself, and save it to the "outputs" directory.
model_to_save = getattr(trainer.model, "module", trainer.model)
model_to_save.save_pretrained("outputs")

In [None]:
# Reload the fine-tuned adapter saved in "outputs".
# get_peft_model() only builds a new adapter from a config with freshly
# initialised LoRA weights; it never reads the saved weights. Use
# PeftModel.from_pretrained() so the trained adapter weights are loaded.
from peft import PeftModel

# Kept so the saved adapter configuration remains available for inspection.
lora_config = LoraConfig.from_pretrained('outputs')
model = PeftModel.from_pretrained(model, 'outputs')

In [None]:
# Example of text generation: use the dataset's sixth 'text' entry as the prompt
device = "cuda:0"
text = dataset['text'][5]

# Tokenize to PyTorch tensors, then move them to the target device
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Generate up to 50 new tokens and print the decoded first sequence
outputs = model.generate(max_new_tokens=50, **inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))