## Install Libraries and Import Packages


In [1]:
#Install packages

!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes
!pip install -q -U datasets
!pip install -q -U peft
!pip install -q -U wandb
!pip install -q -U trl
!pip install -q -U huggingface_hub
!pip install -q -U rouge_score
!pip install bert-score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.7/19.7 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
#Import packages

import matplotlib.pyplot as plt
import wandb, os
import json
from datetime import datetime
from collections import defaultdict

#Model metrics
from rouge_score import rouge_scorer
from bert_score import BERTScorer

In [3]:
#Transformers & Huggingface
import torch
import transformers
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    PeftModel,
    get_peft_model
)
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    pipeline
)

## Log in to Huggingface & WandB & Mount GDrive

In [4]:
#Log in to huggingface

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#Log in to weights and biases

wandb.login()

wandb_project = "mistralinstruct_climate_finetune"
if len(wandb_project)>0:
  os.environ['WANDB_PROJECT'] = wandb_project

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
#Mount google drive for saving model
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Fine-Tune Mistral Instruct

In [None]:
#Load Pira 2.0 dataset
mcqa = load_dataset("paulopirozelli/pira", "mcqa", trust_remote_code=True)

#Define full train and validation datasets
train_dataset = mcqa['train']
val_dataset = mcqa['validation']

#Define subset of train and validation datasets for testing model params
subset_train_dataset = load_dataset("paulopirozelli/pira", "mcqa", trust_remote_code=True, split='train[0:300]')
subset_val_dataset = load_dataset("paulopirozelli/pira", "mcqa", trust_remote_code=True, split='validation[0:100]')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/256k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1798 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/225 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/227 [00:00<?, ? examples/s]

In [None]:
#Reformat input into compatible format for model training

def reformat_input(data):
  context = data['text']
  question = data['question']
  answer = data['correct']

  # Constructing the formatted output
  formatted_input = f'''[INST]
                        ### Instruction: Answer the question based on the provided context.
                        ### Context: {context}
                        ### Question: {question}
                        [/INST]
                        ### Answer: {answer}'''

  return formatted_input

In [None]:
#Define model quantization params

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

device = 'cuda'

In [None]:
#Load Mistral 7B Instruct baseline model

model_name = "mistralai/Mistral-7B-Instruct-v0.3"

model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
        use_cache=False,
        trust_remote_code=True,
    )

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
#Load tokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="right",
    add_eos_token=True,                   #Add end of sentence token
    add_bos_token=True,                   #Add beginning of sentence token
)
tokenizer.pad_token = tokenizer.unk_token #Set pad token to unknown

tokenizer_config.json:   0%|          | 0.00/138k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
#Reformat and tokenize the input

def generate_and_tokenize_prompt(prompt):
    reformat_prompt = reformat_input(prompt)
    result = tokenizer(
        reformat_prompt,
        truncation = True,
        max_length=1024,
        padding="max_length"
        )
    result['labels'] = result['input_ids'].copy()
    return result

In [None]:
#Reformat and tokenize input datasets
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt, remove_columns=['id', 'text', 'question', 'A', 'B', 'C', 'D', 'E', 'alternative', 'correct'])
tokenized_val_dataset = val_dataset.map(generate_and_tokenize_prompt, remove_columns=['id', 'text', 'question', 'A', 'B', 'C', 'D', 'E', 'alternative', 'correct'])

tokenized_subset_train_dataset = subset_train_dataset.map(generate_and_tokenize_prompt, remove_columns=['id', 'text', 'question', 'A', 'B', 'C', 'D', 'E', 'alternative', 'correct'])
tokenized_subset_val_dataset = subset_val_dataset.map(generate_and_tokenize_prompt, remove_columns=['id', 'text', 'question', 'A', 'B', 'C', 'D', 'E', 'alternative', 'correct'])

Map:   0%|          | 0/1798 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
#Define LORA Config
peft_config = LoraConfig(
    lora_alpha=32,      #Set to 2*r
    lora_dropout=0.1,
    r=16,               #Lower to reduce overfitting & computational complexity, 32 is standard per research
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [None]:
#Evaluate trainable params of LORA model

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

# get frozen vs trainable model param statistics
print_trainable_parameters(model)

trainable params: 42532864 || all params: 3800895488 || trainable%: 1.1190221918566996


In [None]:
#Define project name

project = "mistralinstruct_climate_finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = F"/content/gdrive/MyDrive/{run_name}"

### Test Run with Subset of Test Data

In [None]:
args = TrainingArguments(
  output_dir = output_dir,
  num_train_epochs=5,
  # max_steps = 40,
  per_device_train_batch_size = 8,
  gradient_accumulation_steps = 2,
  gradient_checkpointing = True,
  optim = "paged_adamw_32bit",
  logging_steps=25,
  save_strategy="epoch",
  learning_rate=2e-4,
  bf16=True,
  tf32=True,
  max_grad_norm=0.3,
  warmup_ratio = 0.03,
  lr_scheduler_type='constant',
  disable_tqdm=False,
  # eval_strategy="epoch",
  eval_strategy="steps",
  eval_steps=10, #Change to 100 on full dataset
  report_to="wandb",                                                          # Report metrics to Weights and Biases
  run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
)

In [None]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_subset_train_dataset,
    eval_dataset=tokenized_subset_val_dataset,
    args=args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [None]:
import time
start = time.time()
trainer.train()
print(time.time()- start)

[34m[1mwandb[0m: Currently logged in as: [33mfayetitchenal[0m ([33mfayetitchenal-university-of-california-berkeley[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
10,No log,1.801013
20,No log,1.685887
30,1.711000,1.63419
40,1.711000,1.558366
50,1.032000,1.53181
60,1.032000,1.538361
70,1.032000,1.602186




KeyboardInterrupt: 

### Actual Run w/ Full Test Set

In [None]:
args = TrainingArguments(
  output_dir = output_dir,
  num_train_epochs=3,               #Experiments show overfitting after 3 epochs, set to 3
  per_device_train_batch_size = 8,
  gradient_accumulation_steps = 2,
  gradient_checkpointing = True,
  optim = "paged_adamw_32bit",
  logging_steps=25,
  save_strategy="epoch",
  learning_rate=2e-4,
  bf16=True,
  tf32=True,
  max_grad_norm=0.3,
  warmup_ratio = 0.03,
  lr_scheduler_type='constant',
  disable_tqdm=False,
  eval_strategy="steps",
  eval_steps=50,                   #Evaluate on val data every 50 steps
  report_to="wandb",                                                          # Report metrics to Weights and Biases
  run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
)

In [None]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [None]:
import time
start = time.time()
trainer.train()
print(time.time()- start)

[34m[1mwandb[0m: Currently logged in as: [33mfayetitchenal[0m ([33mfayetitchenal-university-of-california-berkeley[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
50,1.5446,1.401947
100,1.1129,0.976945
150,0.6082,0.735918
200,0.5066,0.58341
250,0.2138,0.527286
300,0.234,0.468893




2923.7803285121918


In [None]:
#Push fine-tuned model to huggingface hub
trainer.push_to_hub()



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/fayetitchenal/mistral-mistralinstruct_climate_finetune/commit/7468208ad04e2ef298aa6727375413e2672eba09', commit_message='End of training', commit_description='', oid='7468208ad04e2ef298aa6727375413e2672eba09', pr_url=None, pr_revision=None, pr_num=None)

## Evaluate fine-tuned model

In [6]:
#Reload Mistral 7B Instruct baseline model

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

device = 'cuda'

model_name = "mistralai/Mistral-7B-Instruct-v0.3"

model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
        use_cache=False,
        trust_remote_code=True,
    )

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [7]:
#Reload tokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="right",
    add_eos_token=True,                   #Add end of sentence token
    add_bos_token=True,                   #Add beginning of sentence token
)
tokenizer.pad_token = tokenizer.unk_token #Set pad token to unknown

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [8]:
#Load fine-tuned model with base model

peft_ft_model = "fayetitchenal/mistral-mistralinstruct_climate_finetune"
ft_model = PeftModel.from_pretrained(model, peft_ft_model)

adapter_config.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

In [9]:
#Define function to reformat test data input

def reformat_input(data):
  context = data['text']
  question = data['question']

  # Constructing the formatted output
  formatted_input = f'''[INST]
                        ### Instruction: Answer the question based on the provided context.
                        ### Context: {context}
                        ### Question: {question}
                        [/INST]
                        ### Answer: '''

  return formatted_input

In [10]:
#Relooad Pira 2.0 dataset
mcqa = load_dataset("paulopirozelli/pira", "mcqa", trust_remote_code=True)

#Define test dataset
test_dataset = mcqa['test']
test_labels = test_dataset['correct']

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/256k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1798 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/225 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/227 [00:00<?, ? examples/s]

In [11]:
#Run inference on fine-tuned model

import time
start = time.time()

output=[]

for i in range(len(test_dataset)):
  eval_prompt = reformat_input(test_dataset[i])
  model_input = tokenizer(eval_prompt,
                          return_tensors="pt").to("cuda")


  ft_model.eval()
  with torch.no_grad():
      model_output = tokenizer.decode(ft_model.generate(**model_input,
                                                        max_new_tokens=40,
                                                        num_beams = 5,
                                                        early_stopping=True,
                                                        pad_token_id=0)[0], skip_special_tokens=True)
      output.append(model_output)

print(time.time()- start)


451.5981068611145


In [None]:
#Save model output as JSON file

def save_model_output_to_json(output_data, file_path):
    json_output = json.dumps(output_data, indent=4)

    # Save JSON output to a file
    with open(file_path, 'w') as file:
        file.write(json_output)

    print(f"JSON output has been saved to {file_path}")

In [None]:
save_model_output_to_json(output, "/content/gdrive/MyDrive/ft_mistralinstruct_output.json")

JSON output has been saved to /content/gdrive/MyDrive/ft_mistralinstruct_output.json


In [None]:
#Calculate ROUGE scores

# Function to calculate ROUGE scores for the entire dataset
def calculate_rouge_scores(hypothesis_list, reference_list):
    # Initialize dictionaries to store cumulative scores
    total_scores = defaultdict(lambda: defaultdict(float))
    all_f1_scores = defaultdict(list)
    count = len(hypothesis_list)

    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Iterate over each pair of hypothesis and reference
    for hypothesis, reference in zip(hypothesis_list, reference_list):
        score = scorer.score(reference, hypothesis)
        # Accumulate scores
        for metric, result in score.items():
            total_scores[metric]['precision'] += result.precision
            total_scores[metric]['recall'] += result.recall
            total_scores[metric]['fmeasure'] += result.fmeasure
            all_f1_scores[metric].append(result.fmeasure)

    # Calculate average scores
    avg_scores = {}
    for metric, results in total_scores.items():
        avg_scores[metric] = {key: value / count for key, value in results.items()}

    return avg_scores, all_f1_scores

In [None]:
#Calculate BERTscore

def compute_bertscore(list1, list2, model_type='bert-base-uncased'):
    # Ensure both lists have the same length
    if len(list1) != len(list2):
        raise ValueError("Both lists must have the same length")

    # Initialize the BERTScorer
    scorer = BERTScorer(model_type=model_type)

    # Lists to store the scores
    P_scores, R_scores, F1_scores = [], [], []

    # Iterate over the lists and compute the scores
    for output, label in zip(list1, list2):
        P, R, F1 = scorer.score([output], [label])
        P_scores.append(P.mean().item())
        R_scores.append(R.mean().item())
        F1_scores.append(F1.mean().item())

    # Calculate the mean scores
    mean_P = sum(P_scores) / len(P_scores)
    mean_R = sum(R_scores) / len(R_scores)
    mean_F1 = sum(F1_scores) / len(F1_scores)

    return mean_P, mean_R, mean_F1, F1_scores

In [None]:
#Extract answer from model output

output2 = []

for i in range(len(output)):

  #Given string
  input_string = output[i]

  # Split the string at "ANSWER:"
  split_string = input_string.split("Answer:", 1)

  # Check if the split was successful and get the part after "ANSWER:"
  if len(split_string) > 1:
      answer_part = split_string[1].strip()
      answer_part = answer_part.replace("\n", " ").strip()
      output2.append(answer_part)
  else:
      print("The delimiter 'Answer:' was not found in the string.")

In [None]:
# Get average ROUGE scores for the entire dataset
avg_rouge_scores, all_rouge_scores = calculate_rouge_scores(output2, test_labels)

# Print the average ROUGE scores
print("Average ROUGE scores for the entire dataset:")
for metric, results in avg_rouge_scores.items():
    print(f"{metric}: Precision: {results['precision']:.4f}, Recall: {results['recall']:.4f}, F1: {results['fmeasure']:.4f}")

Average ROUGE scores for the entire dataset:
rouge1: Precision: 0.6044, Recall: 0.5948, F1: 0.5521
rouge2: Precision: 0.4374, Recall: 0.4266, F1: 0.3972
rougeL: Precision: 0.5710, Recall: 0.5607, F1: 0.5209


In [None]:
#Compute BERTscore

mean_P, mean_R, mean_F1, F1_scores = compute_bertscore(output2, test_labels)
print(f"BERTScore Precision: {mean_P:.4f}, Recall: {mean_R:.4f}, F1: {mean_F1:.4f}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTScore Precision: 0.7086, Recall: 0.7151, F1: 0.7051


In [None]:
scores = dict(all_rouge_scores)
scores['f1_bertscores'] = F1_scores
save_model_output_to_json(scores, '/content/gdrive/MyDrive/mistral_finetune_1_scores')

JSON output has been saved to /content/gdrive/MyDrive/mistral_finetune_1_scores
