In [None]:
import os

from google.colab import drive

# Make Google Drive visible inside the Colab runtime.
drive.mount("/content/drive")

# Location of the project's `model` folder on the mounted drive;
# edit this to match your own Drive layout.
ROOT_PATH = "/content/drive/MyDrive/MNLP/project-m2-2024-ab-eh-me/M2_Clean/project-m2-2024-ab-eh-me/model"

# Switch the working directory so relative paths resolve against ROOT_PATH.
os.chdir(ROOT_PATH)

In [None]:
!pip install -r requirements.txt
!pip install bert_score
!pip install rouge
# Not sure if bleu anf meteor are already installed?

In [None]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
import evaluate
from unsloth import FastLanguageModel
import utils
import torch


In [None]:
# Load the model and tokenizer with plain Hugging Face Transformers.
model_name = "Other_checkpoints/MCQA_dpo/Quantized"
# Pass the variable itself: the quoted literal "model_name" would be resolved
# as a checkpoint called "model_name" instead of the path defined above.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Truncate over-long inputs from the left so the end of the prompt is kept.
tokenizer.truncation_side = 'left'

In [None]:
# Alternative loader: use unsloth when it can load this checkpoint;
# otherwise fall back to the plain Transformers loading code above.
model, tokenizer = FastLanguageModel.from_pretrained(
    "Other_checkpoints/MCQA_dpo/Quantized",
    load_in_4bit=False,
)
print(model.dtype)
# Keep the end of the prompt when an input has to be truncated.
tokenizer.truncation_side = 'left'

In [None]:
# Display the dtype reported by the currently loaded model.
print(model.dtype)

In [None]:
import torch
# Read the DPO test split and report how many entries it contains.
input_file = 'datasets/merged_DPO_test.jsonl'
data = utils.read_jsonl(input_file)
print(len(data))

# Only the first 512 examples are used for generation.
data = data[0:512]

In [None]:
# Remove the chat template from the data
def remove_before_and_including_substring(original_string, substring):
    """Return the text that follows the first occurrence of `substring`.

    Everything up to and including that first occurrence is dropped. When
    `substring` does not occur in `original_string`, the input is returned
    unchanged.
    """
    position = original_string.find(substring)

    # Guard clause: nothing to strip when the marker is absent.
    if position == -1:
        return original_string

    # Skip past the marker itself and keep the remainder.
    return original_string[position + len(substring):]

def process_list_of_strings(string_list, substring):
    """Apply `remove_before_and_including_substring` to every string.

    Returns a new list, in the same order as `string_list`, holding the
    stripped version of each element.
    """
    stripped = []
    for text in string_list:
        stripped.append(remove_before_and_including_substring(text, substring))
    return stripped

In [None]:
# Prefer the GPU whenever CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# System message placed at the start of every templated prompt.
instruction = "You are an experienced teacher who answers the STEM-related question asked by a student below."

# End-of-sequence token of the loaded tokenizer, used to close each chat turn.
eos_token = tokenizer.eos_token


def generate_prediction(prompts, model=model, tokenizer=tokenizer, has_chat_template=True, base=False):
    r"""Generate one completion per prompt and return the text after the prompt.

    Args:
        prompts: list of prompt strings.
        model: causal language model exposing `generate`; defaults to the
            notebook-level `model`.
        tokenizer: tokenizer used for encoding and decoding; defaults to the
            notebook-level `tokenizer`.
        has_chat_template: when True, each prompt is wrapped in the
            `<|system|>` / `<|user|>` / `<|assistant|>` template (using the
            notebook-level `instruction`) before tokenization.
        base: when True, the prompt is stripped from the decoded output with
            `str.removeprefix`; otherwise everything up to and including the
            first "<|assistant|>\n" marker is dropped.

    Returns:
        A list with one decoded string per prompt.
    """
    # Nothing to generate for an empty batch.
    if not prompts:
        return []

    # Use the EOS token of the tokenizer that was actually passed in, not the
    # one captured from the notebook-level tokenizer.
    eos = tokenizer.eos_token

    if has_chat_template:
        prompts = [f"<|system|>\n{instruction}{eos}\n<|user|>\n{prompt.rstrip()}{eos}\n<|assistant|>\n" for prompt in prompts]

    # Batched tokenization with padding=True needs a pad token; fall back to
    # EOS when the tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Decoder-only models continue from the last position of each row, so a
    # batch must be padded on the left; right padding would make the model
    # continue from pad tokens. The previous setting is restored afterwards.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True, max_length=512)
    finally:
        tokenizer.padding_side = previous_padding_side

    # Send the inputs to wherever the model lives; fall back to the
    # notebook-level `device` when the model does not expose one.
    target_device = getattr(model, "device", device)
    inputs = {key: value.to(target_device) for key, value in inputs.items()}

    # max_length bounds prompt + generated tokens together.
    outputs = model.generate(
        **inputs,
        max_length=1024,
    )

    predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Remove the prompt from the prediction.
    if base:  # Base model: strip the exact prompt text from the front.
        predictions = [prediction.removeprefix(prompt) for prediction, prompt in zip(predictions, prompts)]
    else:  # Chat model: keep only what follows the assistant marker.
        predictions = process_list_of_strings(predictions, "<|assistant|>\n")

    return predictions

# Run generation batch by batch, collecting one record per test entry.
batch_size = 64
output_data = []
print(f"Generating predictions for {len(data)} entries...")

for i in range(0, len(data), batch_size):
    batch = data[i:i+batch_size]

    # Split the batch into its prompts and gold ("chosen") answers.
    prompts = [entry['prompt'] for entry in batch]
    chosen = [entry['chosen'] for entry in batch]

    predictions = generate_prediction(prompts, model=model, tokenizer=tokenizer, has_chat_template=True)

    # Pair every prompt with its gold answer and the model's prediction.
    output_data.extend(
        {
            "prompt": prompts[k],
            "chosen": chosen[k],
            "prediction": predictions[k],
        }
        for k in range(len(batch))
    )
    print(f"Batch {i//batch_size + 1} done")

# Persist all collected records to a new JSONL file.
output_file = 'datasets/Quant_2.jsonl'
utils.write_jsonl(output_data, output_file)

In [None]:
def evaluate_model(model_preditions_gold, scores=("bleu", "bertscore", "rouge", "meteor")):
    """Score saved predictions against their gold answers.

    Args:
        model_preditions_gold: path to a JSONL file read with
            `utils.read_jsonl`; every entry must carry a 'prediction' and a
            'chosen' (reference) field.
        scores: names of the metrics to load with `evaluate.load`.

    Returns:
        A dict mapping each metric name to its result. For "bertscore" the
        per-example precision / recall / f1 lists are averaged into single
        values; other metrics are stored as returned by `compute`.

    Raises:
        ValueError: if the file contains no entries.
    """
    # Keep the path and the loaded entries under separate names.
    entries = utils.read_jsonl(model_preditions_gold)
    if not entries:
        # Fail early with a clear message instead of a division by zero or
        # an opaque error from inside a metric.
        raise ValueError(f"No predictions found in {model_preditions_gold!r}")

    model_predictions = [entry['prediction'] for entry in entries]
    model_gold = [entry['chosen'] for entry in entries]

    # Calculate the scores
    scores_dict = {}
    for score in scores:
        scoring_method = evaluate.load(score)
        if score == "bertscore":
            results = scoring_method.compute(predictions=model_predictions, references=model_gold, lang="en")
            # bertscore reports one value per example; reduce each to its mean.
            scores_dict[score] = {
                key: sum(results[key]) / len(results[key])
                for key in ('precision', 'recall', 'f1')
            }
        else:
            scores_dict[score] = scoring_method.compute(predictions=model_predictions, references=model_gold)

    return scores_dict

#base = evaluate_model("datasets/Base_predictions.jsonl", scores=["bleu", "bertscore", "rouge", "meteor"])
#print(base)
#epfllama = evaluate_model("datasets/EPFLLaMA_model_predictions.jsonl", scores=["bleu", "bertscore", "rouge", "meteor"])
#print(epfllama)
#epfllama_mcqa_quant = evaluate_model("datasets/EPFLLaMA_MCQA_Quantized.jsonl", scores=["bleu", "bertscore", "rouge", "meteor"])
#print(epfllama_mcqa_quant)
# Score the saved TinyLlama chat predictions with all four metrics.
tinyllama_chat = evaluate_model(
    "datasets/TinyLlama_chat_predictions.jsonl",
    scores=["bleu", "bertscore", "rouge", "meteor"],
)
print(tinyllama_chat)
