# Research candidate LLMs for RAG

Based on the findings in our EDA, we decided that we will finetune an LLM model for question answering, in this notebook we will explore some alternatives (restricted by the current available GPU for development RTX 3060ti). We will analyse and explore alternatives and best configurations (quantization, PEFT training)

In [6]:
from transformers import pipeline
import pandas as pd
import evaluate
from bert_score import score as bertscore
import numpy as np
from typing import Callable
import mlflow

# define evaluation metrics

In [8]:
def compute_evaluations(preds, targets):
    rouge = evaluate.load("rouge")
    rouge_scores = rouge.compute(predictions=preds, references=targets, use_stemmer=True)

    bertscore = evaluate.load("bertscore")
    P, R, F1, _ = bertscore.compute(predictions=preds, references=targets, lang="en", model_type="distilbert-base-uncased").values()
    
    bertscore_avg = {
        "bertscore_precision": np.array(P).mean().item(),
        "bertscore_recall": np.array(R).mean().item(),
        "bertscore_f1": np.array(F1).mean().item(),
    }
    
    return {**rouge_scores, **bertscore_avg}

In [11]:
m = compute_evaluations(["if you have the flu you need to rest"], ["rest is the best for the flu"])

In [13]:
m

{'rouge1': np.float64(0.375),
 'rouge2': np.float64(0.14285714285714288),
 'rougeL': np.float64(0.25),
 'rougeLsum': np.float64(0.25),
 'bertscore_precision': 0.744418740272522,
 'bertscore_recall': 0.7576205730438232,
 'bertscore_f1': 0.7509616613388062}

In [15]:
m.items()

dict_items([('rouge1', np.float64(0.375)), ('rouge2', np.float64(0.14285714285714288)), ('rougeL', np.float64(0.25)), ('rougeLsum', np.float64(0.25)), ('bertscore_precision', 0.744418740272522), ('bertscore_recall', 0.7576205730438232), ('bertscore_f1', 0.7509616613388062)])

In [14]:
rouge1, rouge2, rougeL, rougeLsum, bertscore_precision,bertscore_recall, bertscore_f1= m.values()

In [37]:
compute_evaluations(["if you have the flu you need to rest"], ["I like pizza"])

{'rouge1': np.float64(0.0),
 'rouge2': np.float64(0.0),
 'rougeL': np.float64(0.0),
 'rougeLsum': np.float64(0.0),
 'bertscore_precision': 0.6455554366111755,
 'bertscore_recall': 0.6872128844261169,
 'bertscore_f1': 0.6657330989837646}

In [None]:

bertscore = evaluate.load("bertscore")
predictions = ["hello there", "general kenobi", "if you have the flu you need to rest"]
references = ["hello there", "general kenobi", "I like pizza but I dont like mangos"]
P, R, F1, _  = bertscore.compute(predictions=predictions, references=references, lang="en", idf=True, model_type="distilbert-base-uncased").values()
P, R, F1, _ 

([1.000000238418579, 1.000000238418579, 0.6719009876251221],
 [1.000000238418579, 1.000000238418579, 0.6600361466407776],
 [1.000000238418579, 1.000000238418579, 0.6659157276153564],
 'distilbert-base-uncased_L5_idf_version=0.3.12(hug_trans=4.53.1)')

#### Note:
as we can see here we can not blindly trust bert-scored specially on small inputs but it should still be a good metrict to help us compare semantic similarity

# test some models

In [10]:
val_df = pd.read_parquet("../data/cleaned/validation_dataset.parquet")
val_df.head(5)

Unnamed: 0,question,answer,answer_words,valid_question,valid_answer
910,what research (or clinical trials) is being do...,New types of treatment are being tested in cli...,249,True,True
1692,What is (are) Leg Injuries and Disorders ?,"Your legs are made up of bones, blood vessels,...",96,True,True
2722,How to diagnose Axenfeld-Rieger syndrome type 1 ?,Is genetic testing available for Axenfeld Rieg...,64,True,True
239,What is (are) Kidney Disease ?,"When your kidneys fail, they are no longer abl...",82,True,True
1334,What is (are) Ovarian Germ Cell Tumors ?,Key Points\n - Ovarian germ...,237,True,True


In [None]:
candidate_models = [#("text2text-generation", "google/flan-t5-base"), 
                    ("text-generation","unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"), 
                    ("text-generation", "unsloth/Phi-3.5-mini-instruct")]

for task, model_id in candidate_models:
    qa_model = pipeline(task, model=model_id)
    if task == "text-generation":
        input = [{"role": "user", "content": "What is the flu?"}]
    else:
        input = "What is the flu?"
    response = qa_model(input)[0]["generated_text"]
    print(f"model: {model_id} -> response: {response}")

In [16]:
import os

def evaluate_qa_models(
    df: pd.DataFrame,
    qa_model: Callable[[str], str],
    experiment_name: str,
    model_name: str,
    mlflow_uri: str = None,
):
    """
    Runs QA model over df and logs EM & F1 metrics to MLflow.
    """
    if mlflow_uri:
        mlflow.set_tracking_uri(mlflow_uri)
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("num_examples", len(df))

        preds = []
        truths = []
        for _, row in df.iterrows():
            q = row["question"]
            input = [{"role": "user", "content": q}]
            truth = row["answer"]
            pred = qa_model(input)
            preds.append(pred)
            truths.append(truth)

        metrics = compute_evaluations(preds, truths)
        mlflow.log_metrics(metrics)

        # log predictions and truth as artifact for inspection
        out_df = df.copy()
        out_df["predicted"] = preds


        os.makedirs("mlflow_artifacts", exist_ok=True)
        csv_path = "mlflow_artifacts" / f"{model_name}_predictions.csv"
        out_df.to_csv(csv_path, index=False)
        mlflow.log_artifact(str(csv_path), artifact_path="predictions")


In [None]:
candidate_models = ["unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", 
                    "unsloth/Phi-3.5-mini-instruct"]

for model_id in candidate_models:
    qa_model = pipeline("text-generation", model=model_id)
    evaluate_qa_models(
        df = val_df.head(10), 
        qa_model=qa_model,
        experiment_name="initial comparision",
        model_name=model_id
        )