# Evaluation Model 1 
## Model name : unsloth/mistral-7b-v0.3
## Link Colab : [Click](https://colab.research.google.com/drive/1aR5De0sDeIAD9bqISREKv487LUav7gwF?usp=sharing)
## Link Adapter weight : [Click](https://www.kaggle.com/datasets/farhanwew/lora-adapter-1)

# Download Dependencies

In [None]:
%%capture
! pip install unsloth
! pip install rouge_score
! pip install evaluate

In [None]:
%%capture
! pip install -U peft
! pip install bert_score

# Import Dependencies

In [None]:
from tqdm import tqdm
import kagglehub
import os

In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
import evaluate

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from datasets import load_dataset
import evaluate
import torch
import bert_score
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import os
import re

In [None]:
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset

In [None]:
from peft import PeftModel

# Load dataset

In [None]:
# 1. Load the doctor-answer dataset by its repository id.
#    The returned object is indexed by split name further down (dataset["test"]).
dataset = load_dataset("farwew/DoctorsAnswerTextDataset-in-Indonesian")

In [None]:
# 3. Take the "test" split
dataset_eval = dataset["test"]

# 4. Convert it to a pandas DataFrame and preview the first rows
df = dataset_eval.to_pandas()
df.head()

In [None]:
# 5. Stratified split on the 'topic_set' column.
#    An integer `test_size` is an absolute row count, not a fraction, so
#    `stratified_sample_df` receives 150 rows and `train_df` gets the remainder.
#    `random_state=42` keeps the sample reproducible across runs.
train_df, stratified_sample_df = train_test_split(
    df,
    test_size=150,  # number of rows in the evaluation sample; adjust as needed
    stratify=df["topic_set"],
    random_state=42
)

In [None]:
# Wrap both pandas frames back into `datasets.Dataset` objects.
# Only `stratified_dataset` is iterated by the evaluation loops in this notebook.
train_dataset = Dataset.from_pandas(train_df)
stratified_dataset = Dataset.from_pandas(stratified_sample_df)

In [None]:
# Display the stratified evaluation sample (columns and row count).
stratified_dataset

# Load Adapter

In [None]:
# Download the LoRA adapter dataset and locate the adapter directory inside it.
path = kagglehub.dataset_download("farhanwew/lora-adapter-1")
print("Path to dataset files:", path)

# The adapter lives under <download>/content/lora_model; list the parent
# folder first so a wrong layout is visible in the cell output.
content_dir = f"{path}/content"
print(os.listdir(content_dir))
full_path = f"{content_dir}/lora_model"

#  Notebook Evaluasi Base Model (tanpa Adapter LoRA)

In [None]:
# Loading options forwarded to FastLanguageModel.from_pretrained in the next cell.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [None]:
# Load the base model and its tokenizer (no LoRA adapter is applied here),
# using the sequence length, dtype and 4-bit settings defined above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Put the base model in evaluation mode before generating.
# NOTE(review): the adapter section calls FastLanguageModel.for_inference(model)
# instead — confirm whether the base run should do the same so both models are
# evaluated under the same inference setup.
model.eval()

In [None]:
# Alpaca-style prompt template with three positional slots, filled via
# str.format in this order: instruction, input, response.
# The response slot is passed as "" when the template is used for generation.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [None]:
EOS_TOKEN = tokenizer.eos_token


def formatting_func(examples):
    """Render a batch of examples into full prompt strings.

    Args:
        examples: Mapping with parallel sequences under the keys
            "instruction", "input" and "output".

    Returns:
        A dict with a single key "text" holding one string per example:
        the Alpaca template filled with the three fields, followed by
        ``EOS_TOKEN``.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    texts = []
    for instruction, context, answer in columns:
        texts.append(alpaca_prompt.format(instruction, context, answer) + EOS_TOKEN)
    return {"text": texts}

## Inference dengan base model

In [None]:
# Smoke test: sample one answer from the base model for a hand-written question.
sample_prompt = alpaca_prompt.format(
    "mencegah mual",
    "Anak saya mengalami mual, bagaimana mencegah mual?",
    "",
)
inputs = tokenizer([sample_prompt], return_tensors="pt").to("cuda")

# Same sampling settings as the evaluation loop, with a shorter generation budget.
sampling_options = dict(
    max_new_tokens=512,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
)

with torch.no_grad():
    outputs = model.generate(**inputs, **sampling_options)

# The decoded text contains the prompt followed by the generated continuation.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

## Evaluation

In [None]:
# === Evaluation === #
# Load the BLEU and ROUGE metrics through the `evaluate` library.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# Accumulators filled by the evaluation loop, one entry per evaluated example.
# Re-run this cell before re-running the loop, otherwise results are appended twice.
generated_outputs = []
reference_outputs = []
all_instructions = []
all_inputs = []

In [None]:
def normalize_text(text):
    """Return a lowercase, punctuation-free, single-spaced copy of ``text``.

    Steps: lowercase, drop every character whose Unicode general category is
    a punctuation class (``P*``), collapse whitespace runs into one space and
    trim the ends.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; an empty string stays empty.
    """
    # Local import so the cell does not depend on another cell's imports.
    import unicodedata

    text = text.lower()
    # The stdlib `re` module has no \p{P} class (it raises "bad escape \p"),
    # so punctuation is filtered by Unicode category instead.
    text = "".join(
        ch for ch in text if not unicodedata.category(ch).startswith("P")
    )
    # Strip last: removing punctuation can leave spaces at the ends.
    return re.sub(r"\s+", " ", text).strip()

In [None]:
# Generate one sampled answer per evaluation example with the base model and
# collect it together with the reference answer and the prompt fields.
for num, example in enumerate(
    tqdm(stratified_dataset, desc="Evaluating Base Model"), start=1
):
    prompt = alpaca_prompt.format(example["instruction"], example["input"], "")
    encoded = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output = model.generate(
            **encoded,
            max_new_tokens=1024,
            do_sample=True,
            top_p=0.9,
            temperature=0.7
        )

    # Decode only the tokens generated after the prompt. Splitting the full
    # text on "### Response:" and keeping the last piece would cut the answer
    # short whenever the model emits that marker again in its continuation.
    prompt_length = encoded["input_ids"].shape[1]
    response = tokenizer.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Show the first nine examples for a quick visual check.
    if num < 10:
        print(prompt)
        print()
        print('response:',response)

    generated_outputs.append(response)
    reference_outputs.append(example["output"])
    all_instructions.append(example["instruction"])
    all_inputs.append(example["input"])

In [None]:
from datetime import datetime, timezone

# === Evaluation scores === #
# BERTScore returns per-example precision / recall / F1 tensors;
# BLEU and ROUGE return corpus-level result dicts.
P, R, F1 = bert_score.score(generated_outputs, reference_outputs, lang="id", device="cuda")
bleu_score = bleu.compute(predictions=generated_outputs, references=[[r] for r in reference_outputs])
rouge_score = rouge.compute(predictions=generated_outputs, references=reference_outputs)

# Timestamped output folder (UTC) so repeated runs never overwrite each other.
# datetime.utcnow() is deprecated; the timezone-aware call yields the same string.
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
output_dir = f"eval_basemodel_{timestamp}"
os.makedirs(output_dir, exist_ok=True)

In [None]:
import json

# === Save evaluation results === #
# Per-example table: prompt fields, reference, generation and BERTScore parts.
per_example_columns = {
    "instruction": all_instructions,
    "input": all_inputs,
    "reference": reference_outputs,
    "generated": generated_outputs,
    "bert_precision": P.tolist(),
    "bert_recall": R.tolist(),
    "bert_f1": F1.tolist(),
}
df = pd.DataFrame(per_example_columns)
df.to_csv(f"{output_dir}/base_model_outputs.csv", index=False)


def _to_percent(score):
    """Scale a 0-1 score to a percentage rounded to two decimals."""
    return round(score * 100, 2)


# Aggregate metrics, all reported as percentages.
metrics = {
    "bert_score": {
        "precision": _to_percent(P.mean().item()),
        "recall": _to_percent(R.mean().item()),
        "f1": _to_percent(F1.mean().item()),
    },
    "bleu": _to_percent(bleu_score["bleu"]),
    "rouge": {
        key: _to_percent(rouge_score[key])
        for key in ("rouge1", "rouge2", "rougeL")
    },
}

with open(f"{output_dir}/metrics.json", "w") as f:
    json.dump(metrics, f, indent=4)

print("=== EVALUASI BASE MODEL SELESAI ===")
print(metrics)

In [None]:
# Reload the per-example results written by this run.
# The CSV lives in the timestamped `output_dir` created when the scores were
# computed, so the path is built from it; a hard-coded folder name from one
# specific past run does not exist on any later run.
hasil_base = pd.read_csv(f"{output_dir}/base_model_outputs.csv")
hasil_base

# Dengan Adapter

In [None]:
import gc
import torch

# Release the base model before the adapter model is loaded.
# Memory can only be returned once nothing references the old weights, so the
# notebook-level names are dropped first.
# NOTE(review): cell outputs that returned the model (e.g. `model.eval()`) may
# still be held by the notebook's output cache — confirm memory actually drops.
model = None
tokenizer = None

# Garbage collection (must run before the cache is emptied so that freed
# tensors are handed back to the allocator first)
gc.collect()

# Clear PyTorch cache
torch.cuda.empty_cache()

In [None]:
MAX_SEQ_LENGTH = 2048
USE_4BIT = True

# === LOAD LoRA MODEL === #
# Load from the adapter directory resolved by the kagglehub download cell
# (`full_path`). The previous literal "/kaggle/input//content/lora_model" was
# missing the dataset folder between the two slashes and ignored that path.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = full_path,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = None,
    load_in_4bit = USE_4BIT,
)
FastLanguageModel.for_inference(model)

In [None]:
from transformers import TextStreamer

# Stream one answer from the adapter model for a hand-written question,
# reusing the alpaca_prompt template defined earlier in the notebook.
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

sample_prompt = alpaca_prompt.format(
    "mencegah mual",
    "Anak saya mengalami mual, bagaimana mencegah mual?",
    "",
)
inputs = tokenizer([sample_prompt], return_tensors="pt").to("cuda")

# Tokens are printed as they are generated; the returned ids are discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=512)

In [None]:
from transformers import TextStreamer

# Stream one answer from the adapter model for a second, longer question,
# reusing the alpaca_prompt template defined earlier in the notebook.
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

sample_prompt = alpaca_prompt.format(
    "Selalu tersedak setiap makan dan minum",
    "Selalu tersedak setiap makan dan minum. Dok saya mau tanya... Kenapa kalau saya makan atau minum selalu tersedak.. Akhirnya susah nafas.. Dada sangat sesak.... Hampir setiap makan saya tersedak...",
    "",
)
inputs = tokenizer([sample_prompt], return_tensors="pt").to("cuda")

# Tokens are printed as they are generated; the returned ids are discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=512)

In [None]:
# === ADDITIONAL IMPORTS === #
import bert_score
import pandas as pd
import json
from datetime import datetime
import os

# === LOAD EVALUATION METRICS === #
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# Fresh accumulators so the adapter results do not mix with the base-model run.
generated_outputs = []
reference_outputs = []
all_inputs = []  # "input" field of each evaluated example
all_instructions = []  # "instruction" field of each evaluated example

# === PARAMETERS === #
batch_size = 1  # Adjust to the GPU capacity
MAX_DISPLAY = 10
displayed = 0

# No output directory is created here: the scoring cell below builds its own
# timestamped `eval_finetuned_<timestamp>` folder, so creating an
# `evaluation_results_<timestamp>` folder at this point only left an empty
# directory behind on every run.

In [None]:
# Display the evaluation sample again; the adapter run below iterates the same
# `stratified_dataset` object as the base-model run.
stratified_dataset

In [None]:
# Generate one sampled answer per evaluation example with the adapter model and
# collect it together with the reference answer and the prompt fields.
for num, example in enumerate(
    tqdm(stratified_dataset, desc="Evaluating Fine tune Model"), start=1
):
    prompt = alpaca_prompt.format(example["instruction"], example["input"], "")
    encoded = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output = model.generate(
            **encoded,
            max_new_tokens=1024,
            do_sample=True,
            top_p=0.9,
            temperature=0.7
        )

    # Decode only the tokens generated after the prompt. Splitting the full
    # text on "### Response:" and keeping the last piece would cut the answer
    # short whenever the model emits that marker again in its continuation.
    prompt_length = encoded["input_ids"].shape[1]
    response = tokenizer.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Show the first nine examples for a quick visual check.
    if num < 10:
        print(prompt)
        print()
        print('response:',response)

    generated_outputs.append(response)
    reference_outputs.append(example["output"])
    all_instructions.append(example["instruction"])
    all_inputs.append(example["input"])

In [None]:
from datetime import datetime, timezone

# === Evaluation scores === #
# BERTScore returns per-example precision / recall / F1 tensors;
# BLEU and ROUGE return corpus-level result dicts.
P, R, F1 = bert_score.score(generated_outputs, reference_outputs, lang="id", device="cuda")
bleu_score = bleu.compute(predictions=generated_outputs, references=[[r] for r in reference_outputs])
rouge_score = rouge.compute(predictions=generated_outputs, references=reference_outputs)

# Timestamped output folder (UTC) so repeated runs never overwrite each other.
# datetime.utcnow() is deprecated; the timezone-aware call yields the same string.
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
output_dir = f"eval_finetuned_{timestamp}"
os.makedirs(output_dir, exist_ok=True)

In [None]:
import json

# === Save evaluation results === #
# Per-example table: prompt fields, reference, generation and BERTScore parts.
per_example_columns = {
    "instruction": all_instructions,
    "input": all_inputs,
    "reference": reference_outputs,
    "generated": generated_outputs,
    "bert_precision": P.tolist(),
    "bert_recall": R.tolist(),
    "bert_f1": F1.tolist(),
}
df = pd.DataFrame(per_example_columns)
df.to_csv(f"{output_dir}/fine_tuned_outputs.csv", index=False)


def _to_percent(score):
    """Scale a 0-1 score to a percentage rounded to two decimals."""
    return round(score * 100, 2)


# Aggregate metrics, all reported as percentages.
metrics = {
    "bert_score": {
        "precision": _to_percent(P.mean().item()),
        "recall": _to_percent(R.mean().item()),
        "f1": _to_percent(F1.mean().item()),
    },
    "bleu": _to_percent(bleu_score["bleu"]),
    "rouge": {
        key: _to_percent(rouge_score[key])
        for key in ("rouge1", "rouge2", "rougeL")
    },
}

with open(f"{output_dir}/metrics.json", "w") as f:
    json.dump(metrics, f, indent=4)

print("=== EVALUASI Fine tune MODEL SELESAI ===")
print(metrics)

In [None]:
# (empty cell — the stray "^c" was residue from an interrupted run, not code)