In [None]:
import os
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from util.util_prompt import prepare_data_id_few_shot
from util.util_compute import predict_classification_causal as predict_classification
from peft import PeftModel

In [None]:
def evaluate(datasets:list, 
             model_config:dict,
             output_folder:str, 
             model_lora_path:str = None,
             results_root:str = 'results_downstream',
             ):
    """Run few-shot classification on each dataset and save one CSV per dataset.

    The base causal LM and its tokenizer are loaded once per call. When
    ``model_lora_path`` is given, the adapter stored there is attached with
    ``PeftModel.from_pretrained`` and the result file name gets a ``-FT``
    suffix.

    Args:
        datasets: Dataset identifiers; each one is passed to
            ``prepare_data_id_few_shot``.
        model_config: Keyword arguments for
            ``AutoModelForCausalLM.from_pretrained``. Must contain
            ``'pretrained_model_name_or_path'`` and ``'device_map'``.
            ``'trust_remote_code'`` defaults to ``True`` when absent; an
            explicit value is respected. The dict is not modified.
        output_folder: Prefix of the per-dataset result directory
            (``{results_root}/{output_folder}_{dataset}``).
        model_lora_path: Optional path of a LoRA adapter. ``None`` evaluates
            the base model.
        results_root: Directory under which result folders are created.

    Returns:
        None. For every dataset a CSV with the columns ``input``, ``answers``,
        ``number_options``, ``preds`` and ``probs`` is written and its path is
        printed.

    Raises:
        KeyError: If ``model_config`` lacks a required key.
    """
    import gc  # local import: only needed for the cleanup in the finally block

    # Work on a copy so the caller's dict is never mutated, and so a caller
    # supplied 'trust_remote_code' does not collide with a hard-coded keyword.
    load_kwargs = dict(model_config)
    load_kwargs.setdefault('trust_remote_code', True)
    model_name = load_kwargs['pretrained_model_name_or_path']
    device_map = load_kwargs['device_map']

    model = base_model = None
    try:
        # load model and tokenizer
        base_model = AutoModelForCausalLM.from_pretrained(**load_kwargs)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            truncation_side='left',
            padding_side='right',
            # keep the tokenizer consistent with the model's remote-code policy
            trust_remote_code=load_kwargs['trust_remote_code'],
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # load LoRA weight (optional)
        if model_lora_path is not None:
            model = PeftModel.from_pretrained(base_model, model_lora_path)
        else:
            model = base_model
        model.eval()

        # Loop-invariant pieces of the result file name.
        short_name = model_name.split("/")[-1]
        suffix = '-FT' if isinstance(model, PeftModel) else ''  # FT = Fine-tuned

        # evaluate
        for dataset in datasets:
            # === Output Folder ===
            result_dir = f'{results_root}/{output_folder}_{dataset}'
            os.makedirs(result_dir, exist_ok=True)
            SAVE_FILE = f'{result_dir}/result_prompt_{short_name}{suffix}.csv'

            # === Prepare Dataset ===
            inputs, answers, number_options = prepare_data_id_few_shot(dataset)

            # === Run Predictions ===
            preds, probs = [], []
            with torch.inference_mode():
                for idx in tqdm(range(len(inputs))):
                    conf, pred = predict_classification(
                        model, tokenizer, inputs[idx], number_options[idx], device_map
                    )
                    probs.append(conf)
                    preds.append(pred)  # expected to be 'A', 'B', etc.

            # === Save Results ===
            df = pd.DataFrame({
                'input': inputs,
                'answers': answers,
                'number_options': number_options,
                'preds': preds,
                'probs': probs
            })
            df.to_csv(SAVE_FILE, index=False)
            print(f"Saved to {SAVE_FILE}")
    finally:
        # Drop the model references explicitly: after an exception the
        # traceback keeps this frame (and its locals) alive, which would
        # otherwise hold the weights in memory while the next model loads.
        model = base_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
# Shared settings used by every evaluation cell below.
OUTPUT_FOLDER = 'output'        # prefix of each per-dataset result folder
MODEL_LORA_DIR = 'finetuning'   # parent directory of the LoRA adapter runs

# Dataset identifiers handed to evaluate(), in the order they are processed.
datasets = [
    'copal',
    'maps',
    'indocloze',
    'indoculture',
    'indommlu',
    'indocareer',
]

In [None]:
# Sailor2-8B-Chat: evaluate the base checkpoint, then the same checkpoint
# with its LoRA adapter attached.
pretrained_model_name_or_path = 'sail/Sailor2-8B-Chat'
device_map = 'cuda:0'
torch_dtype = torch.float16
model_config = dict(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    device_map=device_map,
    torch_dtype=torch_dtype,
)

# Base model (no adapter)
evaluate(datasets, model_config, OUTPUT_FOLDER, model_lora_path=None)

# Fine-tuned model: adapter directory lives under MODEL_LORA_DIR
model_lora_path = os.path.join(
    MODEL_LORA_DIR,
    'sail_Sailor2-8B-Chat_2025-05-09_08-58-25',
    'model',
)
evaluate(datasets, model_config, OUTPUT_FOLDER, model_lora_path=model_lora_path)

In [None]:
# SeaLLMs-v3-7B-Chat: evaluate the base checkpoint, then the same checkpoint
# with its LoRA adapter attached.
pretrained_model_name_or_path = 'SeaLLMs/SeaLLMs-v3-7B-Chat'
device_map = 'cuda:0'
torch_dtype = torch.float16
model_config = dict(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    device_map=device_map,
    torch_dtype=torch_dtype,
)

# Base model (no adapter)
evaluate(datasets, model_config, OUTPUT_FOLDER, model_lora_path=None)

# Fine-tuned model: adapter directory lives under MODEL_LORA_DIR
model_lora_path = os.path.join(
    MODEL_LORA_DIR,
    'SeaLLMs_SeaLLMs-v3-7B-Chat_2025-07-01_13-11-03',
    'model',
)
evaluate(datasets, model_config, OUTPUT_FOLDER, model_lora_path=model_lora_path)

In [None]:
# Llama-SEA-LION-v3-8B-IT: evaluate the base checkpoint, then the same
# checkpoint with its LoRA adapter attached.
pretrained_model_name_or_path = 'aisingapore/Llama-SEA-LION-v3-8B-IT'
device_map = 'cuda:0'
torch_dtype = torch.float16
model_config = dict(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    device_map=device_map,
    torch_dtype=torch_dtype,
)

# Base model (no adapter)
evaluate(datasets, model_config, OUTPUT_FOLDER, model_lora_path=None)

# Fine-tuned model: adapter directory lives under MODEL_LORA_DIR
model_lora_path = os.path.join(
    MODEL_LORA_DIR,
    'aisingapore_Llama-SEA-LION-v3-8B-IT_2025-07-01_15-34-59',
    'model',
)
evaluate(datasets, model_config, OUTPUT_FOLDER, model_lora_path=model_lora_path)

### Sample for low-end GPUs

In [None]:
# # Evaluate Qwen2.5-0.5B-Instruct
# pretrained_model_name_or_path = 'Qwen/Qwen2.5-0.5B-Instruct'
# torch_dtype = torch.float16
# device_map = 'cuda:0'
# model_config = {'pretrained_model_name_or_path':pretrained_model_name_or_path,
#                 'device_map':device_map,
#                 'torch_dtype':torch_dtype}
# # evaluate(datasets, model_config, OUTPUT_FOLDER, None)

# # Evaluate Qwen2.5-0.5B-Instruct-FT (finetuned model)
# model_lora_path = os.path.join(MODEL_LORA_DIR, 'Qwen_Qwen2.5-0.5B-Instruct_2025-11-03_20-40-59', 'model')
# evaluate(datasets, model_config, OUTPUT_FOLDER, model_lora_path)