# This notebook contains two main evaluation workflows

The first can be used when the eval function runs on the same device on which the model was trained, so the checkpoint folders are already available locally.
The second is used when the eval function runs on a device that first needs to download the iterations (checkpoints) from the Hugging Face Hub.

In [None]:
import pandas as pd
from datasets import load_dataset
from core.run_evaluation import eval_function
from unsloth import FastLanguageModel
import torch
import os

# Load the 'test' revision of the evaluation dataset and turn its 'train'
# split into a pandas DataFrame; `df` is later passed to the evaluation
# function as the ground-truth dataframe.
test_dataset = load_dataset(
    path='chris7374/esg-net-zero',
    revision='test',
)
df = test_dataset['train'].to_pandas()

In [None]:
#Just For Gemma2
def generate_response(text: str) -> str:
    """Generate the model's answer for one prompt using the chat template.

    Uses the module-level ``model`` and ``tokenizer``, which the evaluation
    loops load before the evaluation function invokes this callback.

    Args:
        text: Prompt text; it is appended to a fixed task preamble and sent
            as a single user turn.

    Returns:
        The decoded newly generated tokens only (prompt tokens are sliced
        off and special tokens are skipped).
    """
    messages = [
        {"role": "user", "content": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n" + text},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    )
    # The template returns a CPU tensor, while the model is loaded with
    # device_map='auto'; generation needs both on the same device.
    inputs = inputs.to(model.device)
    outputs = model.generate(
        inputs,
        max_new_tokens=500,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        # Greedy decoding. `temperature` only applies when sampling, and a
        # value of 0 is not a valid sampling temperature, so request
        # deterministic output explicitly instead.
        do_sample=False,
    )
    # Keep only the tokens produced after the prompt.
    response_tokens = outputs[0][inputs.shape[-1]:]
    return tokenizer.decode(response_tokens, skip_special_tokens=True)

In [None]:
# Prompt
# `instruction` is the task description passed to the evaluation function as
# its `instruction` argument: it defines the four target classes and the
# precedence rules between them.
# `example` is passed as the `example` argument and is an empty string here.
instruction = """You are an expert ESG (Environmental, Social, and Governance) analyst who conducts ESG research by analyzing texts to identify the presence of climate balance targets. Your primary task is to classify identified targets into one of four predefined classes and determine the target year for the climate balance target. Only consider overall climate balance targets, meaning that they are company-wide.
The possible classes are “Carbon neutral(ity)”, “Emissions reduction target”, “Net zero”, and “No target”.
Each class has equal importance, and the correct classification should reflect the most explicit target mentioned in the text. In cases where multiple classes are present:
	•	“Net zero” should only be prioritized if explicitly mentioned as a company’s overarching target.
	•	“Carbon neutral(ity)” takes precedence over “Emissions reduction target” only if it is the primary focus of the text.
	•	“Emissions reduction target” should be classified if it is directly stated and not overshadowed by “Net zero” or “Carbon neutral(ity)” commitments.
	•	If no explicit target is mentioned, classify as “No target”.
Ensure the classification is based on explicit information from the text, without assuming that one target implies another unless clearly stated."""
example = """"""

In [None]:
## If the checkpoint folders are already on the device
import gc

max_seq_length = 8192
dtype = None # None for auto detection.
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Path to the local folder containing the model checkpoints
local_model_folder = "models"

# List of checkpoint folders to check
checkpoints_to_check = [
     'checkpoint-105',
] ## Keep empty for checking every checkpoint

# Get a list of all checkpoint folders in the local model folder
checkpoint_folders = [
    f for f in os.listdir(local_model_folder)
    if os.path.isdir(os.path.join(local_model_folder, f))
]

for checkpoint_folder in checkpoint_folders:
    # An empty filter list means "evaluate every checkpoint".
    if checkpoints_to_check and checkpoint_folder not in checkpoints_to_check:
        continue

    # Bind both names before the try block: if loading fails, the cleanup in
    # `finally` would otherwise raise NameError (the names are unbound on the
    # first iteration and deleted after each one), hiding the real error and
    # aborting the remaining checkpoints.
    model = tokenizer = None
    try:
        model_path = os.path.join(local_model_folder, checkpoint_folder)
        print(f"Checkpoint: {model_path}")
        # `model` and `tokenizer` are module-level names read by generate_response.
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_path,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
            device_map='auto'
        )
        FastLanguageModel.for_inference(model=model)
        # Evaluate this checkpoint against the ground-truth dataframe.
        eval_function(
            instruction=instruction,
            example=example,
            iteration=checkpoint_folder,
            ground_truth_dataframe=df,
            target_column='end_target',
            target_year_column='end_target_year',
            context_column='custom_text',
            prompt_structures=['CIEKX'],
            generate_response=generate_response,
            finetuned=True,
            save_to_docx=True,
        )
    except Exception as e:
        # Best effort: report the failure and continue with the next checkpoint.
        print(f"An error occurred: {e}")
    finally:
        # Drop the references to the model and tokenizer to free up memory
        del model
        del tokenizer
        # Collect unreachable objects first so their GPU memory can be released
        gc.collect()

        # Clear CUDA cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


In [None]:
# If the model is online (checkpoints are stored as branches of a Hugging Face repo)
from huggingface_hub import snapshot_download
from huggingface_hub import HfApi
import gc
import shutil
from unsloth import FastLanguageModel
import torch

max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
path="model-to-test"  # Temporary local folder each branch is downloaded into

checkpoints_folder = "checkpoint"  # Replace with your folder path
repo_id = "chris7374/15-10-esgemma2b-3-epoch"  # Your Hugging Face repository ID

checkpoints_to_check=[
    #'checkpoint-105',
    #'checkpoint-52'
] ## Keep empty for checking every checkpoint

api = HfApi()

# Get all branches for the model
branches = api.list_repo_refs(repo_id)
branches = branches.branches
for branch in branches:
    # Last component of the ref (e.g. 'refs/heads/checkpoint-105' -> 'checkpoint-105')
    branch_name = branch.ref.split('/')[-1]

    # An empty filter list means "evaluate every branch".
    if checkpoints_to_check and branch_name not in checkpoints_to_check:
        continue
    # The 'main' branch is never evaluated.
    if branch_name == 'main':
        continue

    # Bind both names before the try block: if the download or loading fails,
    # the cleanup in `finally` would otherwise raise NameError and abort the
    # remaining branches.
    model = tokenizer = None
    try:
        print(f"Branch: {branch_name}")
        snapshot_download(repo_id=repo_id, revision=branch_name, local_dir=path)

        # `model` and `tokenizer` are module-level names read by generate_response.
        model, tokenizer = FastLanguageModel.from_pretrained(
            path,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
            device_map='auto'
        )
        FastLanguageModel.for_inference(model=model)

        eval_function(
            instruction=instruction,
            example=example,
            iteration=branch_name,
            ground_truth_dataframe=df,
            target_column='end_target',
            target_year_column='end_target_year',
            context_column='custom_text',
            prompt_structures=['CIEKX'],
            generate_response=generate_response,
            finetuned=True,
            save_to_docx=True,
        )
    except Exception as e:
        # Best effort: report the failure and continue with the next branch.
        print(f"An error occurred: {e}")
    finally:
        # Drop the references to the model and tokenizer to free up memory
        del model
        del tokenizer
        # Collect unreachable objects first so their GPU memory can be released
        gc.collect()

        # Clear CUDA cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        # Remove the downloaded snapshot; the folder may not exist if the
        # download itself failed, so do not raise during cleanup.
        shutil.rmtree(path, ignore_errors=True)