#### Methods
- evaluate method: using the model, tokenizer, validation data and a metric, evaluate the model's generated outputs
- method to extract the validation data

#### LOAD DATA

In [1]:
import dill as pickle

def get_validation_data(valid_path='data/dataset_test.pkl', shuffle_seed=1337):
    """Load the pickled validation dataset and optionally shuffle it.

    Args:
        valid_path: Path of the pickle file that stores the dataset.
        shuffle_seed: Seed forwarded to the loaded object's ``shuffle``
            method. Pass ``None`` to get the data back in its stored order.

    Returns:
        The unpickled object, shuffled when ``shuffle_seed`` is not ``None``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(valid_path, 'rb') as handle:
        dataset = pickle.load(handle)

    # Guard clause: no seed means the caller wants the stored order.
    if shuffle_seed is None:
        return dataset

    return dataset.shuffle(seed=shuffle_seed)

#### LOAD MODEL, TOKENIZER

In [2]:
import pathlib

def extract_path_info(checkpoint_path):
    """Map the files of a checkpoint directory to the role they play.

    Only the file *name* is inspected (case-insensitively). Matching on the
    full path would misclassify every file as soon as a parent directory
    contains "config" or "model" in its name.

    Args:
        checkpoint_path: Directory that holds the checkpoint files.

    Returns:
        A dict with up to three keys -- ``"config"``, ``"weights"`` and
        ``"readme"`` -- each mapped to the path (as ``str``) of a matching
        file. A key is absent when no file matches it. Sub-directories are
        ignored.
    """
    files_mapping = {}

    # Sorted so the outcome does not depend on the OS listing order when
    # several files match the same role (the last one in sorted order wins).
    for p in sorted(pathlib.Path(checkpoint_path).iterdir()):
        if not p.is_file():
            continue

        name = p.name.lower()
        # Order matters: a name containing both "config" and "model" is
        # treated as the config file.
        if "config" in name:
            files_mapping["config"] = str(p)
        elif "model" in name:
            files_mapping["weights"] = str(p)
        elif "readme" in name:
            files_mapping["readme"] = str(p)

    return files_mapping

In [3]:
import json 
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

def get_model_tokenizer(checkpoint_path="weights/mistral_0_3"):
    """Load the PEFT model stored at ``checkpoint_path`` and its tokenizer.

    The tokenizer is loaded from the name found under the
    ``base_model_name_or_path`` key of the checkpoint's config file, while
    the model itself is loaded from the checkpoint directory.

    Args:
        checkpoint_path: Directory containing the checkpoint files.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        KeyError: If no config file is found in ``checkpoint_path`` or the
            config lacks ``base_model_name_or_path``.
    """
    checkpoint_files = extract_path_info(checkpoint_path)

    with open(checkpoint_files["config"], 'r') as config_file:
        checkpoint_config = json.load(config_file)

    base_model_name = checkpoint_config['base_model_name_or_path']
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    # 4-bit loading is, per the original author, the only load that fits
    # on their setup.
    # NOTE (original author, for Dragos): the config path from the files
    # mapping could probably be passed directly.
    model = AutoPeftModelForCausalLM.from_pretrained(
        checkpoint_path,
        load_in_4bit=True,
    )

    return model, tokenizer

  from .autonotebook import tqdm as notebook_tqdm


#### Eval for one listed metric

In [4]:
def extract_text_after_substring(full_text, substring):
    """Return what follows the first occurrence of ``substring``.

    Args:
        full_text: Text to search in.
        substring: Marker to look for.

    Returns:
        The text after the first occurrence of ``substring`` with
        surrounding whitespace stripped, or ``""`` when the marker does not
        occur in ``full_text``.
    """
    try:
        marker_start = full_text.index(substring)
    except ValueError:
        # Marker not present: nothing to extract.
        return ""

    marker_end = marker_start + len(substring)
    return full_text[marker_end:].strip()

In [5]:
def extract_text_excluding_dubs(text):
    """Cut ``text`` right after the first back-to-back repeated sentence group.

    The text is split on ``'.'`` into stripped sentences. The sentences are
    then scanned for a run of consecutive sentences that is immediately
    followed by an identical run. When one is found, everything up to the
    end of that first repeated copy is re-joined with ``'. '`` and returned
    with a trailing ``'.'``. The first repetition itself is kept; whatever
    comes after it is dropped.

    Args:
        text: Text to scan.

    Returns:
        The truncated text, or ``text`` unchanged when no repetition is found.
    """
    # Naive sentence split: empty pieces are dropped before stripping.
    pieces = [chunk.strip() for chunk in text.split('.') if chunk]
    total = len(pieces)

    for start in range(total):
        # ``width`` is the size of the candidate group beginning at ``start``;
        # its potential duplicate begins right where the group ends.
        for width in range(1, total - start):
            repeat_at = start + width
            repeat_end = repeat_at + width
            if pieces[start:repeat_at] == pieces[repeat_at:repeat_end]:
                return '. '.join(pieces[:repeat_end]) + '.'

    # No repeated group anywhere: hand the input back untouched.
    return text


In [6]:
def simple_tokenize(text):
    """Split ``text`` on whitespace and trim punctuation from each token.

    Only the characters ``.,!?;:`` are trimmed, and only from the two ends
    of a token. A token made solely of those characters becomes an empty
    string and is kept in the result.

    Args:
        text: Text to tokenize.

    Returns:
        The list of trimmed tokens, in their original order.
    """
    return [word.strip('.,!?;:') for word in text.split()]

In [7]:
from nltk.translate.bleu_score import sentence_bleu
from evaluate import load

# ROUGE metric object, created on first use and reused by later calls so the
# metric is not reloaded for every single prediction.
_ROUGE_METRIC = None


def _get_rouge_metric():
    """Return the ROUGE metric object, loading it only on first use."""
    global _ROUGE_METRIC
    if _ROUGE_METRIC is None:
        _ROUGE_METRIC = load('rouge')
    return _ROUGE_METRIC


def compute_metric(pred, target, metric_name="bleu"):
    """Score one prediction against one reference text.

    Args:
        pred: Text produced by the model.
        target: Reference text.
        metric_name: ``"bleu"`` (sentence-level BLEU over the tokens from
            ``simple_tokenize``) or ``"rougeL"``.

    Returns:
        The score as a single number, so callers can accumulate it with
        ``+=`` and average it.

    Raises:
        ValueError: If ``metric_name`` is not a supported metric.
    """
    if metric_name == "bleu":
        pred_tokens = simple_tokenize(pred)
        target_tokens = simple_tokenize(target)
        # sentence_bleu expects a list of references, hence the wrapping list.
        return sentence_bleu([target_tokens], pred_tokens)

    if metric_name == "rougeL":
        results = _get_rouge_metric().compute(
            predictions=[pred],
            references=[target],
            rouge_types=["rougeL"],
        )
        # compute() returns a dict keyed by rouge type; return the number
        # itself, otherwise the callers' running sum fails on float + dict.
        return results["rougeL"]

    # Previously an unknown name fell through and returned None, which only
    # surfaced later as a confusing TypeError in the caller.
    raise ValueError(
        f"Unsupported metric_name {metric_name!r}; expected 'bleu' or 'rougeL'"
    )
    

### Use model to generate an output on each datapoint and evaluate the output

In [8]:
def eval_one_metric(model, tokenizer, eval_data, device, metric, eliminate_dups=True, max_new_tokens=3000):
    """Generate an output for every data point and average one metric.

    For each data point the ``"text"`` field is tokenized and fed to
    ``model.generate``. The decoded output is reduced to what follows
    "The simplified text is:", optionally truncated after the first
    repeated sentence group, and scored against the ``"output"`` field.

    Args:
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        eval_data: Sized iterable of mappings with ``"text"`` and
            ``"output"`` keys.
        device: Device the model and the inputs are moved to.
        metric: Metric name forwarded to ``compute_metric``.
        eliminate_dups: When true, post-process each output with
            ``extract_text_excluding_dubs``.
        max_new_tokens: Generation budget per data point.

    Returns:
        The metric averaged over all data points.

    Raises:
        ValueError: If ``eval_data`` is empty.
    """
    metric_norm = len(eval_data)
    if metric_norm == 0:
        # Fail early with a clear message instead of a ZeroDivisionError
        # after the model has already been moved to the device.
        raise ValueError("eval_data is empty; cannot average a metric over zero data points")

    # NOTE(review): some transformers/bitsandbytes versions reject `.to()` on
    # 4-bit quantized models -- confirm against the installed versions.
    model = model.to(device)
    model.eval()

    metric_value = 0.0

    for data_point in eval_data:
        # The inputs must live on the same device as the model; the
        # tokenizer creates them on the CPU.
        model_input = tokenizer(
            data_point["text"],
            return_tensors="pt",
        ).to(device)

        model_out = model.generate(
            input_ids=model_input["input_ids"],
            # Pass the mask explicitly when the tokenizer provides one so
            # generate() does not have to guess it.
            attention_mask=model_input.get("attention_mask"),
            max_new_tokens=max_new_tokens,
        )

        # Index 0 because a batch of exactly one element was generated.
        out_text = tokenizer.decode(model_out[0], skip_special_tokens=True)

        _out = extract_text_after_substring(
            out_text,
            "The simplified text is:",
        )

        if eliminate_dups:
            _out = extract_text_excluding_dubs(_out)

        metric_value += compute_metric(_out, data_point["output"], metric)

    return metric_value / metric_norm

In [None]:
import torch 

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Validation data, shuffled with a fixed seed.
eval_data = get_validation_data(valid_path='data/dataset_test.pkl', shuffle_seed=1337)

# Fine-tuned checkpoint and the tokenizer that goes with it.
model, tokenizer = get_model_tokenizer(checkpoint_path="weights/mistral_0_3")

# Generate an output for every data point and average the BLEU score.
eval_res = eval_one_metric(model, tokenizer, eval_data, device, metric="bleu")


### Evaluate using data already generated by the model

In [1]:
def eval_one_metric_only_data(generated_outs, eval_data, metric="bleu", eliminate_dubs=True):
    """Average one metric over outputs the model has already generated.

    ``generated_outs[i]`` is scored against ``eval_data[i]["output"]`` after
    being reduced to what follows "The simplified text is:" and, optionally,
    truncated after the first repeated sentence group.

    Args:
        generated_outs: Sequence of generated strings, aligned by position
            with ``eval_data``. Entries beyond ``len(eval_data)`` are ignored.
        eval_data: Sized iterable of mappings with an ``"output"`` key.
        metric: Metric name forwarded to ``compute_metric``.
        eliminate_dubs: When true, post-process each output with
            ``extract_text_excluding_dubs``.

    Returns:
        The metric averaged over all data points.

    Raises:
        TypeError: If ``generated_outs`` is a single string.
        ValueError: If ``eval_data`` is empty or there are fewer generated
            outputs than data points.
    """
    if isinstance(generated_outs, str):
        # A bare string is indexable, so it would silently be scored one
        # character per data point and yield a meaningless result.
        raise TypeError("generated_outs must be a sequence of strings, not a single string")

    metric_norm = len(eval_data)
    if metric_norm == 0:
        raise ValueError("eval_data is empty; cannot average a metric over zero data points")

    if len(generated_outs) < metric_norm:
        # Detect the mismatch up front rather than failing mid-loop.
        raise ValueError(
            f"got {len(generated_outs)} generated outputs for {metric_norm} data points"
        )

    metric_value = 0.0

    # zip stops at the end of eval_data; the length check above guarantees
    # that every data point has a generated output.
    for generated, data_point in zip(generated_outs, eval_data):
        _out = extract_text_after_substring(
            generated,
            "The simplified text is:",
        )

        if eliminate_dubs:
            _out = extract_text_excluding_dubs(_out)

        metric_value += compute_metric(_out, data_point["output"], metric)

    return metric_value / metric_norm

In [None]:
import torch 

# Validation data, shuffled with the same fixed seed as the generation run.
eval_data = get_validation_data(valid_path='data/dataset_test.pkl', shuffle_seed=1337)

# Placeholder: replace with the list of generated strings, one per data
# point and in the same order as eval_data.
generated_outs = "Insert list of string here"

# Average the BLEU score of the pre-generated outputs.
eval_res = eval_one_metric_only_data(generated_outs, eval_data, metric="bleu")
