#### Methods
- evaluate method : using model,tokenizer,valid_data, metrics dict evaluate the 
- method to extract valid data 

#### LOAD DATA

In [2]:
import dill as pickle

def get_validation_data(valid_path='data/dataset_test.pkl', shuffle_seed=1337):

    with open(valid_path, 'rb') as f:
        shuffled_alpaca = pickle.load(f)

    if shuffle_seed is not None:
        shuffled_alpaca = shuffled_alpaca.shuffle(seed=shuffle_seed)

    return shuffled_alpaca

#### LOAD MODEL, TOKENIZER

In [3]:
import pathlib

def extract_path_info(checkpoint_path):
    files_mapping = {}
    for p in pathlib.Path(checkpoint_path).iterdir():
        if p.is_file():
            if "config" in str(p).lower():
                files_mapping["config"] = str(p)
            elif "model" in str(p).lower():
                files_mapping["weights"] = str(p)
            elif "readme" in str(p).lower():
                files_mapping["readme"] = str(p)
            elif "mistral_l" in str(p).lower():
                files_mapping["gens"] = str(p)

    return files_mapping

In [3]:
import json 
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

def get_model_tokenizer(checkpoint_path="weights/mistral_0_3"):
    files_maping = extract_path_info(checkpoint_path)

    with open(files_maping["config"], 'r') as file:
        loaded_config = json.load(file)

    model_name = loaded_config['base_model_name_or_path']

    tokenizer = AutoTokenizer.from_pretrained(
        model_name
        )
    
    model = AutoPeftModelForCausalLM.from_pretrained(
        checkpoint_path,
        load_in_4bit = True # on my setup this is the only doable load
        )   # dragos can probs pass files_mapping["config"] dirrectly
    
    return model, tokenizer

  from .autonotebook import tqdm as notebook_tqdm


#### Eval for one listed metric

In [4]:
def extract_text_after_substring(full_text, substring_list):

    for sus_str in substring_list:
        index = full_text.find(sus_str)
        if index != -1:
            return full_text[index + len(sus_str):].strip()

In [5]:
def extract_text_excluding_dubs(text):

    def split_into_sentences(text):
        return [sentence.strip() for sentence in text.split('.') if sentence]

    # Split the text into sentences
    sentences = split_into_sentences(text)

    # Identify repeating groups
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            group_size = j - i
            if group_size > 0 and sentences[i:i + group_size] == sentences[j:j + group_size]:
                return '. '.join(sentences[:j + group_size]) + '.'
    

    return text


In [6]:
def simple_tokenize(text):
    # Splitting by whitespace and removing punctuation
    tokens = text.split()
    tokens = [token.strip('.,!?;:') for token in tokens]
    return tokens

In [16]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from evaluate import load

rouge = load("rouge")
comet = load("comet")
sari = load("sari")
smoothie = SmoothingFunction().method4

def compute_metric(pred, target, source, metric_name="bleu"):
    if metric_name=="bleu":
        pred = simple_tokenize(pred)
        target = simple_tokenize(target)
        bleu_score = sentence_bleu([target], pred)
        return bleu_score
    elif metric_name=="rouge":
        results = rouge.compute(predictions=[pred],
                         references=[target],
                         rouge_types = ["rougeL"])
        return results["rougeL"]
    elif metric_name=='comet':
        results = comet.compute(
            predictions=[pred],
            references=[target],
            sources=[source])
        return results["mean_score"]
    elif metric_name=='sari':
        results = sari.compute(
            predictions=[pred],
            references=[[target]],
            sources=[source])
        return results["sari"]
    elif metric_name=="bleu_smooth":
        pred = simple_tokenize(pred)
        target = simple_tokenize(target)
        bleu_score = sentence_bleu([target], pred, smoothing_function=smoothie)
        return bleu_score
    

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 56527.01it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.1.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../home/machine73/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/home/machine73/anaconda3/envs/MT_P/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:177: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


### Use model to generate an output on each datapoint and evaluate the output

In [8]:
def eval_one_metric(model, tokenizer, eval_data, device, metric, eliminate_dups = True):

    model = model.to(device)
    model.eval()

    metric_norm = len(eval_data)
    metric_value = 0.0

    for data_point in eval_data:

        model_input = tokenizer(
            data_point["text"],
            return_tensors="pt"
            )

        model_out = model.generate(
            input_ids=model_input["input_ids"],
            max_new_tokens=3000 # this could be computed as max_tokens across all outs
            )
        
        out_text = tokenizer.batch_decode(
            model_out.detach().cpu().numpy(),
            skip_special_tokens=True
            )[0] # 0 because we passed a batch of one el

        _out = extract_text_after_substring(
            out_text,
            "The simplified text is:")
        
        if eliminate_dups:
            _out = extract_text_excluding_dubs(_out)

        _target = data_point["output"]

        metric_value += compute_metric(_out, _target, metric)
    
    return metric_value / metric_norm

In [None]:
import torch 

eval_data = get_validation_data(
    valid_path='data/dataset_test.pkl',
    shuffle_seed=1337
    )
model, tokenizer = get_model_tokenizer(
    checkpoint_path="weights/mistral_0_3"
    )
device = torch.device(
    'cuda' if torch.cuda.is_available() else 'cpu'
    )

eval_res = eval_one_metric(
    model,
    tokenizer,
    eval_data,
    device,
    metric="bleu"
)


### Evaluate using data already generated by the model

In [13]:
from tqdm import tqdm 
import re

In [24]:
def eval_one_metric_only_data(checkpoint_path, eval_data, metric=["bleu","bleu_smooth","rouge","comet","sari"], eliminate_dubs=True):
    
    metric_norm = len(eval_data)

    metric_value = {}

    for _m in metric:
        metric_value[_m] = 0.0

    img_tag_pattern = r'<img[^>]+>'
    
    check_maps = extract_path_info(checkpoint_path)

    with open(check_maps["gens"], 'rb') as f:
        generated_outs = pickle.load(f)

    for idx, data_point in enumerate(tqdm(eval_data)):

        _out = extract_text_after_substring(
            generated_outs[idx],
            ["The slightly simplified version is:",
            "The moderately simplified version is:",
            "The very simplified version is:"])
        
        if eliminate_dubs:
            _out = extract_text_excluding_dubs(_out)

        _target = data_point["output"]
        _target = re.sub(img_tag_pattern, '', _target)

        _source = data_point["input"]
        _source = re.sub(img_tag_pattern, '',_source)

        for key,_ in metric_value.items():
            metric_value[key] += compute_metric(
                _out, _target, _source, key)
        
    for k,v in metric_value.items():
        metric_value[k] = v / metric_norm

    return metric_value

In [19]:
import torch 

eval_data = get_validation_data(
    valid_path='data/dataset_mistral_l4_test.pkl',
    shuffle_seed=1337
    )

eval_res = eval_one_metric_only_data(
    "weights/mistral_0_4",
    eval_data,
    metric=["bleu","bleu_smooth","rouge","comet","sari"]
)

eval_res


  0%|          | 0/95 [00:00<?, ?it/s]Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  1%|          | 1/95 [00:03<05:17,  3.38s/it]Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  2%|▏         | 2/95 [00:06<05:12,  3.36s/it]Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  3%|▎         | 3/95 [00:10<05:13,  3.41s/it]Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, usin

{'bleu': 0.07439285741186737,
 'bleu_smooth': 0.07550946272469528,
 'rouge': 0.20958208161141675,
 'comet': 0.8322609826138145,
 'sari': 41.16333448459526}

In [20]:
import torch 

eval_data = get_validation_data(
    valid_path='data/dataset_mistral_l3_test.pkl',
    shuffle_seed=1337
    )

eval_res = eval_one_metric_only_data(
    "weights/mistral_0_3",
    eval_data,
    metric=["bleu","bleu_smooth","rouge","comet","sari"]
)

eval_res


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
h

{'bleu': 0.010083998493976164,
 'bleu_smooth': 0.01750778651756063,
 'rouge': 0.12864662772115817,
 'comet': 0.6144483986653779,
 'sari': 29.93644804681059}

In [21]:
import torch 

eval_data = get_validation_data(
    valid_path='data/dataset_mistral_l2_test.pkl',
    shuffle_seed=1337
    )

eval_res = eval_one_metric_only_data(
    "weights/mistral_0_2",
    eval_data,
    metric=["bleu","bleu_smooth","rouge","comet","sari"]
)

eval_res


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independentl

{'bleu': 0.018324115410036132,
 'bleu_smooth': 0.025535361273497547,
 'rouge': 0.14177417560459504,
 'comet': 0.6203303158283233,
 'sari': 23.36814132413904}

In [26]:
import torch 

eval_data = get_validation_data(
    valid_path='data/dataset_mistral_l1_test.pkl',
    shuffle_seed=1337
    )

eval_res = eval_one_metric_only_data(
    "weights/mistral_0_1",
    eval_data,
    metric=["bleu","bleu_smooth","rouge","comet","sari"]
)

eval_res


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/machine73/anaconda3/envs/MT_P/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:54: Detected 

TypeError: 'NoneType' object is not iterable