In [1]:
# Selects which evaluation the dispatch cell at the end of the notebook runs.
# Supported values: "bleu", "bleurt", "ppl".
metric = "ppl"

In [2]:
import torch
import yaml
from tqdm import tqdm

# Load the notebook settings and pull out the two file paths used by the
# evaluation functions below.
config_file = '/workspace/ChartQA/config.yaml'
# Read the YAML as UTF-8 explicitly so parsing does not depend on the
# platform's default text encoding.
with open(config_file, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Only the "evaluation" section of the config is used in this notebook.
evaluation_config = config['evaluation']

generated_file = evaluation_config['generated_file']
target_file = evaluation_config["target_file"]
print("Configuration loaded successfully.")
print(f"generated_file={generated_file}")
print(f"target_file={target_file}")

Configuration loaded successfully.
generated_file=/notebooks/evaluation/OWID/Llama-2-7b-hf-c2t-2-005/generated.txt
target_file=/notebooks/evaluation/OWID/Llama-2-7b-hf-c2t-2-005/target.txt


In [3]:
def run_bleu():
    """Print corpus-level BLEU-1 to BLEU-4 for the generated file vs. the target file.

    Reads the module-level ``generated_file`` and ``target_file`` paths. Each
    line is one example: line i of the generated file is scored against line i
    of the target file as its single reference. Tokens are obtained by plain
    whitespace splitting.

    Raises:
        AssertionError: if the two files do not contain the same number of lines.
    """
    from nltk.translate.bleu_score import corpus_bleu

    def load_file(file_path):
        """Return the lines of ``file_path`` without their trailing newline."""
        with open(file_path, 'r', encoding='utf-8') as file:
            # Split on real line endings only. str.splitlines() also breaks on
            # characters such as U+2028 or form feed, which may appear inside a
            # sentence and would shift the pairing between the two files.
            return [line.rstrip('\n') for line in file]

    generated = load_file(generated_file)
    targets = load_file(target_file)

    print(len(generated))
    print(len(targets))

    # Scores are only meaningful when every hypothesis has its reference.
    assert len(generated) == len(targets), (
        f"Line count mismatch: {len(generated)} generated vs {len(targets)} target lines"
    )

    # Tokenize once and reuse for every n-gram order.
    references = [[t.split()] for t in targets]
    hypotheses = [g.split() for g in generated]

    # Uniform weights over the first k n-gram orders. BLEU-3 uses exact thirds
    # so that the weights sum to 1 (0.33 * 3 would only sum to 0.99).
    weights_by_order = {
        1: (1, 0, 0, 0),
        2: (0.5, 0.5, 0, 0),
        3: (1 / 3, 1 / 3, 1 / 3, 0),
        4: (0.25, 0.25, 0.25, 0.25),
    }

    # Compute every score before printing so the four result lines stay together.
    scores = {
        order: corpus_bleu(references, hypotheses, weights=weights)
        for order, weights in weights_by_order.items()
    }
    for order, score in scores.items():
        print(f"BLEU-{order} score: {score}")

    
    
def run_bleurt():
    %cd /notebooks/bleurt
    !pip install .
    
    from bleurt import score as bleurt_score
    
    checkpoint = "/notebooks/bleurt/bleurt/test_checkpoint"  # Change this to your BLEURT checkpoint path
    scorer = bleurt_score.BleurtScorer(checkpoint)

    with open(generated_file, 'r') as gen_f, open(target_file, 'r') as tar_f:
        generated_lines = gen_f.readlines()
        target_lines = tar_f.readlines()

    bleurt_scores = scorer.score(references=target_lines, candidates=generated_lines)
    avg_bleurt_score = sum(bleurt_scores) / len(bleurt_scores)
    print(f"Average BLEURT score: {avg_bleurt_score}")
    
def run_ppl():
    """Print the perplexity of the text in ``generated_file`` under ``gpt2-large``.

    The whole file is tokenized as one sequence and scored with a sliding
    window, so texts longer than the model's context length are supported.

    Raises:
        ValueError: if the text has fewer than two tokens (nothing to predict).
    """
    from transformers import GPT2LMHeadModel, GPT2TokenizerFast

    # Use the GPU when one is available; otherwise fall back to the CPU
    # instead of failing on a hard-coded "cuda".
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_id = "gpt2-large"
    model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
    tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

    def calculate_ppl(text, model, tokenizer, device, stride=512):
        """Return the sliding-window perplexity of ``text`` as a float.

        Windows of up to ``model.config.n_positions`` tokens start every
        ``stride`` tokens. In each window only the tokens not scored by a
        previous window contribute to the loss; the earlier tokens serve as
        context.
        """
        encodings = tokenizer(text, return_tensors="pt")
        max_length = model.config.n_positions
        seq_len = encodings.input_ids.size(1)

        nll_sum = 0.0   # total negative log-likelihood over all scored tokens
        n_tokens = 0    # number of tokens that contributed to nll_sum
        prev_end_loc = 0
        for begin_loc in tqdm(range(0, seq_len, stride), desc="Calculating PPL"):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # new tokens in this window
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
            target_ids = input_ids.clone()
            # -100 masks the context tokens out of the loss.
            target_ids[:, :-trg_len] = -100

            # The model shifts labels by one position internally, so the label
            # at index 0 never enters the loss; count only the remaining ones.
            num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()
            if num_loss_tokens > 0:
                with torch.no_grad():
                    outputs = model(input_ids, labels=target_ids)
                # outputs.loss is the mean over this window's scored tokens.
                # Weight it by the token count so a short final window does not
                # count as much as a full one.
                nll_sum += outputs.loss.item() * num_loss_tokens
                n_tokens += num_loss_tokens

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        if n_tokens == 0:
            raise ValueError("Perplexity needs at least two tokens of text.")

        avg_nll = torch.tensor(nll_sum / n_tokens, dtype=torch.float64)
        return torch.exp(avg_nll).item()

    with open(generated_file, 'r', encoding='utf-8') as f:
        generated_text = f.read()

    ppl_score = calculate_ppl(generated_text, model, tokenizer, device)
    print(f"Perplexity score: {ppl_score}")

In [4]:
# Run the evaluation selected by `metric`.
if metric == "bleu":
    run_bleu()
elif metric == "bleurt":
    run_bleurt()
elif metric == "ppl":
    run_ppl()
else:
    # Fail loudly on a typo instead of silently evaluating nothing.
    raise ValueError(
        f"Unknown metric {metric!r}; expected 'bleu', 'bleurt' or 'ppl'."
    )

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Calculating PPL:   0%|          | 0/1 [00:00<?, ?it/s]

Perplexity score: 19.15102195739746



