In [16]:
import torch
import evaluate
import os

def load_texts(file_path):
    """
    Read a UTF-8 text file and return its lines as a list of strings.

    Leading and trailing whitespace (including the newline) is removed from
    every line. Each line of the file produces exactly one entry, so a blank
    line is returned as an empty string rather than being dropped.
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def _mean_percent(scores):
    """Return the arithmetic mean of *scores* scaled to a 0-100 percentage."""
    return sum(scores) / len(scores) * 100


def main(
    source_path='D3test/ikatdata.source',
    target_path='D3test/ikatdata.target',
    generated_hypo_path='D3test/ikatdata.hypo',
):
    """
    Score generated hypotheses against reference targets and print the results.

    Loads the source, target and hypothesis files line by line, checks that
    they are aligned, then computes BLEU (max n-gram order 1 and 2) and
    BERTScore (model 'bert-base-uncased') of the hypotheses against the
    targets. All scores are printed as percentages with two decimals.

    Args:
        source_path: Path to the source-side text file. It is only used to
            validate that the three files have the same number of lines.
        target_path: Path to the reference text file, one reference per line.
        generated_hypo_path: Path to the generated hypothesis file, one
            hypothesis per line.

    Raises:
        FileNotFoundError: If any of the three paths does not exist.
        ValueError: If the files differ in line count, or if they contain no
            lines at all.
    """
    # Fail early with a clear message before any model is loaded.
    for path in (source_path, target_path, generated_hypo_path):
        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")

    # Load the datasets
    source_texts = load_texts(source_path)
    target_texts = load_texts(target_path)
    generated_hypo_texts = load_texts(generated_hypo_path)

    # Validate that all datasets have the same number of samples
    if not (len(source_texts) == len(target_texts) == len(generated_hypo_texts)):
        raise ValueError("Source, target, and generated hypothesis files must have the same number of lines.")

    # With zero samples the averages below would divide by zero; report the
    # real problem instead of an arithmetic error from deep inside a metric.
    if not target_texts:
        raise ValueError("Evaluation files are empty; there is nothing to score.")

    # Initialize metrics
    bleu = evaluate.load('bleu')
    bertscore = evaluate.load('bertscore')

    # BLEU takes a list of references per prediction; build it once and reuse
    # it for both n-gram orders.
    bleu_references = [[ref] for ref in target_texts]

    # Compute BLEU-1
    bleu1 = bleu.compute(
        predictions=generated_hypo_texts,
        references=bleu_references,
        max_order=1,
    )

    # Compute BLEU-2
    bleu2 = bleu.compute(
        predictions=generated_hypo_texts,
        references=bleu_references,
        max_order=2,
    )

    # Compute BERTScore (returns per-sample precision/recall/f1 lists)
    bs = bertscore.compute(
        predictions=generated_hypo_texts,
        references=target_texts,
        lang='en',
        model_type='bert-base-uncased',
    )

    # Display the results
    print("\n=== Evaluation Results ===")
    print(f"BLEU-1 Score: {bleu1['bleu'] * 100:.2f}")
    print(f"BLEU-2 Score: {bleu2['bleu'] * 100:.2f}")
    print(f"BERTScore Precision: {_mean_percent(bs['precision']):.2f}")
    print(f"BERTScore Recall: {_mean_percent(bs['recall']):.2f}")
    print(f"BERTScore F1: {_mean_percent(bs['f1']):.2f}")

# Run the evaluation only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()



=== Evaluation Results ===
BLEU-1 Score: 79.76
BLEU-2 Score: 76.58
BERTScore Precision: 88.89
BERTScore Recall: 89.72
BERTScore F1: 89.26
