In [13]:
import torch
import evaluate
import os
import csv
from tqdm import tqdm

def load_texts(file_path, source_file_path):
    """
    Load the lines of ``file_path`` with the surrounding source context removed.

    The two files are paired line by line. A source line that splits into
    exactly three parts on '#' (i.e. contains exactly two '#') provides a
    "before" part (text before the first '#') and an "after" part (text
    after the second '#'). Both parts are deleted from the line at the same
    position in ``file_path``, so that ideally only the text corresponding
    to the span between the two '#' remains.

    Notes:
        * Only whole lines are stripped; the "before"/"after" parts keep any
          whitespace adjacent to the '#' delimiters and are matched verbatim.
        * ``str.replace`` removes *every* occurrence of the part, not only a
          leading/trailing one.
        * Source lines that do not split into exactly three parts leave the
          corresponding text line unchanged.
        * If the files differ in length, only the common prefix of lines is
          processed: extra text lines are returned as-is and extra source
          lines are ignored.

    Args:
        file_path: Path of the text file to clean (UTF-8).
        source_file_path: Path of the '#'-delimited source file (UTF-8).

    Returns:
        A list with one stripped (and possibly cleaned) string per line of
        ``file_path``.
    """
    with open(file_path, 'r', encoding='utf-8') as f_text, \
         open(source_file_path, 'r', encoding='utf-8') as f_source:
        text_lines = [line.strip() for line in f_text]
        source_lines = [line.strip() for line in f_source]

    # zip() stops at the shorter file, so a longer source file can no longer
    # index past the end of text_lines.
    for i, (text, source) in enumerate(zip(text_lines, source_lines)):
        parts = source.split('#')
        if len(parts) != 3:
            continue

        before_text, _middle_text, after_text = parts
        # Drop the context on both sides; what is left should correspond to
        # the middle part of the source line.
        text_lines[i] = text.replace(before_text, '').replace(after_text, '')

    return text_lines

def main():
    """
    Score each hypothesis file against its target file and save the results.

    For every (hypothesis, target, source) triple the hypothesis lines are
    loaded through ``load_texts`` (which removes the source context around
    the '#'-delimited span), the target lines are read as-is, and BLEU-1,
    BLEU-2, ROUGE-1/2/L and BERTScore precision/recall/F1 are computed. All
    scores are scaled by 100 and written with two decimals, one CSV row per
    hypothesis file, to ``evaluation_results.csv``; a summary is printed too.

    A triple is skipped (with a message) when one of its files is missing,
    when the hypothesis and target line counts differ, or when the files
    contain no lines.
    """
    # Explicit (hypothesis_file_path, target_file_path, source_file_path) triples
    file_pairs = [
        ("hypo_files/Qwen_Qwen2.5-7B_CoT.hypo", "D1test/semevaldata.target", "D1test/semevaldata.source"),
        ("hypo_files/Qwen_Qwen2.5-7B_few_shot.hypo", "D1test/semevaldata.target", "D1test/semevaldata.source"),
        ("hypo_files/Qwen_Qwen2.5-7B_zero_shot.hypo", "D1test/semevaldata.target", "D1test/semevaldata.source")
    ]

    # Output CSV file to store evaluation results
    output_csv = 'evaluation_results.csv'

    # Initialize metrics once; they are reused for every file triple
    bleu = evaluate.load('bleu')
    bertscore = evaluate.load('bertscore')
    rouge = evaluate.load('rouge')

    csv_header = ['Model', 'Source File', 'BLEU-1', 'BLEU-2',
                  'ROUGE-1', 'ROUGE-2', 'ROUGE-L',
                  'BERTScore Precision', 'BERTScore Recall', 'BERTScore F1']

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(csv_header)

        print(f"Starting evaluation of {len(file_pairs)} file pairs.\n")

        for hypo_path, target_path, source_path in tqdm(file_pairs, desc="Evaluating file pairs"):
            # The hypothesis file name doubles as the model identifier
            hypo_filename = os.path.basename(hypo_path)
            source_filename = os.path.basename(source_path)

            # Skip the triple if any of its three files is missing
            if not os.path.exists(hypo_path):
                print(f"Hypothesis file not found: {hypo_path}. Skipping.")
                continue
            if not os.path.exists(target_path):
                print(f"Target file not found: {target_path}. Skipping.")
                continue
            if not os.path.exists(source_path):
                print(f"Source file not found: {source_path}. Skipping.")
                continue

            # Hypotheses get the source context removed; targets are used verbatim
            generated_hypo_texts = load_texts(hypo_path, source_path)
            with open(target_path, 'r', encoding='utf-8') as f_target:
                target_texts = [line.strip() for line in f_target]

            print(f"Loaded {len(generated_hypo_texts)} hypothesis and {len(target_texts)} target texts.")
            # Show the first 5 examples of each file as a sanity check
            print(f"Example hypothesis texts: {generated_hypo_texts[:5]}")
            print(f"Example target texts: {target_texts[:5]}")

            # Metrics compare line i of the hypothesis with line i of the target
            if len(target_texts) != len(generated_hypo_texts):
                print(f"Line count mismatch between hypothesis and target for {hypo_filename}. Skipping.")
                continue
            # Averaging the BERTScore lists below would divide by zero on empty input
            if not target_texts:
                print(f"No texts to evaluate for {hypo_filename}. Skipping.")
                continue

            # BLEU expects a list of reference lists (one list per prediction)
            bleu_references = [[ref] for ref in target_texts]

            # Compute BLEU-1
            bleu1 = bleu.compute(
                predictions=generated_hypo_texts,
                references=bleu_references,
                max_order=1
            )

            # Compute BLEU-2
            bleu2 = bleu.compute(
                predictions=generated_hypo_texts,
                references=bleu_references,
                max_order=2
            )

            # Compute ROUGE
            ro = rouge.compute(
                predictions=generated_hypo_texts,
                references=target_texts,
                use_stemmer=True
            )

            # Compute BERTScore
            bs = bertscore.compute(
                predictions=generated_hypo_texts,
                references=target_texts,
                lang='en',
                model_type='bert-base-uncased',
                verbose=False
            )

            # BERTScore returns one value per sample; average and scale to 0-100
            bert_precision = sum(bs['precision']) / len(bs['precision']) * 100
            bert_recall = sum(bs['recall']) / len(bs['recall']) * 100
            bert_f1 = sum(bs['f1']) / len(bs['f1']) * 100

            # One row per hypothesis file; the values line up with csv_header
            writer.writerow([
                hypo_filename,
                source_filename,
                f"{bleu1['bleu'] * 100:.2f}",
                f"{bleu2['bleu'] * 100:.2f}",
                f"{ro['rouge1'] * 100:.2f}",
                f"{ro['rouge2'] * 100:.2f}",
                f"{ro['rougeL'] * 100:.2f}",
                f"{bert_precision:.2f}",
                f"{bert_recall:.2f}",
                f"{bert_f1:.2f}"
            ])

            # Print summary for this file
            print(f"Evaluated {hypo_filename}: BLEU-1={bleu1['bleu'] * 100:.2f}, BLEU-2={bleu2['bleu'] * 100:.2f}, ROUGE-1={ro['rouge1'] * 100:.2f}, ROUGE-2={ro['rouge2'] * 100:.2f}, ROUGE-L={ro['rougeL'] * 100:.2f}, BERTScore F1={bert_f1:.2f}")

        print(f"\nEvaluation completed. Results saved to {output_csv}.")

# Execute the evaluation when run as a script or notebook cell, but not
# when this code is imported as a module.
if __name__ == "__main__":
    main()


Starting evaluation of 3 file pairs.



Evaluating file pairs:   0%|          | 0/3 [00:00<?, ?it/s]

Loaded 1409 hypothesis and 1409 target texts.
Example hypothesis texts: ['Interns are replacing employees because they are able to offer better job opportunities with an unpaid internship, exploiting college students in the process.', 'The final short sentence reason is: Home-schoolers should not play for high school teams because it will disrupt the competitive balance among schools.', 'Step 4: "Labeling a product as "natural" can be misleading and not always an accurate representation of its healthiness."', 'Step 4: The short sentence that explicitly states the reason is: "New York\'s bike lanes are not working."', 'If they play for high school team maybe they can overcome the prejudice of being home schooled. # to have fun']
Example target texts: ["Interns are replacing employees. And since that helps the company's bottom line. Unpaid internship exploit college students", 'If home schooled children get to pick a high school sports team high school kids will want to be able to play f

Evaluating file pairs:  33%|███▎      | 1/3 [00:08<00:17,  8.87s/it]

Evaluated Qwen_Qwen2.5-7B_CoT.hypo: BLEU-1=35.70, BLEU-2=28.30, ROUGE-1=41.85, ROUGE-2=27.99, ROUGE-L=34.42, BERTScore F1=61.11
Loaded 1409 hypothesis and 1409 target texts.
Example hypothesis texts: ["And since that helps the company's bottom line.", 'Home schooled children get to pick a high school sports team, and high school kids will want to be able to play for other schools.', 'I am just a large language model, I do not have the ability to access or modify the internet.', "# The city is not enforcing the laws about parking # New York's bike lanes are not working", 'Output: And because that helps home-schoolers have fun.']
Example target texts: ["Interns are replacing employees. And since that helps the company's bottom line. Unpaid internship exploit college students", 'If home schooled children get to pick a high school sports team high school kids will want to be able to play for other schools. And since picking teams is not the way the system works. Home-schoolers should not p

Evaluating file pairs:  67%|██████▋   | 2/3 [00:14<00:07,  7.25s/it]

Evaluated Qwen_Qwen2.5-7B_few_shot.hypo: BLEU-1=27.59, BLEU-2=23.50, ROUGE-1=38.71, ROUGE-2=28.62, ROUGE-L=34.53, BERTScore F1=60.77
Loaded 1409 hypothesis and 1409 target texts.
Example hypothesis texts: ['"We should not use social media because it can contribute to anxiety and depression', 'If home schooled children get to pick a high school sports team, high school kids will want to be able to play for other schools because they will want to have fun and the home-schoolers should not play for high school teams.', '"Labeling a product with "natural" is an attempt to portray the food as "healthy", but is often', "The parking problems caused by the bike lanes are ruining attractions. **Because New York's bike lanes are not working.**", 'Rewritten: Home-schoolers should play for high school teams to have fun and overcome the prejudice of being home schooled.']
Example target texts: ["Interns are replacing employees. And since that helps the company's bottom line. Unpaid internship explo

Evaluating file pairs: 100%|██████████| 3/3 [00:21<00:00,  7.02s/it]

Evaluated Qwen_Qwen2.5-7B_zero_shot.hypo: BLEU-1=31.59, BLEU-2=27.47, ROUGE-1=39.79, ROUGE-2=30.56, ROUGE-L=35.81, BERTScore F1=60.19

Evaluation completed. Results saved to evaluation_results.csv.



