In [None]:
import sys
import subprocess
import os

# Install the package from the parent directory in editable mode.
# pip is run as a module of the interpreter backing this kernel
# (sys.executable) so the package is installed into the kernel's own
# environment rather than into whichever `pip` happens to be first on PATH.
# NOTE: ".." is resolved against the current working directory.
# check_call raises subprocess.CalledProcessError if pip exits non-zero.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", ".."])

### BLEU

In [None]:
from fi.evals.metrics import BLEUScore
from fi.testcases import TestCase

def test_bleu_configurations():
    """Score one response/reference pair under several BLEUScore configs.

    Builds five evaluators, runs each of them on the same single test case,
    keeps the first metric value of the first eval result, and finally prints
    one ``<label>: <score>`` line per configuration (four decimal places).
    """
    candidate = "The quick brown fox jumps over the lazy dog"
    target = "The fast brown fox leaps over the sleepy dog"

    # Label -> evaluator. Insertion order is the evaluation and print order.
    evaluators = {
        # 1. No config supplied: library defaults
        "Default": BLEUScore(),
        # 2. Sentence mode with the n-gram order capped at 2
        "Bigram Only": BLEUScore(config={
            "mode": "sentence",
            "max_n_gram": 2,
            "smooth": "method1",
        }),
        # 3. Corpus mode
        "Corpus Mode": BLEUScore(config={
            "mode": "corpus",
            "max_n_gram": 4,
            "smooth": "method1",
        }),
        # 4. Explicit per-order weights, largest weight first
        "Custom Weights": BLEUScore(config={
            "mode": "sentence",
            "weights": [0.4, 0.3, 0.2, 0.1],
            "smooth": "method1",
        }),
        # 5. Same shape as a sentence-mode config but with "method2" smoothing
        "Method2 Smoothing": BLEUScore(config={
            "mode": "sentence",
            "max_n_gram": 4,
            "smooth": "method2",
        }),
    }

    sample = TestCase(response=candidate, expected_text=target)

    # Score every configuration before printing anything.
    results = {
        label: bleu.evaluate([sample]).eval_results[0].metrics[0].value
        for label, bleu in evaluators.items()
    }

    for name, score in results.items():
        print(f"{name}: {score:.4f}")


test_bleu_configurations()

### ROUGE

In [None]:
from fi.evals.metrics import ROUGEScore
from fi.testcases import TestCase

def test_rouge_configurations():
    """Score one response/reference pair under several ROUGEScore configs.

    Each evaluator is run on the same single test case. The first three
    metrics of the first eval result are read positionally and reported as
    precision, recall and F-measure, printed with four decimal places under
    a heading line per configuration.
    """
    candidate = "The quick brown fox jumps over the lazy dog"
    target = "The fast brown fox leaps over the sleepy dog"

    # Label -> evaluator. Insertion order is the evaluation and print order.
    evaluators = {
        # 1. No config supplied: library defaults
        "Default (Rouge1)": ROUGEScore(),
        # 2. rouge2 with stemming enabled
        "Rouge2": ROUGEScore(config={"rouge_type": "rouge2", "use_stemmer": True}),
        # 3. rougeL with stemming enabled
        "RougeL": ROUGEScore(config={"rouge_type": "rougeL", "use_stemmer": True}),
        # 4. rouge1 with stemming disabled
        "Rouge1 No Stemming": ROUGEScore(config={
            "rouge_type": "rouge1",
            "use_stemmer": False,
        }),
    }

    sample = TestCase(response=candidate, expected_text=target)

    results = {}
    for label, rouge in evaluators.items():
        reported = rouge.evaluate([sample]).eval_results[0].metrics
        # Positional mapping: index 0 -> precision, 1 -> recall, 2 -> fmeasure.
        results[label] = {
            "precision": reported[0].value,
            "recall": reported[1].value,
            "fmeasure": reported[2].value,
        }

    for name, scores in results.items():
        print(f"\n{name}:")
        print(f"  Precision: {scores['precision']:.4f}")
        print(f"  Recall: {scores['recall']:.4f}")
        print(f"  F-measure: {scores['fmeasure']:.4f}")


test_rouge_configurations()

### Embedding Similarity

In [None]:
from fi.evals.metrics import EmbeddingSimilarity
from fi.testcases import TestCase


def test_embedding_similarity_configurations():
    """Score one response/reference pair under several EmbeddingSimilarity configs.

    Runs four evaluators on the same single test case, keeps the first metric
    value of the first eval result for each, and prints one
    ``<label>: <score>`` line per configuration (four decimal places).
    """
    candidate = "The quick brown fox jumps over the lazy dog"
    target = "The fast brown fox leaps over the sleepy dog"

    # Label -> evaluator. Insertion order is the evaluation and print order.
    evaluators = {
        # 1. No config supplied: library defaults
        "Default (Cosine)": EmbeddingSimilarity(),
        # 2. "euclidean" method, normalize on
        "Euclidean Distance": EmbeddingSimilarity(config={
            "similarity_method": "euclidean",
            "normalize": True,
        }),
        # 3. "manhattan" method, normalize on
        "Manhattan Distance": EmbeddingSimilarity(config={
            "similarity_method": "manhattan",
            "normalize": True,
        }),
        # 4. "cosine" method, normalize off
        "Cosine No Normalization": EmbeddingSimilarity(config={
            "similarity_method": "cosine",
            "normalize": False,
        }),
    }

    sample = TestCase(response=candidate, expected_text=target)

    # Score every configuration before printing anything.
    results = {
        label: embedder.evaluate([sample]).eval_results[0].metrics[0].value
        for label, embedder in evaluators.items()
    }

    for name, score in results.items():
        print(f"{name}: {score:.4f}")


test_embedding_similarity_configurations()

### Levenshtein Distance

In [None]:
from fi.evals.metrics import LevenshteinDistance
from fi.testcases import TestCase

def test_levenshtein_input_types():
    """Run four LevenshteinDistance configs against nine kinds of input.

    Every evaluator is applied to every test case (one case per call). The
    first metric value of the first eval result is stored per
    (configuration, case) pair, and the full grid is printed afterwards,
    grouped by configuration, with four decimal places per score.
    """
    # Label -> evaluator. Insertion order is the evaluation and print order.
    evaluators = {
        # 1. No config supplied: library defaults
        "Default": LevenshteinDistance(),
        # 2. case_insensitive switched on
        "Case-Insensitive": LevenshteinDistance(config={"case_insensitive": True}),
        # 3. remove_punctuation switched on
        "No Punctuation": LevenshteinDistance(config={"remove_punctuation": True}),
        # 4. Both switches on
        "Case-Ins & No Punct": LevenshteinDistance(config={
            "case_insensitive": True,
            "remove_punctuation": True,
        }),
    }

    # Reference sentences shared by several of the cases below.
    pangram = "The quick brown fox jumps over the lazy dog."
    pangram_lower = "the quick brown fox jumps over the lazy dog."

    # Label -> test case, in the order they are evaluated and printed.
    test_cases = {
        # A. Response identical to the reference
        "Standard": TestCase(response=pangram, expected_text=pangram),
        # B. Differs only in letter case
        "Case Difference": TestCase(
            response="The Quick Brown Fox Jumps Over The Lazy Dog.",
            expected_text=pangram_lower,
        ),
        # C. Differs only in the final punctuation mark
        "Punctuation Difference": TestCase(
            response="The quick brown fox jumps over the lazy dog!",
            expected_text=pangram,
        ),
        # D. Differs in both letter case and punctuation
        "Case & Punctuation": TestCase(
            response="The Quick Brown Fox Jumps Over the Lazy Dog!",
            expected_text=pangram_lower,
        ),
        # E. Same words in a different order
        "Word Order": TestCase(
            response="The dog lazy the over jumps fox brown quick.",
            expected_text=pangram,
        ),
        # F. A couple of words replaced
        "Slight Difference": TestCase(
            response="The quick brown fox leaps over the sleeping dog.",
            expected_text=pangram,
        ),
        # G. Dashes / parentheses in the text
        "Special Characters": TestCase(
            response="The quick brown fox — it jumps over the lazy dog!",
            expected_text="The quick brown fox (jumps) over the lazy dog.",
        ),
        # H. Digits versus spelled-out numbers
        "Numeric Content": TestCase(
            response="The 5 quick foxes jump over 2 lazy dogs.",
            expected_text="The five quick foxes jump over two lazy dogs.",
        ),
        # I. Empty response
        "Empty Response": TestCase(response="", expected_text=pangram),
    }

    # Full grid: results[configuration label][case label] -> score.
    results = {
        eval_name: {
            case_name: evaluator.evaluate([case]).eval_results[0].metrics[0].value
            for case_name, case in test_cases.items()
        }
        for eval_name, evaluator in evaluators.items()
    }

    for eval_name, cases in results.items():
        print(f"\n{eval_name} Configuration:")
        print("-" * 40)
        for case_name, score in cases.items():
            print(f"{case_name:20s}: {score:.4f}")


test_levenshtein_input_types()

### Numeric Diff Eval

In [None]:
from fi.evals.metrics import NumericDiff
from fi.testcases import TestCase

def test_numeric_diff_comparison():
    """Print a table comparing NumericDiff with normalized_result on and off.

    Two evaluators are built that differ only in ``normalized_result``. Each
    test case is evaluated by both (normalized first), and one table row is
    printed per case: a description truncated to fit the first column, the
    normalized value (4 decimals) and the non-normalized value (2 decimals).
    """
    normalized_diff = NumericDiff(config={"extract_numeric": True, "normalized_result": True})
    absolute_diff = NumericDiff(config={"extract_numeric": True, "normalized_result": False})

    # (response, expected_text) pairs, in table order.
    pairs = [
        # Basic cases
        ("100", "100"),
        ("50", "100"),
        ("150", "100"),
        # Text with numbers
        ("The price is 99.5 dollars", "100"),
        ("The measurement is 1.5 meters", "The expected measurement was 1 meter"),
        # Small and large numbers
        ("0.001", "0.002"),
        ("1000000", "2000000"),
        # Zero reference
        ("5", "0"),
        # Negative numbers
        ("-10", "-8"),
    ]
    test_cases = [
        TestCase(response=response, expected_text=expected)
        for response, expected in pairs
    ]

    # Table header
    rule = "-" * 60
    print("Numeric Difference Comparison:")
    print(rule)
    print(f"{'Test Case':30s} | {'Normalized':10s} | {'Absolute':10s}")
    print(rule)

    for tc in test_cases:
        # Run both evaluators before reading either result.
        outcomes = [metric.evaluate([tc]) for metric in (normalized_diff, absolute_diff)]
        norm_value, abs_value = (
            outcome.eval_results[0].metrics[0].value for outcome in outcomes
        )

        # Descriptions longer than 28 characters are cut to 25 plus an ellipsis.
        full_desc = f"{tc.response}, {tc.expected_text}"
        case_desc = full_desc if len(full_desc) <= 28 else full_desc[:25] + "..."

        print(f"{case_desc:30s} | {norm_value:10.4f} | {abs_value:10.2f}")


test_numeric_diff_comparison()

### Semantic List Contains

In [None]:
from fi.evals.metrics import SemanticListContains
from fi.testcases import TestCase
import json

# Demo: run SemanticListContains on one response against four shapes of
# expected_text and print, for each, the inputs, the first metric value and
# the metadata of the first eval result.

# 1. Single string expected_text
single_string = TestCase(
    response="The quick brown fox jumps over the lazy dog",
    expected_text="fox"
)

# 2. JSON-encoded list of strings
json_list = TestCase(
    response="The quick brown fox jumps over the lazy dog",
    expected_text=json.dumps(["brown fox", "lazy dog"])
)

# 3. JSON-encoded list with mixed matches
partial_match = TestCase(
    response="The quick brown fox jumps over the lazy dog",
    expected_text=json.dumps(["brown fox", "lazy dog", "flying elephant"])
)

# 4. No matching phrases
no_match = TestCase(
    response="The quick brown fox jumps over the lazy dog",
    expected_text=json.dumps(["flying elephant", "dancing giraffe"])
)

# Create the evaluator with an explicit similarity threshold of 0.5
evaluator = SemanticListContains(
    config={
        "similarity_threshold": 0.5
    }
)

# Evaluate each input on its own and print its score and metadata
for test_case in (single_string, json_list, partial_match, no_match):
    result = evaluator.evaluate([test_case])
    score = result.eval_results[0].metrics[0].value

    metadata = result.eval_results[0].metadata

    print(f"Response: {test_case.response}")
    print(f"Expected Keywords: {test_case.expected_text}")
    print(score)
    print(metadata)

    print("-" * 50)

### Aggregated Metric


In [None]:
from fi.evals.metrics import BLEUScore, ROUGEScore, LevenshteinDistance, AggregatedMetric
from fi.testcases import TestCase

# Demo: combine individual metrics with AggregatedMetric and print the
# aggregated score (first metric value of the first eval result) for three
# different aggregation setups, all evaluated on the same test case.

# Create a standard test case (response identical to the expected text)
test_case = TestCase(
    response="The quick brown fox jumps over the lazy dog.",
    expected_text="The quick brown fox jumps over the lazy dog."
)

# Create individual metrics
bleu = BLEUScore()
rouge = ROUGEScore(config={"rouge_type": "rouge1"})
levenshtein = LevenshteinDistance()

# Example 1: BLEU and ROUGE with simple average
avg_metric = AggregatedMetric(config={
    "metrics": [bleu, rouge],
    "metric_names": ["bleu", "rouge1"],
    "aggregator": "average"
})

# Example 2: BLEU and ROUGE with weighted average (70% BLEU, 30% ROUGE)
weighted_metric = AggregatedMetric(config={
    "metrics": [bleu, rouge],
    "metric_names": ["bleu", "rouge1"],
    "aggregator": "weighted_average",
    "weights": [0.7, 0.3]
})

# Example 3: Combining BLEU, ROUGE and Levenshtein with average
combined_metric = AggregatedMetric(config={
    "metrics": [bleu, rouge, levenshtein],
    "metric_names": ["bleu", "rouge1", "levenshtein"],
    "aggregator": "average"
})

# One (heading, aggregate) pair per example. The leading "\n" on the second
# and third headings puts a blank line between consecutive reports.
reports = (
    ("BLEU + ROUGE with Simple Average", avg_metric),
    ("\nBLEU + ROUGE with Weighted Average (70% BLEU, 30% ROUGE)", weighted_metric),
    ("\nBLEU + ROUGE + Levenshtein with Average", combined_metric),
)

for title, aggregate in reports:
    print(title)
    result = aggregate.evaluate([test_case])
    score = result.eval_results[0].metrics[0].value

    print(f"Aggregated Score: {score:.4f}")
    print("--------------------------------")
