# Generate Test Cases from Data Analysis

This notebook generates test cases for embedding evaluation by analyzing patterns in your dataset: content length, metadata frequency, and pairwise text similarity.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import yaml
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm

# Let text columns show up to 100 characters before pandas truncates them
pd.options.display.max_colwidth = 100

## 1. Load Configuration and Data

In [None]:
def load_config_and_data(config_path="./config/embedding_eval.yaml"):
    """Load the evaluation configuration and the dataset it points to.

    Args:
        config_path: Location of the YAML configuration file. The default is
            the path this notebook has always used, so calling the function
            with no arguments behaves as before.

    Returns:
        Tuple ``(config, df, content_col, identifier_col, metadata_col)``:
        the parsed configuration, the DataFrame read from
        ``config['dataset']['path']``, and the three values stored under
        ``config['dataset']['columns']`` (``content``, ``identifier``,
        ``metadata``).

    Raises:
        FileNotFoundError: If the config file or the CSV file does not exist.
        KeyError: If the config lacks ``dataset.path`` or one of the
            ``dataset.columns`` entries read here.
    """
    config_path = Path(config_path)
    # Decode explicitly as UTF-8 so non-ASCII characters in the config are
    # read the same way regardless of the platform's default encoding.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    dataset_cfg = config['dataset']

    # Get column names
    columns_cfg = dataset_cfg['columns']
    content_col = columns_cfg['content']
    identifier_col = columns_cfg['identifier']
    metadata_col = columns_cfg['metadata']

    # Load dataset
    data_path = Path(dataset_cfg['path'])
    df = pd.read_csv(data_path)

    print(f"Loaded {len(df)} rows with columns: {df.columns.tolist()}")
    return config, df, content_col, identifier_col, metadata_col

# Read the config and dataset once; the cells below reuse these names
(config, df, content_col, identifier_col, metadata_col) = load_config_and_data()

# Preview the first five rows
display(df.head(5))

Loaded 12340 rows with columns: ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot', 'PlotSummary']


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,PlotSummary
0,1950,8 Ball Bunny,American,Chuck Jones,Looney Tunes,animation,https://en.wikipedia.org/wiki/8_Ball_Bunny,The Brooklyn Ice Palace shuts down after the Ice Frolics pack up to go to another show somewhere...,"""Playboy"" Penguin is found by Bugs Bunny, who vows to take him home. After ten days at sea, Bugs..."
1,1950,711 Ocean Drive,American,Joseph M. Newman,"Edmond O'Brien, Joanne Dru, Dorothy Patrick",crime drama,https://en.wikipedia.org/wiki/711_Ocean_Drive,"Knowing how much telephone repairman Mal Granger (Edmond O'Brien) likes to bet on the horses, sm...",Mal Granger (Edmond O'Brien) is a telephone repairman who likes to bet on the horses. He takes c...
2,1950,Abbott and Costello in the Foreign Legion,American,Charles Lamont,"Abbott and Costello, Patricia Medina",comedy,https://en.wikipedia.org/wiki/Abbott_and_Costello_in_the_Foreign_Legion,"Bud Jones (Bud Abbott) and Lou Hotchkiss (Lou Costello) are wrestling promoters. Their star, Abd...","Bud Jones (Bud Abbott) and Lou Hotchkiss (Lou Costello) are wrestling promoters. Their star, Abd..."
3,1950,The Admiral Was a Lady,American,Albert S. Rogell,"Edmond O'Brien, Wanda Hendrix, Steve Brodie",comedy,https://en.wikipedia.org/wiki/The_Admiral_Was_a_Lady,"After the end of World War II, Jean Madison (Wanda Hendrix), a former WAVE ensign, meets the for...","Jean Madison, a former WAVE ensign, meets the former aircrew of an Army Air Corps A-20 Havoc lig..."
4,1950,All About Eve,American,Joseph L. Mankiewicz,"Bette Davis, Anne Baxter, Gary Merrill, Celeste Holm, George Sanders, Hugh Marlowe, Marilyn Monroe",drama,https://en.wikipedia.org/wiki/All_About_Eve,Margo Channing (Bette Davis) is one of the biggest stars on Broadway. But having just turned for...,Margo Channing (Bette Davis) is one of the biggest stars on Broadway. She is worried about what ...


## 2. Analyze Content Distribution

In [None]:
def analyze_content_distribution(df, content_col):
    """Summarise the character lengths found in the content column.

    Prints the ``describe()`` summary of the per-row character counts,
    followed by the three quartile thresholds, and returns those thresholds.

    Args:
        df: DataFrame holding the content.
        content_col: Name of the text column to measure.

    Returns:
        dict mapping "short", "medium" and "long" to the 25%, 50% and 75%
        rows of the ``describe()`` summary, in that order.
    """
    char_counts = df[content_col].str.len()
    length_stats = char_counts.describe()

    # Category label -> describe() row that supplies its upper bound
    quartile_for = (("short", '25%'), ("medium", '50%'), ("long", '75%'))
    length_ranges = {label: length_stats[row] for label, row in quartile_for}

    print("Content Length Statistics:")
    print(length_stats)
    print("\nLength Categories:")
    for category in length_ranges:
        print(f"{category}: ≤{length_ranges[category]:.0f} characters")

    return length_ranges

# Quartile thresholds of the content length, used later to build length-based cases
length_ranges = analyze_content_distribution(df=df, content_col=content_col)

Content Length Statistics:
count    12340.000000
mean       407.403728
std         98.357830
min        194.000000
25%        329.000000
50%        396.000000
75%        485.000000
max        697.000000
Name: PlotSummary, dtype: float64

Length Categories:
short: ≤329 characters
medium: ≤396 characters
long: ≤485 characters


## 3. Find Similar Content Pairs

In [None]:

def find_similar_pairs(df, content_col, n_pairs=5):
    """Collect example content pairs at high, medium and low TF-IDF similarity.

    Each row is compared with every later row using the cosine similarity of
    their TF-IDF vectors. For every similarity level the first ``n_pairs``
    pairs encountered (in row order) are kept, and the scan stops as soon as
    all three levels are full. A summary of the first two pairs per level is
    printed.

    Args:
        df: DataFrame holding the content.
        content_col: Name of the text column to vectorize.
        n_pairs: Maximum number of pairs to keep per similarity level.

    Returns:
        dict with keys 'high' (similarity > 0.8), 'medium' (0.4 to 0.8
        inclusive) and 'low' (similarity < 0.4). Each value is a list of
        ``(content_a, content_b, similarity)`` tuples, at most ``n_pairs``
        long; a level stays shorter when the data has too few such pairs.
    """
    print("Vectorizing content...")
    # Create TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(df[content_col])

    print("Calculating similarities...")
    # Initialize pairs dictionary
    pairs = {
        'high': [],    # similarity > 0.8
        'medium': [],  # similarity 0.4-0.8
        'low': []      # similarity < 0.4
    }

    # Calculate total iterations for progress bar
    n_docs = len(df)
    total_comparisons = (n_docs * (n_docs - 1)) // 2

    # Pull the text out once; positional lookups on a list are far cheaper
    # than calling .iloc for every candidate pair.
    contents = df[content_col].tolist()

    # Sample pairs at each similarity level
    with tqdm(total=total_comparisons, desc="Finding similar pairs") as pbar:
        # Stop at n_docs - 1: the last row has no later rows to compare with,
        # and cosine_similarity raises ValueError on an empty second argument.
        for i in range(n_docs - 1):
            # Calculate similarities between row i and every later row
            similarities = cosine_similarity(vectors[i:i+1], vectors[i+1:]).flatten()

            # Same thresholds as the comments above; the three masks are
            # mutually exclusive, so a pair lands in at most one level.
            level_masks = {
                'high': similarities > 0.8,
                'medium': (similarities >= 0.4) & (similarities <= 0.8),
                'low': similarities < 0.4,
            }
            for level, mask in level_masks.items():
                remaining = n_pairs - len(pairs[level])
                if remaining <= 0:
                    continue
                # flatnonzero returns ascending offsets, so pairs are kept in
                # the same order a row-by-row scan would meet them.
                for j in np.flatnonzero(mask)[:remaining]:
                    actual_j = i + 1 + int(j)
                    pairs[level].append((contents[i], contents[actual_j], similarities[j]))

            pbar.update(n_docs - (i + 1))

            # Early stopping if we have enough pairs
            if all(len(pairs[level]) >= n_pairs for level in pairs):
                pbar.update(total_comparisons - pbar.n)  # Update progress bar to completion
                break

    # Print sample pairs
    print("\nContent Pairs by Similarity Level:")
    for level, level_pairs in pairs.items():
        print(f"\n{level.upper()} SIMILARITY:")
        for content1, content2, sim in level_pairs[:2]:  # Show first 2 pairs
            print(f"Similarity: {sim:.2f}")
            print(f"Content 1: {content1[:100]}...")
            print(f"Content 2: {content2[:100]}...\n")

    return pairs

# Example pairs per similarity level, keyed 'high' / 'medium' / 'low'
content_pairs = find_similar_pairs(df=df, content_col=content_col)

Vectorizing content...
Calculating similarities...


Finding similar pairs:   0%|          | 0/76131630 [00:00<?, ?it/s]


Content Pairs by Similarity Level:

HIGH SIMILARITY:
Similarity: 1.00
Content 1: Captain Phillip Donlin (Lloyd Bridges) and his small troop must rush to reach Little Big Horn in ord...
Content 2: Captain Phillip Donlin (Lloyd Bridges) and his small troop must rush to reach Little Big Horn in ord...

Similarity: 1.00
Content 1: Some-time piano player Richard Kincaid (James Anderson) was brought to trial for murder 12 years ear...
Content 2: Some-time piano player Richard Kincaid (James Anderson) was brought to trial for murder 12 years ear...


MEDIUM SIMILARITY:
Similarity: 0.41
Content 1: Daffy is enjoying his "daffy" nature around the forest. Porky is "saved" by Pocahontas (Daffy yet ag...
Content 2: Porky Pig receives a grand prize from the radio station. Daffy Duck insists on living in Porky's hou...

Similarity: 0.43
Content 1: Daffy is enjoying his "daffy" nature around the forest. Porky is "saved" by Pocahontas (Daffy yet ag...
Content 2: Daffy Duck and Porky Pig work in the ho

## 4. Generate Test Cases

In [None]:
def _json_safe_scalar(value):
    """Return ``value`` in a form ``json.dump`` can serialise.

    NumPy scalars are unwrapped to their Python equivalents and NaN becomes
    ``None``; everything else is returned unchanged.
    """
    if isinstance(value, np.generic):
        value = value.item()
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return None
    return value


def _length_cases(df, content_col, identifier_col, metadata_col, length_ranges, rng, per_band=10):
    """Build up to ``per_band`` cases for each non-overlapping length band."""
    cases = []
    lengths = df[content_col].str.len()
    lower = None
    # Walk thresholds in ascending order so each category only covers rows
    # above the previous threshold; otherwise "long" would also contain the
    # shortest rows.
    for length_type, threshold in sorted(length_ranges.items(), key=lambda item: item[1]):
        in_band = lengths <= threshold
        if lower is not None:
            in_band &= lengths > lower
        lower = threshold

        subset = df[in_band]
        # With fewer rows than requested, use them all instead of sampling
        sampled_rows = subset if len(subset) < per_band else subset.sample(n=per_band, random_state=rng)

        for idx, row in sampled_rows.iterrows():
            cases.append({
                "name": f"{length_type}_length_test_{idx}",
                "description": f"Testing {length_type} length content (≤{threshold:.0f} chars)",
                "query": f"Find similar to: {row[identifier_col]}",
                "content": row[content_col],
                "metadata": _json_safe_scalar(row[metadata_col]),
                "expected_relevance": 1.0,
                "test_metadata": {
                    "category": "length_based",
                    "length_type": length_type,
                    "char_count": len(row[content_col])
                }
            })
    return cases


def _metadata_cases(df, content_col, metadata_col, rng, top_n=5):
    """Build one query/content case for each of the most frequent metadata values."""
    cases = []
    metadata_groups = df[metadata_col].value_counts()
    for metadata_value, count in metadata_groups.head(top_n).items():
        group = df[df[metadata_col] == metadata_value]
        if len(group) < 2:
            # A case needs two different rows: one as query, one as content
            continue
        # Only two rows are used, so draw exactly two; asking for more would
        # raise ValueError on groups smaller than the requested sample.
        samples = group.sample(n=2, random_state=rng)
        cases.append({
            "name": f"metadata_{metadata_value}_test",
            "description": f"Testing content with metadata: {metadata_value}",
            "query": samples.iloc[0][content_col],
            "content": samples.iloc[1][content_col],
            "metadata": _json_safe_scalar(metadata_value),
            "expected_relevance": 1.0,
            "test_metadata": {
                "category": "metadata_based",
                "metadata_value": _json_safe_scalar(metadata_value),
                "frequency": float(count / len(df))
            }
        })
    return cases


def _similarity_cases(df, content_col, metadata_col, content_pairs):
    """Build one case per pre-computed ``(content1, content2, similarity)`` pair."""
    cases = []
    for level, pairs in content_pairs.items():
        for i, (content1, content2, sim) in enumerate(pairs):
            # Metadata comes from the first row whose content equals content2;
            # None when the pair did not originate from this DataFrame.
            matches = df[df[content_col] == content2]
            metadata = _json_safe_scalar(matches.iloc[0][metadata_col]) if len(matches) else None
            cases.append({
                "name": f"{level}_similarity_test_{i}",
                "description": f"Testing {level} similarity matching",
                "query": content1,
                "content": content2,
                "metadata": metadata,
                # Plain float, consistent with measured_similarity below
                "expected_relevance": float(sim),
                "test_metadata": {
                    "category": "similarity_based",
                    "similarity_level": level,
                    "measured_similarity": float(sim)
                }
            })
    return cases


def generate_test_cases(df, content_col, identifier_col, metadata_col, config,
                        length_ranges, content_pairs, random_state=42):
    """Generate test cases based on data patterns.

    Three families of cases are produced, in this order:

    * length-based: up to 10 rows per length category, where a category
      covers the rows above the previous threshold and at or below its own;
    * metadata-based: one query/content pair for each of the 5 most frequent
      metadata values that has at least two rows;
    * similarity-based: one case per entry in ``content_pairs``.

    Args:
        df: DataFrame holding the dataset.
        content_col: Name of the text column.
        identifier_col: Name of the column used to label length-based queries.
        metadata_col: Name of the column used for metadata grouping.
        config: Parsed configuration; ``dataset.sampling.min_per_category``
            is read if present, otherwise 5 is used.
        length_ranges: Mapping of category name to maximum character count.
        content_pairs: Mapping of similarity level to a list of
            ``(content1, content2, similarity)`` tuples.
        random_state: Seed for row sampling so repeated runs give the same
            cases. Pass ``None`` for a different sample on every call.

    Returns:
        dict with "version", "description", "test_requirements",
        "similarity_distribution" and the list of generated "cases".
    """
    # Tolerate a config without a sampling section instead of raising KeyError
    sampling_cfg = (config.get('dataset') or {}).get('sampling') or {}
    # One generator shared by every draw: reproducible, yet each draw differs
    rng = np.random.RandomState(random_state)

    test_cases = {
        "version": "1.0",
        "description": "Automatically generated test cases based on data patterns",
        "test_requirements": {
            "min_queries_per_category": sampling_cfg.get('min_per_category', 5),
            "max_query_content_ratio": 0.1,
            "min_relevant_per_query": 5
        },
        "similarity_distribution": {
            "high": 0.3,
            "medium": 0.4,
            "low": 0.3
        },
        "cases": []
    }

    test_cases['cases'].extend(
        _length_cases(df, content_col, identifier_col, metadata_col, length_ranges, rng)
    )
    test_cases['cases'].extend(_metadata_cases(df, content_col, metadata_col, rng))
    test_cases['cases'].extend(_similarity_cases(df, content_col, metadata_col, content_pairs))

    return test_cases

# Combine the length thresholds and similarity pairs computed above into test cases
test_cases = generate_test_cases(
    df=df,
    content_col=content_col,
    identifier_col=identifier_col,
    metadata_col=metadata_col,
    config=config,
    length_ranges=length_ranges,
    content_pairs=content_pairs,
)

## 5. Validate and Save Test Cases

In [None]:
def save_and_validate_test_cases(test_cases, output_path):
    """Write the test cases to disk as JSON and report how they are distributed.

    Args:
        test_cases: dict with a "cases" list; every case carries a
            "test_metadata" dict with a "category" key, and similarity-based
            cases also carry "similarity_level".
        output_path: File to write the JSON document to.

    Returns:
        None. The case count, the per-category counts and the normalised
        share of each similarity level are printed / displayed.
    """
    # Persist first; the reporting below only reads from the in-memory dict
    with open(output_path, 'w') as f:
        json.dump(test_cases, f, indent=2)

    cases = test_cases['cases']

    # How many cases each category contributed
    categories = [case['test_metadata']['category'] for case in cases]
    case_types = pd.Series(categories)
    print(f"Generated {len(cases)} test cases")

    print("\nDistribution by category:")
    display(case_types.value_counts())

    # Similarity level of every similarity-based case
    levels = []
    for case in cases:
        meta = case['test_metadata']
        if meta['category'] == 'similarity_based':
            levels.append(meta['similarity_level'])
    sim_distribution = pd.Series(levels)

    print("\nSimilarity distribution:")
    display(sim_distribution.value_counts(normalize=True))

# Write the cases next to the evaluation config, then print the coverage summary
output_path = Path("./config/test_cases.json")
save_and_validate_test_cases(test_cases=test_cases, output_path=output_path)

Generated 50 test cases

Distribution by category:


length_based        30
similarity_based    15
metadata_based       5
Name: count, dtype: int64


Similarity distribution:


high      0.333333
medium    0.333333
low       0.333333
Name: proportion, dtype: float64