# Strategy Analysis for Outputs Folder

This notebook automatically analyzes all strategies and runs in the `outputs` folder. It aggregates and visualizes metrics (WER, CER, SIM) for each strategy, run, and sentence grouping.

- **Flexible:** Handles any number of strategies, runs, samples, and sentence groupings.
- **Metrics:** WER, CER, and SIM, read from each `evaluation_metrics.csv` and tracked separately for every row `Type` (e.g. `Raw ASR` and `Improved`).
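- **Visualizations:** Line plots with error bars, bar charts comparing strategies at a chosen sentence count, and improvement-percentage plots (all drawn with matplotlib), followed by a summary table.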


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
import re

# Set the outputs directory (one sub-folder per strategy/run is expected inside it)
outputs_dir = 'outputs'

# Discover all strategies/runs.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted to keep legends, bar order and table rows reproducible.
strategies = sorted(
    entry for entry in os.listdir(outputs_dir)
    if os.path.isdir(os.path.join(outputs_dir, entry))
)
print(f'Found strategies/runs: {strategies}')


In [None]:
# Nested accumulator: strategy/run -> sentence count -> metric name -> list of values.
# Every level is created on first access, so values can be appended without any setup.
def _new_metric_lists():
    """Innermost level: metric name -> list of collected values."""
    return defaultdict(list)


def _new_sentence_groups():
    """Middle level: sentence count -> metric lists."""
    return defaultdict(_new_metric_lists)


data = defaultdict(_new_sentence_groups)

# Parse strategy names to extract strategy and run number
def parse_strategy_name(strategy_name):
    """Split an output folder name into ``(strategy, run)``.

    A trailing ``_run<digits>`` suffix is taken as the run identifier and
    everything before it as the strategy name, for example
    ``'414_data_driven_run1' -> ('414_data_driven', 'run1')`` and
    ``'baseline_run2' -> ('baseline', 'run2')``.

    Names without such a suffix are returned unchanged together with the
    default run ``'run1'``.

    Args:
        strategy_name: Folder name to split.

    Returns:
        A ``(strategy, run)`` tuple of two strings.
    """
    # The greedy prefix anchors the split on the LAST '_run<digits>', so strategy
    # names may contain underscores themselves. Requiring digits keeps suffixes
    # such as 'runner' from being mistaken for a run identifier.
    match = re.fullmatch(r'(?P<strategy>.+)_(?P<run>run\d+)', strategy_name)
    if match:
        return match.group('strategy'), match.group('run')
    return strategy_name, 'run1'

# Collect all data by walking
#   <outputs_dir>/<strategy_run>/<sample>/<N>_sentences/evaluation_metrics.csv
for strategy_run in strategies:
    strategy_path = os.path.join(outputs_dir, strategy_run)

    # Skip anything that is not (or is no longer) a directory
    if not os.path.isdir(strategy_path):
        continue

    strategy, run = parse_strategy_name(strategy_run)
    run_key = f'{strategy}_{run}'  # key under which this run's values are stored

    # Each strategy/run has multiple samples
    samples = [s for s in os.listdir(strategy_path) if os.path.isdir(os.path.join(strategy_path, s))]

    for sample in samples:
        sample_path = os.path.join(strategy_path, sample)
        # Each sample has folders like '2_sentences', '4_sentences', etc.
        sentence_folders = [
            f for f in os.listdir(sample_path)
            if os.path.isdir(os.path.join(sample_path, f)) and '_sentences' in f
        ]

        for sent_folder in sentence_folders:
            eval_file = os.path.join(sample_path, sent_folder, 'evaluation_metrics.csv')
            if not os.path.exists(eval_file):
                continue

            try:
                df = pd.read_csv(eval_file)
                # Extract number of sentences from folder name
                num_sent = int(sent_folder.split('_')[0])

                # Look up all four columns before appending anything, so a CSV with a
                # missing column is reported as a whole instead of leaving partial rows.
                rows = zip(df['Type'], df['WER'], df['CER'], df['SIM'])
                for metric_type, wer, cer, sim in rows:
                    bucket = data[run_key][num_sent]
                    bucket[f'{metric_type}_WER'].append(wer)
                    bucket[f'{metric_type}_CER'].append(cer)
                    bucket[f'{metric_type}_SIM'].append(sim)

            except Exception as e:
                # Best effort: report the offending file and keep going with the rest
                print(f'Error reading {eval_file}: {e}')

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print(f"\nData collected for {len(data)} strategy/run combinations")
for strategy_run in data:
    sentence_counts = sorted(data[strategy_run].keys())
    print(f"{strategy_run}: sentence counts {sentence_counts}")


In [None]:
# Plot metrics vs number of sentences for each strategy/run

# Define colors for different metrics (also looked up by the bar-chart comparison below)
colors = {'Raw ASR_WER': 'red', 'Improved_WER': 'darkred',
          'Raw ASR_CER': 'blue', 'Improved_CER': 'darkblue',
          'Raw ASR_SIM': 'green', 'Improved_SIM': 'darkgreen'}

metrics = ['Raw ASR_WER', 'Improved_WER', 'Raw ASR_CER', 'Improved_CER', 'Raw ASR_SIM', 'Improved_SIM']

# Create subplots for each metric type. plt.subplots() creates the figure itself;
# a separate plt.figure() call beforehand would only leave an empty extra figure.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for i, metric in enumerate(metrics):
    ax = axes[i]

    for strategy_run in data:
        num_sents = sorted(data[strategy_run].keys())

        # Mean and spread per sentence count; NaN marks counts without values for this metric
        means = []
        stds = []
        for n in num_sents:
            values = data[strategy_run][n].get(metric, [])
            means.append(np.mean(values) if values else np.nan)
            stds.append(np.std(values) if values else np.nan)

        # Filter out NaN values for plotting
        valid_indices = ~np.isnan(np.asarray(means, dtype=float))
        if np.any(valid_indices):
            valid_sents = np.array(num_sents)[valid_indices]
            valid_means = np.array(means)[valid_indices]
            valid_stds = np.array(stds)[valid_indices]

            ax.errorbar(valid_sents, valid_means, yerr=valid_stds,
                        label=strategy_run, marker='o', linestyle='-', linewidth=2)

    ax.set_title(f'{metric} vs Number of Sentences', fontsize=12, fontweight='bold')
    ax.set_xlabel('Number of Sentences')
    ax.set_ylabel(metric)
    ax.grid(True, alpha=0.3)

    # Only draw a legend when something was plotted; an empty legend triggers a warning
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


In [None]:
sentence_count_to_compare = 6  # Change as needed

# Get all available sentence counts across all strategies
all_sentence_counts = set()
for strategy_run in data:
    all_sentence_counts.update(data[strategy_run].keys())
all_sentence_counts = sorted(all_sentence_counts)

print(f"Available sentence counts: {all_sentence_counts}")
print(f"Comparing strategies for {sentence_count_to_compare} sentences")
if sentence_count_to_compare not in all_sentence_counts:
    # Without this hint the cell would just render six empty panels
    print(f"Warning: no data for {sentence_count_to_compare} sentences; "
          f"choose one of {all_sentence_counts}")

# Create bar plots comparing strategies for the chosen sentence count
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

metrics = ['Raw ASR_WER', 'Improved_WER', 'Raw ASR_CER', 'Improved_CER', 'Raw ASR_SIM', 'Improved_SIM']

for i, metric in enumerate(metrics):
    ax = axes[i]

    strategy_names = []
    means = []
    stds = []

    for strategy_run in data:
        # Membership test first: indexing a defaultdict would create an empty entry
        if sentence_count_to_compare in data[strategy_run]:
            values = data[strategy_run][sentence_count_to_compare].get(metric, [])
            if values:
                strategy_names.append(strategy_run)
                means.append(np.mean(values))
                stds.append(np.std(values))

    # Title every panel, including empty ones, so the figure is always readable
    ax.set_title(f'{metric} for {sentence_count_to_compare} Sentences', fontsize=12, fontweight='bold')

    if not strategy_names:
        # Nothing to draw for this metric: say so instead of leaving a blank panel
        ax.text(0.5, 0.5, 'No data', transform=ax.transAxes,
                ha='center', va='center', fontsize=12, color='gray')
        ax.set_xticks([])
        ax.set_yticks([])
        continue

    bars = ax.bar(range(len(strategy_names)), means, yerr=stds,
                  capsize=5, alpha=0.7, color=colors.get(metric, 'gray'))
    ax.set_ylabel(metric)
    ax.set_xlabel('Strategy/Run')
    ax.set_xticks(range(len(strategy_names)))
    ax.set_xticklabels(strategy_names, rotation=45, ha='right')
    ax.grid(True, alpha=0.3, axis='y')

    # Add value labels on bars, lifted by 1% of the tallest bar
    label_offset = max(means) * 0.01
    for bar, mean in zip(bars, means):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
                f'{mean:.3f}', ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
# Calculate improvement percentages
# improvement_data: strategy/run -> sentence count -> '<METRIC>_improvement' -> percent
improvement_data = defaultdict(lambda: defaultdict(dict))

# Metric suffix -> whether a higher value is better.
# WER/CER: lower is better, so improvement = (raw - improved) / raw.
# SIM:     higher is better, so improvement = (improved - raw) / raw.
higher_is_better = {'WER': False, 'CER': False, 'SIM': True}

for strategy_run in data:
    for sentence_count in data[strategy_run]:
        metric_values = data[strategy_run][sentence_count]

        for metric_name, higher_better in higher_is_better.items():
            raw_values = metric_values.get(f'Raw ASR_{metric_name}', [])
            imp_values = metric_values.get(f'Improved_{metric_name}', [])
            if not (raw_values and imp_values):
                continue

            raw_mean = np.mean(raw_values)
            imp_mean = np.mean(imp_values)
            if raw_mean == 0:
                # Relative change is undefined for a zero baseline; dividing would
                # produce inf/NaN (inf is not caught by the NaN filter below).
                continue

            gain = (imp_mean - raw_mean) if higher_better else (raw_mean - imp_mean)
            improvement_data[strategy_run][sentence_count][f'{metric_name}_improvement'] = (gain / raw_mean) * 100

# Plot improvement percentages
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

improvement_metrics = ['WER_improvement', 'CER_improvement', 'SIM_improvement']
titles = ['WER Improvement (%)', 'CER Improvement (%)', 'SIM Improvement (%)']

for i, (metric, title) in enumerate(zip(improvement_metrics, titles)):
    ax = axes[i]

    for strategy_run in improvement_data:
        sentence_counts = sorted(improvement_data[strategy_run].keys())
        improvements = [improvement_data[strategy_run][sc].get(metric, np.nan) for sc in sentence_counts]

        # Filter out NaN values (sentence counts where this metric could not be computed)
        valid_indices = ~np.isnan(np.asarray(improvements, dtype=float))
        if np.any(valid_indices):
            valid_counts = np.array(sentence_counts)[valid_indices]
            valid_improvements = np.array(improvements)[valid_indices]

            ax.plot(valid_counts, valid_improvements, marker='o', linewidth=2,
                    label=strategy_run, markersize=6)

    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Number of Sentences')
    ax.set_ylabel('Improvement (%)')
    ax.grid(True, alpha=0.3)

    # Only draw a legend when something was plotted; an empty legend triggers a warning
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()

    # Zero line separates improvement (above) from regression (below)
    ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()


In [None]:
# Create summary statistics: one row per strategy/run, pooling every sentence count

# (output column, metric key) pairs for the plain averages, in output order
_average_columns = [
    ('Avg Raw WER', 'Raw ASR_WER'),
    ('Avg Improved WER', 'Improved_WER'),
    ('Avg Raw CER', 'Raw ASR_CER'),
    ('Avg Improved CER', 'Improved_CER'),
    ('Avg Raw SIM', 'Raw ASR_SIM'),
    ('Avg Improved SIM', 'Improved_SIM'),
]

# (output column, raw key, improved key, higher_is_better) for the relative changes
_improvement_columns = [
    ('WER Improvement %', 'Raw ASR_WER', 'Improved_WER', False),
    ('CER Improvement %', 'Raw ASR_CER', 'Improved_CER', False),
    ('SIM Improvement %', 'Raw ASR_SIM', 'Improved_SIM', True),
]

summary_stats = []

for strategy_run in data:
    # Concatenate the values of every sentence count, per metric key
    pooled = {metric_key: [] for _, metric_key in _average_columns}
    for sentence_count in data[strategy_run]:
        per_metric = data[strategy_run][sentence_count]
        for metric_key, bucket in pooled.items():
            bucket.extend(per_metric.get(metric_key, []))

    record = {'Strategy/Run': strategy_run}

    # Plain averages; NaN when a metric has no values at all
    for column, metric_key in _average_columns:
        pooled_values = pooled[metric_key]
        record[column] = np.mean(pooled_values) if pooled_values else np.nan

    # Relative change against the raw mean; NaN unless both sides have values
    for column, raw_key, imp_key, higher_better in _improvement_columns:
        raw_values = pooled[raw_key]
        imp_values = pooled[imp_key]
        if raw_values and imp_values:
            raw_mean = np.mean(raw_values)
            imp_mean = np.mean(imp_values)
            if higher_better:
                gain = imp_mean - raw_mean
            else:
                gain = raw_mean - imp_mean
            record[column] = (gain / raw_mean) * 100
        else:
            record[column] = np.nan

    summary_stats.append(record)

summary_df = pd.DataFrame(summary_stats)
print("Summary Statistics Across All Sentence Counts:")
print("=" * 80)
print(summary_df.round(4))
