# Text-to-SQL Evaluation Results Analysis

This notebook loads the latest baseline and finetuned evaluation results (by file name) from `../results` and compares their metrics side by side.


In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Make the parent of the current working directory importable.
# The membership check keeps this cell idempotent: re-running it does not
# stack duplicate entries onto sys.path.
PROJECT_ROOT = str(Path().resolve().parent)
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Plot defaults for the notebook: seaborn "whitegrid" style and a
# (12, 6) default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})


In [None]:
# Load results
results_dir = Path("../results")


def load_latest_results(result_files, label):
    """Load the last-sorting evaluation results file from a list of candidates.

    The file chosen is the path that compares greatest, so picking the
    "latest" run relies on the file names embedding a sortable run
    identifier (presumably a timestamp -- TODO confirm against the script
    that writes these files).

    Args:
        result_files: Paths of candidate JSON result files (may be empty).
        label: Lower-case model label used in the status message,
            e.g. "baseline".

    Returns:
        The parsed JSON content of the chosen file, or None when
        ``result_files`` is empty.
    """
    if not result_files:
        print(f"No {label} results found")
        return None

    latest_file = max(result_files)
    # Read explicitly as UTF-8 instead of relying on the platform default
    # encoding, which can mis-decode non-ASCII JSON content.
    with open(latest_file, 'r', encoding='utf-8') as f:
        results = json.load(f)
    print(f"{label.capitalize()} results loaded")
    return results


# Find baseline and finetuned results
baseline_files = list(results_dir.glob("evaluation_*_baseline.json"))
finetuned_files = list(results_dir.glob("evaluation_*_finetuned.json"))

# Either variable is None when no matching file exists.
baseline_results = load_latest_results(baseline_files, "baseline")
finetuned_results = load_latest_results(finetuned_files, "finetuned")


In [None]:
# Compare metrics
# Runs only when both result sets were loaded; otherwise says so explicitly
# instead of silently producing no output.
if baseline_results and finetuned_results:
    # One column per model, built from each result's 'metrics' entry.
    # Assumes 'metrics' is a flat mapping of metric name -> number -- TODO confirm.
    metrics_comparison = pd.DataFrame({
        'Baseline': baseline_results['metrics'],
        'Finetuned': finetuned_results['metrics']
    })

    # Positive values mean the finetuned score is higher than the baseline.
    metrics_comparison['Improvement'] = (
        metrics_comparison['Finetuned'] - metrics_comparison['Baseline']
    )

    print(metrics_comparison)

    # Plot comparison on an explicit Axes rather than the pyplot state machine.
    fig, ax = plt.subplots()
    metrics_comparison[['Baseline', 'Finetuned']].plot(kind='bar', ax=ax, rot=45)
    ax.set_title('Model Performance Comparison')
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Metric')
    ax.legend()
    fig.tight_layout()
    plt.show()
else:
    print("Skipping metrics comparison: both baseline and finetuned results are required")
