# Cross-Dataset Comparison

This notebook compares the model's performance across different datasets to assess how well it generalizes beyond its original evaluation set.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "darkgrid" theme to the plots drawn later in this notebook.
sns.set_theme(style="darkgrid")
# IPython magic (notebook-only, not plain Python): render matplotlib figures
# inline in the cell output.
%matplotlib inline

## Summary

| Dataset | Total Files | Accuracy | Precision | Recall | F1 |
|---------|-------------|----------|-----------|--------|----|
| Original Eval | ? | 99.7% | - | - | - |
| In-The-Wild | ~19k+ | TBD | TBD | TBD | TBD |
| Fake-or-Real | ~69k | TBD | TBD | TBD | TBD |

**Run both evaluation notebooks (01 and 02) first** to populate the results: they write the result CSVs that the cells below read to compute the TBD values.

In [None]:
# Load results from evaluation notebooks (run them first!)
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def _metrics_row(dataset, frame, truth_col, positive_label, pred_col="pred"):
    """Build one comparison-table row of metrics (in percent) for a dataset.

    Args:
        dataset: Display name used in the "dataset" column.
        frame: DataFrame of per-file results loaded from an evaluation CSV.
        truth_col: Name of the column holding the ground-truth labels.
        positive_label: Label value treated as the positive class for
            precision, recall and F1.
        pred_col: Name of the column holding the model predictions.

    Returns:
        A dict with keys "dataset", "files", "accuracy", "precision",
        "recall" and "f1"; all scores are scaled to 0-100.

    Raises:
        KeyError: If ``truth_col`` or ``pred_col`` is missing from ``frame``.
    """
    # Fail with a message naming the dataset and column, rather than a bare
    # KeyError from deep inside the first metric call.
    missing = [col for col in (truth_col, pred_col) if col not in frame.columns]
    if missing:
        raise KeyError(f"{dataset}: results CSV is missing column(s) {missing}")

    y_true = frame[truth_col]
    y_pred = frame[pred_col]
    return {
        "dataset": dataset,
        "files": len(frame),
        "accuracy": accuracy_score(y_true, y_pred) * 100,
        "precision": precision_score(y_true, y_pred, pos_label=positive_label) * 100,
        "recall": recall_score(y_true, y_pred, pos_label=positive_label) * 100,
        "f1": f1_score(y_true, y_pred, pos_label=positive_label) * 100,
    }


results = []

# In-The-Wild results (if available). This CSV stores the ground truth in
# "label" and uses "spoof" as the positive class.
itw_path = "../data/results.csv"  # from notebook 01
if os.path.exists(itw_path):
    itw_df = pd.read_csv(itw_path)
    results.append(_metrics_row("In-The-Wild", itw_df, "label", "spoof"))

# Fake-or-Real results (if available). This CSV stores the ground truth in
# "true" and uses "fake" as the positive class.
for_path = "../data/results_fake_or_real.csv"  # from notebook 02
if os.path.exists(for_path):
    for_df = pd.read_csv(for_path)
    results.append(_metrics_row("Fake-or-Real", for_df, "true", "fake"))

# Add original eval baseline. Only the accuracy is filled in here; the other
# fields are None so they show up as missing values in the table.
results.append({
    "dataset": "Original Eval (reported)",
    "files": None,
    "accuracy": 99.7,
    "precision": None,
    "recall": None,
    "f1": None,
})

comparison_df = pd.DataFrame(results)
comparison_df

In [None]:
# Visualization comparing datasets
# Rows that have an accuracy value. The hard-coded "Original Eval" baseline
# always qualifies, so more than one row means at least one evaluated dataset
# was loaded.
data = comparison_df[comparison_df["accuracy"].notna()]
if len(data) > 1:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Accuracy comparison: green above 70%, orange above 50%, red otherwise.
    colors = ["green" if acc > 70 else "orange" if acc > 50 else "red" for acc in data["accuracy"]]
    axes[0].barh(data["dataset"], data["accuracy"], color=colors)
    axes[0].axvline(50, color="gray", linestyle="--", alpha=0.5, label="Random")
    axes[0].set_xlabel("Accuracy (%)")
    axes[0].set_title("Model Accuracy Across Datasets")
    axes[0].set_xlim(0, 100)
    # Without a legend the "Random" label on the 50% line is never displayed.
    axes[0].legend()

    # F1 Score comparison (excluding Original Eval, which has no F1 value)
    f1_data = comparison_df[comparison_df["f1"].notna()]
    if not f1_data.empty:
        axes[1].barh(f1_data["dataset"], f1_data["f1"], color="steelblue")
        axes[1].set_xlabel("F1 Score (%)")
        axes[1].set_title("F1 Score Across Datasets")
        axes[1].set_xlim(0, 100)

    plt.tight_layout()
    plt.show()
else:
    print("Run notebooks 01 and 02 first to generate comparison data!")

## Conclusions

### Key Questions Answered:

1. **Does the model generalize poorly in general, or is In-The-Wild particularly challenging?**
   - The model likely struggles on both datasets, which would point to a broad generalization problem rather than something specific to In-The-Wild (confirm against the computed results above)

2. **What characteristics make a dataset harder for the model?**
   - Different audio sources (YouTube vs TTS engines)
   - Different recording conditions and quality
   - Different deepfake generation methods than training data

3. **Is the issue with the model architecture or training data?**
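   - Most likely **training data** - the model achieves 99.7% on its original eval set but fails on new data
   - Near-perfect accuracy in-distribution combined with poor accuracy on unseen sources is a classic sign of **overfitting to the training distribution** (i.e., poor robustness to distribution shift)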

### Recommendations:
- Consider fine-tuning on diverse datasets
- Use ensemble methods with multiple models
- Evaluate on more diverse test sets before deployment