# Baseline Models for Killer Prediction (Updated)

This notebook runs and analyzes baseline models using the refactored data models.

## Key Updates:
- Uses proper data models (Episode, Character, Sentence)
- Supports both character modes (episode-isolated and cross-episode)
- Consistent with refactored analysis modules

## Models Implemented:
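1. **Frequency-based baselines**: Simple heuristics based on speaking patterns (speaking frequency and order of appearance)
2. **Traditional ML baselines**: Classic NLP approaches without deep learning (Bag-of-Words + Logistic Regression, TF-IDF + SVM, n-gram features, and combined statistical + text features)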

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Add src to path
sys.path.append(str(Path('../src').resolve()))

# Import refactored baseline models
from analysis.baseline_models import BaselineModels

# Set style
# 'seaborn-v0_8-darkgrid' only exists in matplotlib >= 3.6; earlier releases
# ship the same style sheet as 'seaborn-darkgrid'. Pick whichever name the
# installed matplotlib knows so the setup cell does not fail on an older
# environment.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither name is registered: keep matplotlib's defaults rather than abort.
    print("Warning: seaborn darkgrid style not available; using matplotlib defaults")
sns.set_palette('husl')

print("Setup complete!")

## 1. Initialize Baseline Models with Character Mode

In [None]:
# Character mode used by the single-mode sections of this notebook.
CHARACTER_MODE = 'episode-isolated'  # or 'cross-episode'
# Number of episodes listed in the sanity-check printout below.
N_SAMPLE_EPISODES = 5

# Initialize baseline models with proper data models
baselines = BaselineModels(
    data_dir=Path('../data/original'),
    character_mode=CHARACTER_MODE
)

print(f"Character mode: {CHARACTER_MODE}")
print(f"Loaded {len(baselines.episodes)} episodes")
print(f"Total unique characters: {baselines.summary_stats['unique_characters']}")
print("\nSample episodes:")

# One line per sampled episode: sentence count, character count, killer count.
for episode in baselines.episodes[:N_SAMPLE_EPISODES]:
    char_data = baselines.get_episode_characters(episode.episode_id)
    # Episodes with no entry in character_labels are reported with 0 killers.
    killers = baselines.character_labels.get(episode.episode_id, set())
    print(f"  {episode.episode_id}: {len(episode.sentences)} sentences, "
          f"{len(char_data)} characters, {len(killers)} killer(s)")

## 2. Run Individual Baseline Models

In [None]:
# Frequency baseline: calls BaselineModels.frequency_baseline and keeps the
# returned result in `freq_results`.
# NOTE(review): verbose=True presumably makes the method print its metrics
# as it runs — confirm in analysis.baseline_models.
freq_results = baselines.frequency_baseline(verbose=True)

In [None]:
# Appearance order baseline: calls BaselineModels.appearance_order_baseline
# and keeps the returned result in `appearance_results`.
# NOTE(review): presumably a heuristic on when a character first appears —
# confirm in analysis.baseline_models.
appearance_results = baselines.appearance_order_baseline(verbose=True)

In [None]:
# Bag-of-Words + Logistic Regression
# The result is kept in `bow_results`; its `feature_importance` attribute is
# read by the feature-analysis section (section 6) of this notebook.
bow_results = baselines.bow_logistic_regression(verbose=True)

In [None]:
# TF-IDF + SVM: calls BaselineModels.tfidf_svm and keeps the returned result
# in `tfidf_results`.
tfidf_results = baselines.tfidf_svm(verbose=True)

In [None]:
# N-gram features: calls BaselineModels.ngram_features_baseline and keeps the
# returned result in `ngram_results`.
ngram_results = baselines.ngram_features_baseline(verbose=True)

In [None]:
# Combined features (statistical + text): calls
# BaselineModels.combined_features_baseline and keeps the returned result in
# `combined_results`.
combined_results = baselines.combined_features_baseline(verbose=True)

## 3. Run All Baselines and Compare

In [None]:
# Run all baselines and get summary
# `results_df` is used by later cells as a DataFrame with the columns
# 'Model', 'Accuracy', 'Precision', 'Recall', 'F1' and 'CV_Std'; the plotting,
# summary and save sections all read from it.
# NOTE(review): this presumably re-runs the six models already executed one by
# one above — confirm whether run_all_baselines caches earlier results.
results_df = baselines.run_all_baselines()

## 4. Compare Both Character Modes

In [None]:
# Run the full baseline suite once per character mode and collect each
# summary table in `mode_results`, keyed by the mode name.
mode_results = dict()
data_root = Path('../data/original')

for mode in ('episode-isolated', 'cross-episode'):
    print(f"\nRunning baselines for {mode} mode...")
    # A fresh BaselineModels instance is built for every mode.
    mode_baselines = BaselineModels(data_dir=data_root, character_mode=mode)
    mode_results[mode] = mode_baselines.run_all_baselines()
    print(f"Completed {mode} mode")

In [None]:
# Create comparison visualization: one F1 bar chart per character mode.
# The grid is sized from mode_results instead of a hard-coded two panels, and
# squeeze=False keeps `axes` two-dimensional even when only one mode was run.
n_modes = max(len(mode_results), 1)
fig, axes = plt.subplots(1, n_modes, figsize=(7.5 * n_modes, 6), squeeze=False)

for ax, (mode, mode_df) in zip(axes[0], mode_results.items()):
    # Plot F1 scores
    mode_df.plot(x='Model', y='F1', kind='bar', ax=ax, color='steelblue')
    ax.set_title(f'F1 Scores - {mode} mode', fontsize=12, fontweight='bold')
    ax.set_ylabel('F1 Score')
    ax.set_xlabel('')
    ax.set_xticklabels(mode_df['Model'], rotation=45, ha='right')
    ax.set_ylim(0, 1)  # same scale on every panel so bars are comparable
    ax.grid(True, alpha=0.3)

    # Add value labels just above each bar; a NaN score has no bar to label
    # and would hand matplotlib a non-finite text position.
    for bar_pos, f1 in enumerate(mode_df['F1']):
        if pd.notna(f1):
            ax.text(bar_pos, f1 + 0.01, f'{f1:.3f}', ha='center', fontsize=8)

plt.suptitle('Baseline Performance: Episode-Isolated vs Cross-Episode', 
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Print comparison summary
print("\nBest models by mode:")
for mode, mode_df in mode_results.items():
    f1_scores = mode_df['F1']
    if f1_scores.notna().any():
        best_idx = f1_scores.idxmax()
        print(f"  {mode}: {mode_df.loc[best_idx, 'Model']} (F1={mode_df.loc[best_idx, 'F1']:.3f})")
    else:
        # idxmax cannot pick a winner when every score is missing.
        print(f"  {mode}: n/a (no F1 scores available)")

## 5. Visualize Detailed Results

In [None]:
# Focus on one mode for detailed visualization: a 2x2 grid showing accuracy,
# precision vs recall, F1, and cross-validation spread for `results_df`
# (the CHARACTER_MODE run).
df = results_df  # Use the initial results

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Accuracy comparison
ax = axes[0, 0]
df.plot(x='Model', y='Accuracy', kind='bar', ax=ax, color='steelblue')
ax.set_title('Model Accuracy Comparison', fontsize=14, fontweight='bold')
ax.set_ylabel('Accuracy')
ax.set_xlabel('')
ax.set_xticklabels(df['Model'], rotation=45, ha='right')
ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5, label='Random baseline')
ax.legend()
ax.grid(True, alpha=0.3)

# Precision/Recall comparison (grouped bars, one pair per model)
ax = axes[0, 1]
x = np.arange(len(df))
width = 0.35
ax.bar(x - width/2, df['Precision'], width, label='Precision', color='green', alpha=0.7)
ax.bar(x + width/2, df['Recall'], width, label='Recall', color='orange', alpha=0.7)
ax.set_title('Precision vs Recall', fontsize=14, fontweight='bold')
ax.set_ylabel('Score')
ax.set_xticks(x)
ax.set_xticklabels(df['Model'], rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)

# F1 Score comparison
ax = axes[1, 0]
df.plot(x='Model', y='F1', kind='bar', ax=ax, color='purple')
ax.set_title('F1 Score Comparison', fontsize=14, fontweight='bold')
ax.set_ylabel('F1 Score')
ax.set_xlabel('')
ax.set_xticklabels(df['Model'], rotation=45, ha='right')
ax.grid(True, alpha=0.3)

# Cross-validation stability (for ML models)
# Only rows with a CV_Std value are drawn. The table is named `cv_models`
# so it does not share a name with the list of model names built in the
# summary section.
ax = axes[1, 1]
ax.set_title('Cross-Validation Stability', fontsize=14, fontweight='bold')
if 'CV_Std' in df.columns:
    cv_models = df[df['CV_Std'].notna()]
else:
    cv_models = df.iloc[0:0]
if not cv_models.empty:
    x = np.arange(len(cv_models))
    ax.bar(x, cv_models['Accuracy'], yerr=cv_models['CV_Std'], 
           capsize=5, alpha=0.7, color='teal')
    ax.set_xticks(x)
    ax.set_xticklabels(cv_models['Model'], rotation=45, ha='right')
    ax.set_ylabel('Accuracy')
    ax.grid(True, alpha=0.3)
else:
    # Say why the panel is empty instead of leaving blank axes.
    ax.text(0.5, 0.5, 'No cross-validation statistics available',
            ha='center', va='center', transform=ax.transAxes)
    ax.set_xticks([])
    ax.set_yticks([])

plt.suptitle(f'Baseline Model Performance ({CHARACTER_MODE} mode)', 
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

## 6. Feature Analysis

In [None]:
# Analyze important features from BoW model
# Maximum number of words drawn; the chart is titled "Top Words", and a full
# vocabulary would be unreadable on a single figure.
TOP_N_WORDS = 20

feature_importance = bow_results.feature_importance
if feature_importance:
    print(f"Top Killer-Indicative Words ({CHARACTER_MODE} mode):")
    print("="*50)
    
    # Sort by importance (highest first) and keep only the strongest entries
    sorted_features = sorted(feature_importance.items(),
                             key=lambda item: item[1], reverse=True)[:TOP_N_WORDS]
    
    # Create visualization
    fig, ax = plt.subplots(figsize=(10, 6))
    
    words = [word for word, _ in sorted_features]
    scores = [score for _, score in sorted_features]
    
    y_pos = np.arange(len(words))
    ax.barh(y_pos, scores, color='crimson', alpha=0.7)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(words)
    # barh puts position 0 at the bottom; flip the axis so the highest-scoring
    # word is the first row of the chart.
    ax.invert_yaxis()
    ax.set_xlabel('Feature Importance Score')
    ax.set_title(f'Top Words Associated with Killers ({CHARACTER_MODE} mode)', 
                 fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')
    
    plt.tight_layout()
    plt.show()
    
    print("\nInterpretation:")
    print("• These words appear more frequently in killer dialogue")
    print("• Higher scores indicate stronger association with killer class")
    print("• This provides linguistic insights into killer speech patterns")
else:
    # Make the skipped section visible instead of producing no output.
    print("No feature importance available for the BoW model; skipping feature analysis.")

## 7. Summary and Conclusions

In [None]:
# Generate comprehensive summary: a text report for the CHARACTER_MODE run,
# built from `baselines.summary_stats` and `results_df`.
# `simple_perf` and `ml_perf` are also read by the save-results section.
print(f"BASELINE ANALYSIS SUMMARY ({CHARACTER_MODE} mode)")
print("="*60)

stats = baselines.summary_stats
print("\n1. DATA STATISTICS:")
print(f"   • Total episodes: {len(baselines.episodes)}")
print(f"   • Total unique characters: {stats['unique_characters']}")
print(f"   • Total sentences: {stats['total_sentences']}")
print(f"   • Avg sentences/episode: {stats['avg_sentences_per_episode']:.1f}")
print(f"   • Avg characters/episode: {stats['avg_characters_per_episode']:.1f}")

best_f1 = results_df['F1'].max()
print("\n2. OVERALL PERFORMANCE:")
print(f"   • Best model: {results_df.loc[results_df['F1'].idxmax(), 'Model']}")
print(f"   • Best F1 score: {best_f1:.3f}")
print(f"   • Average F1 across all models: {results_df['F1'].mean():.3f}")

print("\n3. MODEL TYPE COMPARISON:")
# Model-name lists use a *_names suffix so they do not share a name with the
# `ml_models` DataFrame in the detailed-visualization section.
simple_model_names = ['Frequency Baseline', 'Appearance Order Baseline']
ml_model_names = ['BoW + Logistic Regression', 'TF-IDF + SVM', 'N-gram Features', 'Combined Features']
simple_perf = results_df[results_df['Model'].isin(simple_model_names)]['F1'].mean()
ml_perf = results_df[results_df['Model'].isin(ml_model_names)]['F1'].mean()
improvement = ml_perf - simple_perf

# The relative gain is undefined when the heuristic average is 0 or NaN
# (NaN happens when none of the expected model names are in results_df).
# Report "n/a" instead of dividing by it.
if pd.notna(simple_perf) and simple_perf != 0:
    relative_improvement = f"{improvement / simple_perf * 100:.1f}%"
else:
    relative_improvement = "n/a"

print(f"   • Simple heuristics avg F1: {simple_perf:.3f}")
print(f"   • Traditional ML avg F1: {ml_perf:.3f}")
print(f"   • Improvement: {improvement:.3f} ({relative_improvement})")

print("\n4. KEY FINDINGS:")
if best_f1 > 0.5:
    print("   ✓ Killer prediction is significantly better than random")
if ml_perf > simple_perf:
    print("   ✓ Text features improve prediction over simple heuristics")
if CHARACTER_MODE == 'episode-isolated':
    print("   • Characters are treated independently per episode")
else:
    print("   • Characters are consolidated across episodes")

print("\n5. IMPLICATIONS FOR NEURAL APPROACH:")
print(f"   • Baseline to beat: F1={best_f1:.3f}")
print("   • Neural embeddings should capture richer semantic patterns")
print(f"   • Character mode '{CHARACTER_MODE}' provides the baseline context")

## 8. Save Results for Later Comparison

In [None]:
# Save results for the current CHARACTER_MODE run (one output directory per mode)
import json
from datetime import datetime


def _json_safe(value):
    """Recursively convert ``value`` into data ``json.dump`` writes as strict JSON.

    numpy scalars and arrays become plain Python values, sets and tuples become
    lists, non-string dict keys are stringified, and NaN becomes ``None`` so it
    is written as ``null`` rather than the non-standard ``NaN`` token.
    """
    if isinstance(value, dict):
        return {key if isinstance(key, str) else str(key): _json_safe(item)
                for key, item in value.items()}
    if isinstance(value, (list, tuple, set)):
        return [_json_safe(item) for item in value]
    if isinstance(value, np.ndarray):
        return [_json_safe(item) for item in value.tolist()]
    if isinstance(value, np.generic):
        value = value.item()
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return None
    return value


output_dir = Path(f'../experiments/baseline_results_{CHARACTER_MODE}')
output_dir.mkdir(parents=True, exist_ok=True)

# Save detailed results
# `simple_perf` and `ml_perf` come from the summary section, which must have
# been run before this cell.
summary = {
    'timestamp': datetime.now().isoformat(),
    'character_mode': CHARACTER_MODE,
    'data_stats': baselines.summary_stats,
    'best_model': results_df.loc[results_df['F1'].idxmax(), 'Model'],
    'best_f1': float(results_df['F1'].max()),
    'average_f1': float(results_df['F1'].mean()),
    'simple_heuristics_avg': float(simple_perf),
    'traditional_ml_avg': float(ml_perf),
    'all_results': results_df.to_dict('records')
}

# Sanitize first: summary_stats and the result records may hold numpy scalars
# or NaN, which json.dump either rejects or writes as invalid JSON.
with open(output_dir / 'baseline_summary.json', 'w', encoding='utf-8') as f:
    json.dump(_json_safe(summary), f, indent=2)

# Save DataFrame
results_df.to_csv(output_dir / 'baseline_scores.csv', index=False)

print(f"Results saved to {output_dir}")
print("\nFiles created:")
# Sorted so the listing is the same on every run and filesystem.
for file in sorted(output_dir.glob('*')):
    print(f"  • {file.name}")