# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import stats
import sys
sys.path.append('..')

from config import CONFIG
from src.evaluation.metrics import precision_at_k, recall_at_k, ndcg_at_k, hit_rate, rmse

# %%
# Read the evaluation inputs: test interactions, feature tables, trained models
print("Loading data and models...")

item_features = pd.read_csv('../data/processed/item_features.csv')
user_features = pd.read_csv('../data/processed/user_features.csv')
test_data = pd.read_csv('../data/processed/test.csv')

# NOTE: unpickling can execute arbitrary code, so this file must only ever be
# an artifact written by a trusted training run.
with open('../experiments/results/trained_models.pkl', 'rb') as model_file:
    models = pickle.load(model_file)

# Quick sanity summary of what was loaded.
n_test_rows = len(test_data)
loaded_model_names = list(models.keys())
print(f"Test samples: {n_test_rows}")
print(f"Models loaded: {loaded_model_names}")

# %%
# Score every loaded model on the full test set
print("\n" + "="*50)
print("EVALUATING ALL MODELS")
print("="*50)

K = 10
results = {}

for name, model in models.items():
    print(f"\nEvaluating {name}...")

    # Only the hybrid network is handed the user/item feature tables.
    extra_inputs = (user_features, item_features) if name == 'hybrid_nn' else ()
    predictions = model.predict(test_data, *extra_inputs)

    # Ranking metrics share one call signature, so build them from a table;
    # RMSE compares the raw ratings with the predictions and is added last.
    model_scores = {
        label: metric_fn(test_data, predictions, K)
        for label, metric_fn in (
            ('Precision@10', precision_at_k),
            ('Recall@10', recall_at_k),
            ('NDCG@10', ndcg_at_k),
            ('Hit Rate', hit_rate),
        )
    }
    model_scores['RMSE'] = rmse(test_data['rating'].values, predictions)
    results[name] = model_scores

# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()

print("\n" + "="*50)
print("RESULTS SUMMARY")
print("="*50)
print(results_df.round(4))

# %%
# Re-score every model on the cold-user slice of the test set
print("\n" + "="*50)
print("COLD-START USER EVALUATION")
print("="*50)

# Explicit comparisons (rather than `mask` / `~mask`) mean a row whose flag is
# missing lands in neither subset.
cold_flag = test_data['is_cold_user']
cold_test = test_data[cold_flag == True]
warm_test = test_data[cold_flag == False]

cold_results = {}
for name, model in models.items():
    # The hybrid network additionally receives the feature tables.
    predict_args = [cold_test]
    if name == 'hybrid_nn':
        predict_args += [user_features, item_features]
    cold_pred = model.predict(*predict_args)

    cold_precision = precision_at_k(cold_test, cold_pred, K)
    cold_hits = hit_rate(cold_test, cold_pred, K)
    cold_results[name] = {
        'Cold Precision@10': cold_precision,
        'Cold Hit Rate': cold_hits,
    }

# One row per model, one column per cold-start metric.
cold_df = pd.DataFrame(cold_results).transpose()
print(cold_df.round(4))

# %%
# Main hypothesis check
print("\n" + "="*50)
print("HYPOTHESIS TESTING")
print("="*50)

# H1: the hybrid network beats plain NCF on Precision@10 by more than 0.03.
# This is a fixed-threshold comparison of two point estimates, not a
# statistical significance test.
hybrid_prec, ncf_prec = (
    results[model_key]['Precision@10'] for model_key in ('hybrid_nn', 'ncf')
)
diff = hybrid_prec - ncf_prec

print(f"\nH1: Hybrid NN Precision@10 > NCF Precision@10")
print(f"Hybrid NN: {hybrid_prec:.4f}")
print(f"NCF: {ncf_prec:.4f}")
print(f"Difference: {diff:.4f}")

verdict = (
    "✓ HYPOTHESIS SUPPORTED: Difference > 0.03"
    if diff > 0.03
    else "✗ HYPOTHESIS NOT SUPPORTED: Difference <= 0.03"
)
print(verdict)

# %%
# Check all sub-hypotheses
print("\n" + "="*50)
print("SUB-HYPOTHESIS VERIFICATION")
print("="*50)

hybrid_scores = results['hybrid_nn']
ncf_ndcg = results['ncf']['NDCG@10']

# Relative NDCG gain of the hybrid model over NCF. With a zero NCF baseline the
# ratio is undefined, so treat any positive hybrid NDCG as an unbounded gain
# instead of dividing by zero.
if ncf_ndcg != 0:
    ndcg_gain = (hybrid_scores['NDCG@10'] - ncf_ndcg) / ncf_ndcg
else:
    ndcg_gain = float('inf') if hybrid_scores['NDCG@10'] > 0 else 0.0

# H1.3 is stated for cold users, so score RMSE on the cold-user subset rather
# than reusing the RMSE computed over the whole test set.
cold_hybrid_pred = models['hybrid_nn'].predict(cold_test, user_features, item_features)
cold_hybrid_rmse = rmse(cold_test['rating'].values, cold_hybrid_pred)

hypotheses = {
    'H1.1: Hit Rate >= 0.70': cold_df.loc['hybrid_nn', 'Cold Hit Rate'] >= 0.70,
    'H1.2: NDCG improvement >= 8%': ndcg_gain >= 0.08,
    'H1.3: Cold user RMSE < 0.90': cold_hybrid_rmse < 0.90,
    'H1.4: Precision@10 >= 0.35': hybrid_scores['Precision@10'] >= 0.35,
}

for h, passed in hypotheses.items():
    status = "✓ PASS" if passed else "✗ FAIL"
    print(f"{status}: {h}")

# Report against the actual number of checks so the total stays correct if
# hypotheses are added or removed.
passed_count = sum(hypotheses.values())
print(f"\nTotal: {passed_count}/{len(hypotheses)} hypotheses passed")

# %%
# Visualization: 2x2 summary figure of the evaluation results
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Materialise the model names once; a plain list is a safe categorical axis
# for the bar charts below.
model_names = list(results)

# 1. Overall performance comparison (one group of bars per metric)
ax1 = axes[0, 0]
metrics = ['Precision@10', 'Recall@10', 'NDCG@10', 'Hit Rate']
x = np.arange(len(metrics))
width = 0.15

for i, (name, vals) in enumerate(results.items()):
    ax1.bar(x + i*width, [vals[m] for m in metrics], width, label=name)

ax1.set_ylabel('Score')
ax1.set_title('Model Performance Comparison')
# Centre each tick under its group for any number of models
# (equals x + width*2 when five models are plotted).
ax1.set_xticks(x + width * (len(model_names) - 1) / 2)
ax1.set_xticklabels(metrics, rotation=45)
# Draw the target line before building the legend so its label is included.
ax1.axhline(y=0.35, color='r', linestyle='--', label='Target')
ax1.legend()

# 2. Cold-start performance
ax2 = axes[0, 1]
cold_df.plot(kind='bar', ax=ax2)
ax2.set_title('Cold-Start User Performance')
ax2.set_ylabel('Score')
ax2.axhline(y=0.70, color='r', linestyle='--')
ax2.legend(loc='upper right')

# 3. RMSE comparison (green bars are below the 0.90 target, red are not)
ax3 = axes[1, 0]
rmse_values = [results[m]['RMSE'] for m in model_names]
colors = ['green' if v < 0.90 else 'red' for v in rmse_values]
ax3.bar(model_names, rmse_values, color=colors)
ax3.set_title('RMSE Comparison')
ax3.set_ylabel('RMSE')
ax3.axhline(y=0.90, color='r', linestyle='--', label='Target')
# Without a legend the 'Target' label on the line above is never shown.
ax3.legend()

# 4. Improvement over baselines
ax4 = axes[1, 1]
baseline = results['random']['Precision@10']
if baseline != 0:
    improvements = [
        (results[m]['Precision@10'] - baseline) / baseline * 100
        for m in model_names
    ]
else:
    # Relative improvement is undefined against a zero baseline; plot NaN
    # (empty bars) rather than dividing by zero.
    improvements = [float('nan')] * len(model_names)
ax4.bar(model_names, improvements)
ax4.set_title('Improvement over Random Baseline (%)')
ax4.set_ylabel('Improvement %')

plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig('../experiments/results/evaluation_results.png', dpi=150)
plt.show()

# %%
# Persist the metric tables and the hypothesis verdicts
print("\n" + "="*50)
print("SAVING RESULTS")
print("="*50)

# In-memory bundle of everything produced above; this cell does not write it
# to disk as a whole.
final_results = dict(
    overall=results_df,
    cold_start=cold_df,
    hypotheses=hypotheses,
)

# Each metric table goes to its own CSV file.
for frame, destination in (
    (results_df, '../experiments/results/final_results.csv'),
    (cold_df, '../experiments/results/cold_start_results.csv'),
):
    frame.to_csv(destination)

# The hypothesis verdict dict is stored as a pickle.
with open('../experiments/results/hypothesis_results.pkl', 'wb') as out_file:
    pickle.dump(hypotheses, out_file)

print("✓ Results saved!")
print("\n" + "="*50)
print("EXPERIMENT COMPLETE")
print("="*50)