# Model Evaluation
Comprehensive evaluation of the Two-Tower and Deep Ranking models, individually and combined as a two-stage retrieval-and-ranking pipeline.

In [None]:
import pandas as pd
import numpy as np
import torch
import json
import pickle
from pathlib import Path
import matplotlib.pyplot as plt

# Import evaluation functions
from evaluation import (
    load_trained_models,
    evaluate_metrics_implicit,
    evaluate_hit_ratio,
    evaluate_two_stage_pipeline,
    plot_score_distributions
)

## Load Models and Data

In [None]:
# Training summary written by the training run (read as JSON).
summary = json.loads(Path('models/training_summary.json').read_text())

# Fitted feature encoder.
# NOTE(review): unpickling executes arbitrary code; only load artifacts that
# were produced by a trusted training run.
encoder = pickle.loads(Path('models/feature_encoder.pkl').read_bytes())

# Interaction log (ordered chronologically) plus investor / deal feature tables.
interactions_df = pd.read_csv(
    'data/enhanced_interactions.csv', parse_dates=['timestamp']
).sort_values('timestamp')
investor_df = pd.read_csv('data/investor_features.csv')
deal_df = pd.read_csv('data/deal_features.csv')

# Chronological hold-out: the 100 most recent interactions form the test set,
# everything before them is the training set (intended to mirror training).
split_at = max(len(interactions_df) - 100, 0)
train_interactions = interactions_df.iloc[:split_at]
test_interactions = interactions_df.iloc[split_at:]

print(f"Test set: {len(test_interactions)} interactions")

In [None]:
# Restore both models from the checkpoint paths and sizes recorded in the
# training summary.
checkpoint_args = {
    'two_tower_checkpoint_path': summary['two_tower_best_ckpt'],
    'deep_ranking_checkpoint_path': summary['deep_ranking_best_ckpt'],
    'n_investors': summary['n_investors'],
    'n_deals': summary['n_deals'],
    'feature_dims': summary['feature_dims'],
}
two_tower_model, deep_ranking_model = load_trained_models(**checkpoint_args)

## Single Model Evaluation

In [None]:
# Score the Two-Tower model on the held-out interactions.
print("Evaluating Two-Tower Model...")

# Candidate set passed to the evaluator: one integer id per row of deal_df.
candidate_deals = np.arange(len(deal_df))
two_tower_metrics = evaluate_metrics_implicit(
    two_tower_model,
    test_interactions,
    train_interactions,
    investor_df,
    deal_df,
    all_deals=candidate_deals,
    K=10,
)

print("\nTwo-Tower Metrics:")
for name, score in two_tower_metrics.items():
    print(f"{name}: {score:.4f}")

In [None]:
# Score the Deep Ranking model with the same inputs and cutoff as the
# Two-Tower evaluation so the two result sets are directly comparable.
print("\nEvaluating Deep Ranking Model...")

# Candidate set passed to the evaluator: one integer id per row of deal_df.
candidate_deals = np.arange(len(deal_df))
deep_ranking_metrics = evaluate_metrics_implicit(
    deep_ranking_model,
    test_interactions,
    train_interactions,
    investor_df,
    deal_df,
    all_deals=candidate_deals,
    K=10,
)

print("\nDeep Ranking Metrics:")
for name, score in deep_ranking_metrics.items():
    print(f"{name}: {score:.4f}")

## Two-Stage Pipeline Evaluation

In [None]:
# Evaluate the combined system: the Two-Tower model is passed as the retrieval
# stage and the Deep Ranking model as the ranking stage.
print("\nEvaluating Two-Stage Pipeline...")

pipeline_args = {
    'retrieval_model': two_tower_model,
    'ranking_model': deep_ranking_model,
    'test_interactions': test_interactions,
    'train_interactions': train_interactions,
    'investor_df': investor_df,
    'deal_df': deal_df,
    'all_deals': np.arange(len(deal_df)),
    'retrieval_k': 100,
    'final_k': 10,
}
two_stage_metrics = evaluate_two_stage_pipeline(**pipeline_args)

print("\nTwo-Stage Pipeline Metrics:")
for name, score in two_stage_metrics.items():
    print(f"{name}: {score:.4f}")

## Detailed Analysis

In [None]:
# Compare metrics across the three systems in a grouped bar chart.
model_results = {
    'Two-Tower': two_tower_metrics,
    'Deep Ranking': deep_ranking_metrics,
    'Two-Stage': two_stage_metrics,
}

# Plot only metrics reported by every system, in Two-Tower order. Indexing all
# three dicts with the Two-Tower keys would raise KeyError as soon as one
# evaluator reports a different set of metrics.
metrics_names = [
    m for m in two_tower_metrics
    if all(m in scores for scores in model_results.values())
]
x = np.arange(len(metrics_names))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))
for slot, (label, scores) in enumerate(model_results.items()):
    # Bars sit at x - width, x, x + width so each metric forms one group.
    offset = (slot - 1) * width
    ax.bar(x + offset, [scores[m] for m in metrics_names],
           width, label=label, alpha=0.8)

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics_names)
ax.legend()

plt.tight_layout()
# Save before show(): some backends clear the figure once it is displayed.
plt.savefig('models/performance_comparison.png')
plt.show()

## Score Distribution Analysis

In [None]:
# Use the first five distinct investors (in order of appearance in the test
# set) as the sample for the score-distribution plots.
test_investor_ids = test_interactions['investorId'].unique()
sample_investors = test_investor_ids[:5]

# Candidate set: one integer id per row of deal_df.
candidate_deals = np.arange(len(deal_df))

plot_score_distributions(
    two_tower_model,
    deep_ranking_model,
    sample_investors,
    investor_df,
    deal_df,
    all_deals=candidate_deals,
)

## Performance by User Segments

In [None]:
# Analyze Two-Tower performance separately for each investor type.
results_by_type = {}

# Display names for integer-coded investor types (list position == type code).
investor_types = ['Equity', 'Debt', 'Infrastructure']


def _type_label(type_key):
    """Return a printable name for an investor-type key.

    Integer codes inside the range of ``investor_types`` map to their name,
    string-valued types are shown as-is, and anything else (out-of-range or
    negative codes, floats, ...) falls back to ``"Type <key>"``.
    """
    is_code = isinstance(type_key, int) and not isinstance(type_key, bool)
    # The lower bound matters: a negative code would otherwise index from the
    # end of the list and silently print the wrong name.
    if is_code and 0 <= type_key < len(investor_types):
        return investor_types[type_key]
    if isinstance(type_key, str):
        return type_key
    return f"Type {type_key}"


# Loop-invariant candidate set: one integer id per row of deal_df.
candidate_deals = np.arange(len(deal_df))

for inv_type in investor_df['type'].unique():
    # NOTE(review): investor ids are taken from investor_df's row index, not
    # from a column - confirm the index labels are the ids used in
    # test_interactions['investorId'].
    type_investors = investor_df[investor_df['type'] == inv_type].index
    type_interactions = test_interactions[
        test_interactions['investorId'].isin(type_investors)
    ]

    # Skip types with no test interactions.
    if type_interactions.empty:
        continue

    # .unique() yields numpy scalars; store native Python keys so the label
    # lookup works and the dict can be written with json.dump.
    type_key = inv_type.item() if isinstance(inv_type, np.generic) else inv_type
    results_by_type[type_key] = evaluate_metrics_implicit(
        two_tower_model,
        type_interactions,
        train_interactions,
        investor_df,
        deal_df,
        all_deals=candidate_deals,
        K=10
    )

print("Performance by Investor Type:")
for inv_type, metrics in results_by_type.items():
    type_name = _type_label(inv_type)
    print(f"\n{type_name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

## Save Evaluation Results

In [None]:
def _to_builtin(obj):
    """Recursively replace numpy scalars/arrays with plain Python objects.

    ``json`` only accepts builtin numbers and str/int/float/bool/None dict
    keys; numpy scalars such as ``np.int64`` keys or ``np.float32`` values
    raise ``TypeError`` when dumped.
    """
    if isinstance(obj, dict):
        return {
            (key.item() if isinstance(key, np.generic) else key): _to_builtin(val)
            for key, val in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [_to_builtin(item) for item in obj]
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj


# Collect every result produced above into one JSON-serializable structure.
eval_results = _to_builtin({
    'two_tower_metrics': two_tower_metrics,
    'deep_ranking_metrics': deep_ranking_metrics,
    'two_stage_metrics': two_stage_metrics,
    'results_by_type': results_by_type,
    'test_size': len(test_interactions)
})

# Serialize before opening the file: if serialization fails, an existing
# results file is left untouched instead of being truncated.
serialized = json.dumps(eval_results, indent=2)
with open('models/evaluation_results.json', 'w') as f:
    f.write(serialized)

print("\nEvaluation complete! Results saved to models/evaluation_results.json")