# Error Analysis

Analyze prediction errors to understand model weaknesses:
1. Load the test data and trained models, then generate predictions
2. Identify misclassified papers
3. Analyze false positives and false negatives
4. Compare characteristics of correct vs incorrect predictions

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from pathlib import Path
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
%matplotlib inline

# Create figures directory
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
print(f"Figures will be saved to: {figures_dir}")

## 1. Load Data and Models

In [None]:
# Trained models.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model artifacts produced by this project.
with open('../models/classification/lightgbm.pkl', 'rb') as clf_file:
    clf_model = pickle.load(clf_file)

with open('../models/regression/random_forest.pkl', 'rb') as reg_file:
    reg_model = pickle.load(reg_file)

# Test features, the two test targets, and the per-paper metadata.
X_test = pd.read_pickle('../data/features/X_test_temporal.pkl')
y_test_cls = pd.read_pickle('../data/features/y_test_cls_temporal.pkl')
y_test_reg = pd.read_pickle('../data/features/y_test_reg_temporal.pkl')
metadata_test = pd.read_pickle('../data/features/metadata_test.pkl')

# Full regression target, restricted to the test papers by selecting with
# the metadata's index. It is stored below as `actual_citations`.
# NOTE(review): assumes y_reg_raw and metadata_test share the same index
# labels -- confirm against the feature pipeline.
y_reg_raw = pd.read_pickle('../data/features/y_regression.pkl')
y_test_reg_raw = y_reg_raw[metadata_test.index]

n_papers = len(X_test)
n_high_impact = y_test_cls.sum()
high_impact_pct = y_test_cls.mean() * 100
print(f"Test set: {n_papers} papers")
print(f"High-impact: {n_high_impact} ({high_impact_pct:.1f}%)")

## 2. Generate Predictions

In [None]:
# Classifier: hard labels plus column 1 of predict_proba (stored below as
# the high-impact probability). Regressor: one prediction per test row,
# treated below as a log-scale citation count.
y_pred_cls = clf_model.predict(X_test)
y_pred_proba = clf_model.predict_proba(X_test)[:, 1]
y_pred_reg = reg_model.predict(X_test)

# One row per test paper: metadata plus actual/predicted values. Raw arrays
# (.values) are used so columns are attached by position, not by index.
results_df = metadata_test.assign(
    actual_citations=y_test_reg_raw.values,
    actual_log_citations=y_test_reg.values,
    predicted_log_citations=y_pred_reg,
    # expm1 maps the log-scale prediction back to a citation count.
    predicted_citations=np.expm1(y_pred_reg),
    actual_high_impact=y_test_cls.values,
    predicted_high_impact=y_pred_cls,
    high_impact_probability=y_pred_proba,
    # Absolute regression error, measured on the log scale.
    prediction_error=np.abs(y_test_reg.values - y_pred_reg),
)

print("Results dataframe created")
results_df.head()

## 3. Classification Errors

In [None]:
# Label every paper as Correct / False Negative / False Positive by
# comparing the actual and predicted high-impact flags.
actually_high = results_df['actual_high_impact'] == 1
actually_low = results_df['actual_high_impact'] == 0
flagged_high = results_df['predicted_high_impact'] == 1
flagged_low = results_df['predicted_high_impact'] == 0

results_df['classification_status'] = 'Correct'
results_df.loc[actually_high & flagged_low, 'classification_status'] = 'False Negative'
results_df.loc[actually_low & flagged_high, 'classification_status'] = 'False Positive'

status_counts = results_df['classification_status'].value_counts()
accuracy_pct = (results_df['classification_status'] == 'Correct').mean() * 100

print("Classification Results:")
print(status_counts)
print(f"\nAccuracy: {accuracy_pct:.2f}%")

## 4. Analyze False Positives

In [None]:
# False positives: papers predicted high-impact that were actually not.
# Sorted so the cases with the highest predicted probability come first.
false_positives = (
    results_df[results_df['classification_status'] == 'False Positive']
    .sort_values('high_impact_probability', ascending=False)
)

print(f"False Positives: {len(false_positives)}")
# Guard the empty case (mirrors the false-negative analysis): without it the
# cell prints an empty table and "nan" for every statistic.
if not false_positives.empty:
    print(f"\nTop 10 False Positives (model was most confident):")
    print(false_positives[['Title', 'Year', 'actual_citations', 'predicted_citations', 'high_impact_probability']].head(10))

    print(f"\nFalse Positive Statistics:")
    print(f"Mean actual citations: {false_positives['actual_citations'].mean():.1f}")
    print(f"Mean predicted citations: {false_positives['predicted_citations'].mean():.1f}")
    print(f"Mean confidence: {false_positives['high_impact_probability'].mean():.3f}")
else:
    print("No false positives! Model never flags a low-impact paper as high-impact.")

## 5. Analyze False Negatives

In [None]:
# False negatives: actual high-impact papers the classifier predicted as
# not high-impact, ordered from lowest to highest predicted probability.
is_false_negative = results_df['classification_status'] == 'False Negative'
false_negatives = results_df[is_false_negative].sort_values('high_impact_probability', ascending=True)

print(f"False Negatives: {len(false_negatives)}")
if len(false_negatives) == 0:
    print("No false negatives! Model catches all high-impact papers.")
else:
    fn_columns = ['Title', 'Year', 'actual_citations', 'predicted_citations', 'high_impact_probability']
    print(f"\nTop 10 False Negatives (model was least confident):")
    print(false_negatives[fn_columns].head(10))

    mean_actual = false_negatives['actual_citations'].mean()
    mean_predicted = false_negatives['predicted_citations'].mean()
    mean_confidence = false_negatives['high_impact_probability'].mean()
    print(f"\nFalse Negative Statistics:")
    print(f"Mean actual citations: {mean_actual:.1f}")
    print(f"Mean predicted citations: {mean_predicted:.1f}")
    print(f"Mean confidence: {mean_confidence:.3f}")

## 6. Regression Errors

In [None]:
# The 20 papers with the largest absolute (log-scale) regression error.
error_columns = ['Title', 'Year', 'actual_citations', 'predicted_citations', 'prediction_error']
top_errors = results_df.nlargest(20, 'prediction_error')

print("Top 20 Largest Prediction Errors:")
print(top_errors[error_columns])

# Overall error level across the whole test set.
abs_errors = results_df['prediction_error']
print(f"\nMean absolute error (log scale): {abs_errors.mean():.4f}")
print(f"Median absolute error (log scale): {abs_errors.median():.4f}")

## 7. Error Distribution by Citation Range

In [None]:
# Bucket papers by their actual citation count.
# - include_lowest=True keeps zero-citation papers in the '0-5' bucket; with
#   the default right-closed bins (0, 5] they would fall outside every bin
#   and become NaN.
# - The open-ended upper edge keeps any paper above the old 100000 cap in
#   the '1000+' bucket instead of dropping it.
citation_edges = [0, 5, 10, 25, 50, 100, 1000, np.inf]
citation_labels = ['0-5', '6-10', '11-25', '26-50', '51-100', '101-1000', '1000+']
results_df['citation_bin'] = pd.cut(
    results_df['actual_citations'],
    bins=citation_edges,
    labels=citation_labels,
    include_lowest=True,
)

# observed=False keeps empty buckets in the table (the historical default)
# and avoids the pandas FutureWarning about the changing default for
# categorical groupers.
error_by_bin = (
    results_df
    .groupby('citation_bin', observed=False)['prediction_error']
    .agg(['mean', 'median', 'count'])
)
print("Prediction Error by Citation Range:")
print(error_by_bin)

fig, ax = plt.subplots(figsize=(10, 6))
results_df.boxplot(column='prediction_error', by='citation_bin', ax=ax)
# DataFrame.boxplot(by=...) adds its own "Boxplot grouped by ..." figure
# title, which collides with the axes title set below; clear it.
fig.suptitle('')
ax.set_xlabel('Citation Range')
ax.set_ylabel('Prediction Error (log scale)')
ax.set_title('Prediction Error Distribution by Citation Range')
plt.sca(ax)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(figures_dir / 'error_by_citation_range.png', dpi=300, bbox_inches='tight')
print(f"Saved: {figures_dir / 'error_by_citation_range.png'}")
plt.show()

## 8. Overestimation vs Underestimation

In [None]:
# Signed log-scale error: predicted minus actual, so positive values mean
# the regressor predicted more citations than the paper received.
results_df['signed_error'] = y_pred_reg - y_test_reg.values

# Errors within +/-0.5 (log scale) count as accurate; beyond that the
# prediction is classed by its direction.
too_high = results_df['signed_error'] > 0.5
too_low = results_df['signed_error'] < -0.5
results_df['error_type'] = 'Accurate'
results_df.loc[too_high, 'error_type'] = 'Overestimated'
results_df.loc[too_low, 'error_type'] = 'Underestimated'

mean_signed_error = results_df['signed_error'].mean()
print("Prediction Bias:")
print(results_df['error_type'].value_counts())
print(f"\nMean signed error: {mean_signed_error:.4f}")
print(f"(Positive = overestimation, Negative = underestimation)")

# Histogram of signed errors with a reference line at zero error.
signed_error_png = figures_dir / 'signed_error_distribution.png'
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(results_df['signed_error'], bins=50, edgecolor='black', alpha=0.7)
ax.axvline(0, color='red', linestyle='--', linewidth=2, label='Perfect prediction')
ax.set_xlabel('Signed Prediction Error (log scale)')
ax.set_ylabel('Count')
ax.set_title('Distribution of Signed Prediction Errors')
ax.legend()
plt.tight_layout()
plt.savefig(signed_error_png, dpi=300, bbox_inches='tight')
print(f"Saved: {signed_error_png}")
plt.show()

## Summary

In [None]:
# Recap of the figures computed in the cells above.
banner = "=" * 60
accuracy_pct = (results_df['classification_status'] == 'Correct').mean() * 100
mean_abs_error = results_df['prediction_error'].mean()
mean_signed_error = results_df['signed_error'].mean()

# A strictly positive mean signed error is reported as overestimation;
# anything else is reported as underestimation.
bias_message = (
    f"  Model tends to OVERESTIMATE citation counts"
    if mean_signed_error > 0
    else f"  Model tends to UNDERESTIMATE citation counts"
)

print(banner)
print("ERROR ANALYSIS SUMMARY")
print(banner)
print(f"\nClassification Errors:")
print(f"  False Positives: {len(false_positives)} (predicted high-impact, but actually low)")
print(f"  False Negatives: {len(false_negatives)} (predicted low-impact, but actually high)")
print(f"  Accuracy: {accuracy_pct:.2f}%")
print(f"\nRegression Errors:")
print(f"  Mean absolute error: {mean_abs_error:.4f} (log scale)")
print(f"  Mean signed error: {mean_signed_error:.4f}")
print(bias_message)
print(f"\nKey Insight: Where does the model struggle most?")
print(error_by_bin)