# Error Analysis

**Purpose**: Analyze model mistakes to identify improvement opportunities

This notebook provides:
- False positive analysis
- False negative analysis
- Error patterns by feature
- Misclassified example inspection
- Threshold optimization
- Segment-specific performance

## Setup

In [None]:
import sys
# Put the parent directory first on the import path.
# NOTE(review): presumably this is what makes the project's `packages` module importable
# when the notebook is run from its own directory — confirm.
sys.path.insert(0, '../')

from packages.training import FeatureExtractor, FeatureBuilder, ModelTrainer
from packages.storage import ClientFactory, get_connection_params
# Star import: `setup_plotting` and `plot_confusion_matrix` are used later without an
# explicit import, so they are expected to come from here.
from notebook_utils import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# NOTE(review): presumably applies shared matplotlib/seaborn defaults — confirm in notebook_utils.
setup_plotting()

## Configuration

In [None]:
# Data selection: which network to analyse and over which date range.
NETWORK = 'ethereum'
START_DATE, END_DATE = '2024-01-01', '2024-02-29'
WINDOW_DAYS = 7  # forwarded to the feature extractor as `window_days`

# Evaluation setup.
TEST_SIZE = 0.2    # fraction of rows held out by train_test_split
RANDOM_STATE = 42  # seed for the split
THRESHOLD = 0.5    # scores strictly above this are labelled 1 (high risk)

# Echo the settings that matter most when reading the outputs further down.
print(
    f"Network: {NETWORK}",
    f"Analysis Period: {START_DATE} to {END_DATE}",
    f"Classification Threshold: {THRESHOLD}",
    sep="\n",
)

## Load Data and Train Model

In [None]:
# Build a client factory for the configured network.
connection_params = get_connection_params(NETWORK)
client_factory = ClientFactory(connection_params)

# Extraction arguments, gathered in one place so the call below stays short.
extraction_window = dict(
    start_date=START_DATE,
    end_date=END_DATE,
    window_days=WINDOW_DAYS,
)

# The client is only needed while extracting; the context manager scopes it to this step.
with client_factory.client_context() as client:
    extractor = FeatureExtractor(client)
    data = extractor.extract_training_data(**extraction_window)

# Turn the extracted data into a feature matrix X and a label vector y.
builder = FeatureBuilder()
X, y = builder.build_training_features(data)

# Hold out TEST_SIZE of the rows, stratified on y and seeded with RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

print(f"Data loaded: {X.shape}")

In [None]:
# Fit the alert scorer on the training split; `train` returns the fitted model
# together with a metrics mapping.
trainer = ModelTrainer(model_type='alert_scorer')
model, metrics = trainer.train(X_train, y_train, cv_folds=5)

# Score the held-out split, then binarise the scores with the configured threshold.
# NOTE(review): assumes `model.predict` returns positive-class probabilities rather
# than hard 0/1 labels — confirm against ModelTrainer.
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > THRESHOLD).astype(int)  # strict '>': a score equal to THRESHOLD maps to 0

# NOTE(review): X_test is never passed to `train`, so 'test_auc' presumably comes from
# an internal split or the CV folds, not from the hold-out set scored above — confirm.
print(f"Model trained with AUC: {metrics['test_auc']:.4f}")

## Confusion Matrix Analysis

In [None]:
# Pin the label order so the matrix is always 2x2 (rows/columns ordered [0, 1]).
# Without `labels`, scikit-learn infers the classes from the data, and the
# four-way unpack below fails if only one class appears in y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

n_samples = len(y_test)
n_actual_negatives = fp + tn
n_actual_positives = fn + tp

# Guard the ratios so an absent class reports 0.00% instead of nan plus a runtime warning.
error_pct = (fp + fn) / n_samples * 100 if n_samples > 0 else 0.0
fp_rate_pct = fp / n_actual_negatives * 100 if n_actual_negatives > 0 else 0.0
fn_rate_pct = fn / n_actual_positives * 100 if n_actual_positives > 0 else 0.0

print("\n" + "="*50)
print("CONFUSION MATRIX BREAKDOWN")
print("="*50)
print(f"True Negatives (TN):  {tn:6d} (Correctly predicted low risk)")
print(f"False Positives (FP): {fp:6d} (Incorrectly predicted high risk)")
print(f"False Negatives (FN): {fn:6d} (Incorrectly predicted low risk)")
print(f"True Positives (TP):  {tp:6d} (Correctly predicted high risk)")
print("="*50)
print(f"Total Errors: {fp + fn} ({error_pct:.2f}%)")
print(f"FP Rate: {fp_rate_pct:.2f}%")
print(f"FN Rate: {fn_rate_pct:.2f}%")
print("="*50)

plot_confusion_matrix(y_test, y_pred, labels=['Low Risk', 'High Risk'])
plt.show()

## Identify Misclassified Samples

In [None]:
# One row per test sample: ground truth, thresholded prediction and raw score,
# indexed like y_test so rows can be matched back to X_test.
error_df = pd.DataFrame(
    {'actual': y_test, 'predicted': y_pred, 'probability': y_pred_proba},
    index=y_test.index,
)

# Classify each row by outcome; anything that is neither an FP nor an FN is 'Correct'.
is_false_positive = (error_df['actual'] == 0) & (error_df['predicted'] == 1)
is_false_negative = (error_df['actual'] == 1) & (error_df['predicted'] == 0)
error_df['error_type'] = np.select(
    [is_false_positive, is_false_negative],
    ['False Positive', 'False Negative'],
    default='Correct',
)

print("\nError Distribution:")
print(error_df['error_type'].value_counts())

n_correct = (error_df['error_type'] == 'Correct').sum()
print(f"\nAccuracy: {n_correct / len(error_df) * 100:.2f}%")

## False Positive Analysis

In [None]:
# Split the actual-negative rows into false positives and true negatives,
# keeping only their index labels for later lookups.
is_fp = error_df['error_type'] == 'False Positive'
is_tn = (error_df['actual'] == 0) & (error_df['predicted'] == 0)
fp_indices = error_df.index[is_fp.to_numpy()]
tn_indices = error_df.index[is_tn.to_numpy()]

print(f"\nAnalyzing {len(fp_indices)} False Positives")
print(f"Comparing with {len(tn_indices)} True Negatives")

# Summary statistics of the scores the model assigned to its false positives.
fp_scores = error_df.loc[fp_indices, 'probability']
print(f"\nFalse Positive Score Statistics:")
print(f"  Mean: {fp_scores.mean():.4f}")
print(f"  Median: {fp_scores.median():.4f}")
print(f"  Min: {fp_scores.min():.4f}")
print(f"  Max: {fp_scores.max():.4f}")

In [None]:
tn_scores = error_df.loc[tn_indices, 'probability']

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: overlaid score histograms with the decision threshold marked.
ax_hist.hist(tn_scores, bins=30, alpha=0.6,
             label='True Negatives', edgecolor='black')
ax_hist.hist(fp_scores, bins=30, alpha=0.6,
             label='False Positives', edgecolor='black')
ax_hist.axvline(x=THRESHOLD, color='r', linestyle='--', label='Threshold')
ax_hist.set_xlabel('Predicted Probability')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('False Positive Score Distribution')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

# Right panel: the same two groups as side-by-side box plots.
ax_box.boxplot([tn_scores, fp_scores],
               labels=['True Negatives', 'False Positives'])
ax_box.axhline(y=THRESHOLD, color='r', linestyle='--', label='Threshold')
ax_box.set_ylabel('Predicted Probability')
ax_box.set_title('Score Comparison Box Plot')
ax_box.legend()
ax_box.grid(True, alpha=0.3, axis='y')

fig.tight_layout()
plt.show()

## False Negative Analysis

In [None]:
# Split the actual-positive rows into false negatives and true positives,
# keeping only their index labels for later lookups.
is_fn = error_df['error_type'] == 'False Negative'
is_tp = (error_df['actual'] == 1) & (error_df['predicted'] == 1)
fn_indices = error_df.index[is_fn.to_numpy()]
tp_indices = error_df.index[is_tp.to_numpy()]

print(f"\nAnalyzing {len(fn_indices)} False Negatives")
print(f"Comparing with {len(tp_indices)} True Positives")

# Summary statistics of the scores the model assigned to its false negatives.
fn_scores = error_df.loc[fn_indices, 'probability']
print(f"\nFalse Negative Score Statistics:")
print(f"  Mean: {fn_scores.mean():.4f}")
print(f"  Median: {fn_scores.median():.4f}")
print(f"  Min: {fn_scores.min():.4f}")
print(f"  Max: {fn_scores.max():.4f}")

In [None]:
tp_scores = error_df.loc[tp_indices, 'probability']

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: overlaid score histograms with the decision threshold marked.
ax_hist.hist(tp_scores, bins=30, alpha=0.6,
             label='True Positives', edgecolor='black')
ax_hist.hist(fn_scores, bins=30, alpha=0.6,
             label='False Negatives', edgecolor='black')
ax_hist.axvline(x=THRESHOLD, color='r', linestyle='--', label='Threshold')
ax_hist.set_xlabel('Predicted Probability')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('False Negative Score Distribution')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

# Right panel: the same two groups as side-by-side box plots.
ax_box.boxplot([tp_scores, fn_scores],
               labels=['True Positives', 'False Negatives'])
ax_box.axhline(y=THRESHOLD, color='r', linestyle='--', label='Threshold')
ax_box.set_ylabel('Predicted Probability')
ax_box.set_title('Score Comparison Box Plot')
ax_box.legend()
ax_box.grid(True, alpha=0.3, axis='y')

fig.tight_layout()
plt.show()

## Feature Patterns in Errors

In [None]:
# Panel grid for the comparison; one feature per panel.
N_PANEL_ROWS, N_PANEL_COLS = 2, 5

# NOTE: these are simply the first columns of X_test, not an importance ranking.
feature_cols = X_test.columns[:N_PANEL_ROWS * N_PANEL_COLS]

fig, axes = plt.subplots(N_PANEL_ROWS, N_PANEL_COLS, figsize=(18, 8))
axes = axes.flatten()

has_false_positives = len(fp_indices) > 0

for idx, feature in enumerate(feature_cols):
    ax = axes[idx]
    if has_false_positives:
        ax.hist(X_test.loc[tn_indices, feature], bins=20, alpha=0.5,
                label='TN', edgecolor='black')
        ax.hist(X_test.loc[fp_indices, feature], bins=20, alpha=0.5,
                label='FP', edgecolor='black')
        # Only add a legend when something labelled was drawn; calling legend()
        # on an empty panel makes matplotlib warn about missing labelled artists.
        ax.legend(fontsize=7)
    ax.set_title(f'{feature}', fontsize=9)
    ax.set_xlabel('Value', fontsize=8)
    ax.set_ylabel('Count', fontsize=8)
    ax.tick_params(labelsize=7)

# Hide panels that have no feature to show (X_test has fewer columns than panels).
for unused_ax in axes[len(feature_cols):]:
    unused_ax.set_visible(False)

plt.suptitle('Feature Distributions: True Negatives vs False Positives')
plt.tight_layout()
plt.show()

## Misclassification Examples

In [None]:
if len(fp_indices) > 0:
    print("\n=== HIGH CONFIDENCE FALSE POSITIVES ===")
    print("(Low risk alerts incorrectly predicted as high risk)\n")

    # The five false positives with the highest scores.
    high_conf_fp = error_df.loc[fp_indices].nlargest(5, 'probability')
    for idx, probability in high_conf_fp['probability'].items():
        print(f"Sample {idx}:")
        print(f"  Predicted Probability: {probability:.4f}")
        print(f"  Actual: Low Risk, Predicted: High Risk")
        print(f"  Top 5 Feature Values:")
        # "Top" means the five largest raw values in this row of X_test.
        sample_features = X_test.loc[idx].nlargest(5)
        for feat, val in sample_features.items():
            print(f"    {feat}: {val:.4f}")
        print()

In [None]:
if len(fn_indices) > 0:
    print("\n=== HIGH CONFIDENCE FALSE NEGATIVES ===")
    print("(High risk alerts incorrectly predicted as low risk)\n")

    # The five false negatives with the lowest scores.
    high_conf_fn = error_df.loc[fn_indices].nsmallest(5, 'probability')
    for idx, probability in high_conf_fn['probability'].items():
        print(f"Sample {idx}:")
        print(f"  Predicted Probability: {probability:.4f}")
        print(f"  Actual: High Risk, Predicted: Low Risk")
        print(f"  Top 5 Feature Values:")
        # "Top" means the five largest raw values in this row of X_test.
        sample_features = X_test.loc[idx].nlargest(5)
        for feat, val in sample_features.items():
            print(f"    {feat}: {val:.4f}")
        print()

## Threshold Optimization

In [None]:
# Candidate thresholds 0.10, 0.12, ..., 0.88. linspace fixes both the end point
# and the number of points explicitly, which a float-step arange does not
# guarantee because of rounding in the step arithmetic.
thresholds = np.linspace(0.10, 0.88, 40)
fp_rates = []
fn_rates = []
accuracies = []

# NOTE(review): the sweep is evaluated on the test split, so the "best" threshold
# reported below is an optimistic estimate; a separate validation split would be
# needed before adopting it.
for thresh in thresholds:
    y_pred_thresh = (y_pred_proba > thresh).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 so the four-way unpack cannot fail
    # when only one class is present.
    cm_thresh = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1])
    tn_t, fp_t, fn_t, tp_t = cm_thresh.ravel()

    # Rates fall back to 0 when the corresponding actual class is absent.
    fp_rate = fp_t / (fp_t + tn_t) if (fp_t + tn_t) > 0 else 0
    fn_rate = fn_t / (fn_t + tp_t) if (fn_t + tp_t) > 0 else 0
    accuracy = (tp_t + tn_t) / len(y_test)

    fp_rates.append(fp_rate)
    fn_rates.append(fn_rate)
    accuracies.append(accuracy)

plt.figure(figsize=(12, 6))
plt.plot(thresholds, fp_rates, label='False Positive Rate', linewidth=2)
plt.plot(thresholds, fn_rates, label='False Negative Rate', linewidth=2)
plt.plot(thresholds, accuracies, label='Accuracy', linewidth=2)
plt.axvline(x=THRESHOLD, color='r', linestyle='--', label=f'Current Threshold ({THRESHOLD})')
plt.xlabel('Threshold')
plt.ylabel('Rate')
plt.title('Error Rates vs Classification Threshold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# argmax returns the first maximum, i.e. the lowest threshold among ties.
best_acc_idx = np.argmax(accuracies)
print(f"\nBest threshold for accuracy: {thresholds[best_acc_idx]:.2f}")
print(f"  Accuracy: {accuracies[best_acc_idx]:.4f}")
print(f"  FP Rate: {fp_rates[best_acc_idx]:.4f}")
print(f"  FN Rate: {fn_rates[best_acc_idx]:.4f}")

## Error by Score Bins

In [None]:
# Use explicit, fixed-width edges over [0, 1] so that bin i always covers
# (i/10, (i+1)/10], with 0 included in the first bin. An integer `bins=10`
# would instead split the *observed* min-max score range into ten parts,
# which does not match the (i + 0.5) / 10 bin centres plotted below.
# Assumes the scores are probabilities in [0, 1]; anything outside that range
# falls in no bin (NaN) and is left out of the grouping.
N_SCORE_BINS = 10
score_bin_edges = np.linspace(0.0, 1.0, N_SCORE_BINS + 1)
error_df['score_bin'] = pd.cut(
    error_df['probability'],
    bins=score_bin_edges,
    labels=False,
    include_lowest=True,
)

# Per bin: number of misclassified rows and total number of rows.
bin_analysis = error_df.groupby('score_bin').agg(
    errors=('error_type', lambda outcomes: (outcomes != 'Correct').sum()),
    total=('actual', 'count'),
)

bin_analysis['error_rate'] = bin_analysis['errors'] / bin_analysis['total']
bin_analysis['bin_center'] = [(i + 0.5) / N_SCORE_BINS for i in bin_analysis.index]

plt.figure(figsize=(10, 6))
plt.bar(bin_analysis['bin_center'], bin_analysis['error_rate'], width=0.08, alpha=0.7)
plt.xlabel('Predicted Probability Bin')
plt.ylabel('Error Rate')
plt.title('Error Rate by Score Bin')
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

print("\nError Analysis by Score Bin:")
print(bin_analysis[['total', 'errors', 'error_rate']].round(4))

## Segment Performance Analysis

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

if len(X_test.columns) > 0:
    # `top_feature` is simply the first column of X_test; the test rows are
    # split into two segments at that column's median.
    top_feature = X_test.columns[0]
    feature_median = X_test[top_feature].median()
    feature_values = X_test[top_feature]

    segments = {
        f'{top_feature} < median': feature_values < feature_median,
        f'{top_feature} >= median': feature_values >= feature_median,
    }

    def _score_segment(seg_name, seg_mask):
        """Return one row of classification metrics for the rows selected by seg_mask."""
        y_seg = y_test[seg_mask]
        y_pred_seg = y_pred[seg_mask]
        return {
            'Segment': seg_name,
            'Size': seg_mask.sum(),
            'Accuracy': accuracy_score(y_seg, y_pred_seg),
            'Precision': precision_score(y_seg, y_pred_seg, zero_division=0),
            'Recall': recall_score(y_seg, y_pred_seg, zero_division=0),
            'F1': f1_score(y_seg, y_pred_seg, zero_division=0),
        }

    # Empty segments are skipped rather than scored.
    segment_performance = [
        _score_segment(seg_name, seg_mask)
        for seg_name, seg_mask in segments.items()
        if seg_mask.sum() > 0
    ]

    segment_df = pd.DataFrame(segment_performance)
    print("\nPerformance by Segment:")
    print(segment_df.round(4).to_string(index=False))

## Error Summary

In [None]:
# Gather the counts once, then print the report.
rule = "="*70
n_total = len(y_test)
is_correct = error_df['error_type'] == 'Correct'
n_fp = len(fp_indices)
n_fn = len(fn_indices)

print("\n" + rule)
print("ERROR ANALYSIS SUMMARY")
print(rule)
print(f"\nTotal Test Samples: {n_total}")
print(f"Correct Predictions: {is_correct.sum()}")
print(f"Total Errors: {(~is_correct).sum()}")
print(f"\nError Breakdown:")
print(f"  False Positives: {n_fp} ({n_fp/n_total*100:.2f}%)")
print(f"  False Negatives: {n_fn} ({n_fn/n_total*100:.2f}%)")
print(f"\nError Impact:")
# tn / fp / fn / tp are the confusion-matrix counts unpacked in the
# "Confusion Matrix Analysis" cell; that cell must have been run first.
print(f"  FP Rate (of actual negatives): {fp/(fp+tn)*100:.2f}%")
print(f"  FN Rate (of actual positives): {fn/(fn+tp)*100:.2f}%")
print(rule)

## Conclusions

**Key Error Patterns to Review**:

1. **False Positives**: Review common characteristics
2. **False Negatives**: Identify missed high-risk cases
3. **Threshold Impact**: Understand FP/FN trade-off
4. **Score Confidence**: Analyze errors by prediction confidence

**Improvement Opportunities**:
- Feature engineering to better distinguish error cases
- Threshold adjustment for business requirements
- Segment-specific models if performance varies significantly
- Additional data collection for challenging cases

**Next Steps**:
- Choose the classification threshold based on business priorities (the relative cost of false positives vs. false negatives)
- Consider ensemble methods to reduce specific error types
- Review misclassified samples for data quality issues
- Monitor error patterns in production