<a href="https://colab.research.google.com/github/fikrifaizz/Real-Time-Fraud-Detection-System/blob/main/notebooks/04_threshold_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import pickle
import json
import os

In [2]:
def _read_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only point this at model
    # files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Ensemble settings stored as JSON
with open('../models/ensemble_config.json', 'r') as config_file:
    ensemble_config = json.load(config_file)

# Pickled models, one file per model
lgb_model = _read_pickle('../models/lightgbm_model.pkl')
xgb_model = _read_pickle('../models/xgboost_model.pkl')
cat_model = _read_pickle('../models/catboost_model.pkl')

In [3]:
# Read the validation split that the explanations below are computed on
val_df = pd.read_csv('../data/processed/val_set.csv')

# Identifier, timestamp and label columns are not passed to the model as features
drop_cols = ['TransactionID', 'TransactionDT', 'isFraud']
is_feature = ~val_df.columns.isin(drop_cols)
feature_cols = val_df.columns[is_feature].tolist()

X_val = val_df.loc[:, feature_cols]
y_val = val_df['isFraud']

n_rows, n_features = X_val.shape
print(f"Validation data: {n_rows:,} rows, {n_features} features")

Validation data: 118,108 rows, 443 features


In [4]:
print("Creating SHAP Explainer...")

# Build a TreeExplainer around the LightGBM model (the only model explained
# in this notebook).
explainer = shap.TreeExplainer(lgb_model)

# Explain a random sample instead of the full validation set, for speed.
# The request is capped at the number of available rows because
# DataFrame.sample raises when n exceeds the frame length (without replacement).
sample_size = min(1000, len(X_val))
X_sample = X_val.sample(n=sample_size, random_state=42)

print(f"Computing SHAP values for {sample_size} samples...")
shap_values = explainer.shap_values(X_sample)

# Reduce the result to the contributions for class 1 (fraud).
# Depending on the SHAP version, a binary classifier yields either a list
# [class0, class1], a 3-D array (rows, features, classes), or already a
# 2-D array (rows, features) that needs no reduction.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values = shap_values[:, :, 1]

print("SHAP values computed")

Creating SHAP Explainer...
Computing SHAP values for 1000 samples...
SHAP values computed




In [7]:
# Figures produced by the following cells are saved here; create it if missing.
output_dir = '../outputs'
os.makedirs(output_dir, exist_ok=True)

In [8]:
print("Global Feature Importance")

# Render both global views with identical figure settings: the default
# summary plot first, then the bar-chart variant.
summary_specs = [
    ({}, '../outputs/shap_summary.png'),
    ({'plot_type': 'bar'}, '../outputs/shap_importance.png'),
]
for extra_kwargs, out_path in summary_specs:
    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_values, X_sample, show=False, max_display=20, **extra_kwargs)
    plt.tight_layout()
    plt.savefig(out_path, dpi=100, bbox_inches='tight')
    plt.close()
print("Saved: outputs/shap_importance.png")

# Rank features by their mean absolute SHAP value over the sampled rows
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_importance = pd.DataFrame(
    {'feature': X_sample.columns, 'importance': mean_abs_shap}
).sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features:")
print(shap_importance.head(15).to_string(index=False))

Global Feature Importance
Saved: outputs/shap_importance.png

Top 15 Most Important Features:
            feature  importance
                day    0.293005
                C13    0.205416
                V70    0.190232
      card_txn_mean    0.162564
     TransactionAmt    0.159196
     card_txn_count    0.136230
P_emaildomain_count    0.126994
                 M4    0.126804
              card6    0.120098
                C14    0.109084
              card1    0.107808
                 C1    0.105843
                 C2    0.096269
                 C5    0.096191
       card_txn_min    0.092623


In [9]:
print("Explaining FRAUD Example")

# First five validation rows labelled as fraud (only the first one is explained below)
fraud_indices = val_df[val_df['isFraud'] == 1].index[:5]
fraud_sample = val_df.loc[fraud_indices, feature_cols]

# Predicted probability of the positive class for each selected row
fraud_proba = lgb_model.predict_proba(fraud_sample)[:, 1]

# SHAP values for the first fraud case, reduced to class 1 (fraud).
# Depending on the SHAP version the explainer returns a list [class0, class1],
# a 3-D array (rows, features, classes), or already a 2-D array.
fraud_shap = explainer.shap_values(fraud_sample.iloc[[0]])
if isinstance(fraud_shap, list):
    fraud_shap = fraud_shap[1]
elif np.ndim(fraud_shap) == 3:
    fraud_shap = fraud_shap[:, :, 1]

# Base value for the waterfall plot. expected_value can be a scalar, a list
# or an ndarray of per-class values; checking only for `list` would pass a
# whole ndarray through, so normalise to 1-D and pick the class-1 entry when
# more than one value is present.
expected_values = np.atleast_1d(explainer.expected_value)
fraud_base_value = float(expected_values[1] if expected_values.size > 1 else expected_values[0])

print(f"\nFraud Transaction Example:")
print(f"   Fraud Probability: {fraud_proba[0]:.2%}")
print(f"   Actual Label: FRAUD")

# Ten features with the largest absolute SHAP contribution for this row
fraud_contrib = pd.DataFrame({
    'feature': fraud_sample.columns,
    'value': fraud_sample.iloc[0].values,
    'shap_value': fraud_shap[0]
}).sort_values('shap_value', key=abs, ascending=False).head(10)

print("\n   Top Risk Factors:")
for _, row in fraud_contrib.iterrows():
    # A positive SHAP value pushes the prediction towards fraud
    direction = "INCREASE" if row['shap_value'] > 0 else "DECREASE"
    print(f"   - {row['feature']}: {row['value']:.2f} ({direction} fraud risk)")

# Waterfall plot of the same single-row explanation
plt.figure(figsize=(10, 8))
shap.waterfall_plot(
    shap.Explanation(
        values=fraud_shap[0],
        base_values=fraud_base_value,
        data=fraud_sample.iloc[0].values,
        feature_names=fraud_sample.columns.tolist()
    ),
    show=False,
    max_display=15
)
plt.tight_layout()
plt.savefig('../outputs/shap_fraud_waterfall.png', dpi=100, bbox_inches='tight')
plt.close()
print("\nSaved: ../outputs/shap_fraud_waterfall.png")

Explaining FRAUD Example

Fraud Transaction Example:
   Fraud Probability: 95.60%
   Actual Label: FRAUD

   Top Risk Factors:
   - C14: 0.00 (INCREASE fraud risk)
   - ProductCD: 0.00 (INCREASE fraud risk)
   - day: 101.00 (INCREASE fraud risk)
   - C13: 0.00 (INCREASE fraud risk)
   - id_09: 1.00 (INCREASE fraud risk)
   - card3: 185.00 (INCREASE fraud risk)
   - card6: 2.00 (INCREASE fraud risk)
   - card_txn_count: 1013.00 (INCREASE fraud risk)
   - TransactionAmt: 148.39 (INCREASE fraud risk)
   - V70: 0.00 (INCREASE fraud risk)





Saved: ../outputs/shap_fraud_waterfall.png


In [10]:
print("Explaining LEGITIMATE Example")

# First five validation rows labelled as legitimate (only the first one is explained below)
legit_indices = val_df[val_df['isFraud'] == 0].index[:5]
legit_sample = val_df.loc[legit_indices, feature_cols]

# Predicted probability of the positive (fraud) class for each selected row
legit_proba = lgb_model.predict_proba(legit_sample)[:, 1]

# SHAP values for the first legitimate case, reduced to class 1 (fraud).
# Depending on the SHAP version the explainer returns a list [class0, class1],
# a 3-D array (rows, features, classes), or already a 2-D array.
legit_shap = explainer.shap_values(legit_sample.iloc[[0]])
if isinstance(legit_shap, list):
    legit_shap = legit_shap[1]
elif np.ndim(legit_shap) == 3:
    legit_shap = legit_shap[:, :, 1]

print(f"\nLegitimate Transaction Example:")
print(f"   Fraud Probability: {legit_proba[0]:.2%}")
print(f"   Actual Label: LEGITIMATE")

# Ten features with the largest absolute SHAP contribution for this row
legit_contrib = pd.DataFrame({
    'feature': legit_sample.columns,
    'value': legit_sample.iloc[0].values,
    'shap_value': legit_shap[0]
}).sort_values('shap_value', key=abs, ascending=False).head(10)

print("\n   Top Safety Factors:")
for _, row in legit_contrib.iterrows():
    # A positive SHAP value pushes the prediction towards fraud, so the list
    # can contain risk-increasing entries as well as risk-decreasing ones.
    direction = "INCREASE" if row['shap_value'] > 0 else "DECREASE"
    print(f"   - {row['feature']}: {row['value']:.2f} ({direction} fraud risk)")

Explaining LEGITIMATE Example

Legitimate Transaction Example:
   Fraud Probability: 20.48%
   Actual Label: LEGITIMATE

   Top Safety Factors:
   - V70: 1.00 (DECREASE fraud risk)
   - day: 101.00 (INCREASE fraud risk)
   - D4: 312.00 (DECREASE fraud risk)
   - TransactionAmt: 226.00 (INCREASE fraud risk)
   - card_txn_mean: 119.87 (INCREASE fraud risk)
   - C13: 11.00 (DECREASE fraud risk)
   - card_txn_count: 2067.00 (INCREASE fraud risk)
   - card1: 4806.00 (INCREASE fraud risk)
   - C9: 3.00 (INCREASE fraud risk)
   - D15: 365.00 (DECREASE fraud risk)




In [11]:
print("Creating Dependence Plots...")

# The three features with the highest mean |SHAP| get a dependence plot each
top_features = shap_importance.head(3)['feature'].tolist()

for feature in top_features:
    # Draw onto an axes we own. Without `ax`, shap.dependence_plot opens its
    # own figure, so a figure created beforehand with plt.figure() stays empty,
    # is never closed (one leaked figure per iteration) and its figsize is
    # not applied to the saved plot.
    fig, ax = plt.subplots(figsize=(10, 6))
    shap.dependence_plot(
        feature,
        shap_values,
        X_sample,
        ax=ax,
        show=False
    )
    fig.tight_layout()
    fig.savefig(f'../outputs/shap_dependence_{feature}.png', dpi=100, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved: outputs/shap_dependence_{feature}.png")

Creating Dependence Plots...
Saved: outputs/shap_dependence_day.png
Saved: outputs/shap_dependence_C13.png
Saved: outputs/shap_dependence_V70.png


In [12]:
print("Saving SHAP Explainer...")

# Persist the fitted explainer next to the models so it can be reloaded later
with open('../models/shap_explainer.pkl', 'wb') as explainer_file:
    pickle.dump(explainer, explainer_file)
print("Saved: ../models/shap_explainer.pkl")

# Store the 20 highest-ranked features together with the full feature list
top_20_records = shap_importance.head(20).to_dict(orient='records')
top_features_config = dict(
    top_20_features=top_20_records,
    feature_names=feature_cols,
)
with open('../models/top_features.json', 'w') as features_file:
    json.dump(top_features_config, features_file, indent=2)
print("Saved: ../models/top_features.json")

Saving SHAP Explainer...
Saved: ../models/shap_explainer.pkl
Saved: ../models/top_features.json


In [13]:
print("Creating Explanation Helper...")

def explain_prediction(transaction_data, model, explainer, top_n=5, threshold=None):
    """Explain the model's prediction for the first row of ``transaction_data``.

    Args:
        transaction_data: DataFrame of model features; only row 0 is explained.
        model: Object exposing ``predict_proba`` whose column 1 is the fraud
            probability.
        explainer: Object exposing ``shap_values`` (e.g. a SHAP TreeExplainer).
        top_n: Number of features to report, ranked by absolute SHAP value.
        threshold: Probability at or above which the prediction is ``'FRAUD'``.
            When ``None`` (default) the notebook-level
            ``ensemble_config['threshold']`` is used.
            NOTE(review): that threshold comes from the ensemble config while
            ``model`` may be a single model - confirm it is appropriate.

    Returns:
        dict with ``fraud_probability``, ``prediction``, ``risk_level`` and
        ``top_risk_factors`` (one dict per feature with ``feature``, ``value``,
        ``impact`` and ``shap_value``).

    Raises:
        ValueError: If ``transaction_data`` has no rows.
    """
    if len(transaction_data) == 0:
        raise ValueError("transaction_data must contain at least one row")

    if threshold is None:
        threshold = ensemble_config['threshold']

    # Fraud probability of the first row
    fraud_proba = model.predict_proba(transaction_data)[:, 1][0]

    # SHAP values reduced to class 1 (fraud). Depending on the SHAP version
    # the explainer returns a list [class0, class1], a 3-D array
    # (rows, features, classes), or already a 2-D array.
    shap_vals = explainer.shap_values(transaction_data)
    if isinstance(shap_vals, list):
        shap_vals = shap_vals[1]
    elif np.ndim(shap_vals) == 3:
        shap_vals = shap_vals[:, :, 1]

    # Rank features of the first row by absolute contribution
    feature_contrib = pd.DataFrame({
        'feature': transaction_data.columns,
        'value': transaction_data.iloc[0].values,
        'shap_value': shap_vals[0]
    }).sort_values('shap_value', key=abs, ascending=False).head(top_n)

    # Convert to plain Python types so the result is JSON-serialisable
    risk_factors = [
        {
            'feature': row['feature'],
            'value': float(row['value']),
            # A positive SHAP value pushes the prediction towards fraud
            'impact': 'increase' if row['shap_value'] > 0 else 'decrease',
            'shap_value': float(row['shap_value'])
        }
        for _, row in feature_contrib.iterrows()
    ]

    return {
        'fraud_probability': float(fraud_proba),
        'prediction': 'FRAUD' if fraud_proba >= threshold else 'LEGITIMATE',
        'risk_level': 'HIGH' if fraud_proba >= 0.7 else 'MEDIUM' if fraud_proba >= 0.3 else 'LOW',
        'top_risk_factors': risk_factors
    }

# Smoke-test the helper on the first validation row
print("\nTesting Explanation Function...")
test_transaction = X_val.iloc[[0]]
explanation = explain_prediction(test_transaction, lgb_model, explainer)

# Headline fields first, then one line per reported feature
summary_lines = [
    f"   Prediction: {explanation['prediction']}",
    f"   Probability: {explanation['fraud_probability']:.2%}",
    f"   Risk Level: {explanation['risk_level']}",
]
print("\nExample Explanation:")
for line in summary_lines:
    print(line)

print("\n   Top Risk Factors:")
for factor in explanation['top_risk_factors']:
    name, value, impact = factor['feature'], factor['value'], factor['impact']
    print(f"   - {name}: {value:.2f} ({impact})")

Creating Explanation Helper...

Testing Explanation Function...

Example Explanation:
   Prediction: LEGITIMATE
   Probability: 20.48%
   Risk Level: LOW

   Top Risk Factors:
   - V70: 1.00 (decrease)
   - day: 101.00 (increase)
   - D4: 312.00 (decrease)
   - TransactionAmt: 226.00 (increase)
   - card_txn_mean: 119.87 (increase)




In [15]:
# Create the package directory first so the write below does not fail merely
# because '../src/utils' is missing.
os.makedirs('../src/utils', exist_ok=True)

# Export the explanation helper as a standalone module. Unlike the notebook
# version it cannot read the notebook's ensemble_config, so the decision
# threshold is a parameter (default 0.5, the previously hard-coded value);
# callers should pass the tuned threshold to match the notebook's predictions.
with open('../src/utils/explainer.py', 'w') as f:
    f.write('''"""
SHAP Explanation Helper
"""
import numpy as np
import pandas as pd

def explain_prediction(transaction_data, model, explainer, top_n=5, threshold=0.5):
    """
    Explain a single transaction prediction

    Args:
        transaction_data: DataFrame of model features; only row 0 is explained.
        model: Object exposing predict_proba whose column 1 is the fraud probability.
        explainer: Object exposing shap_values (e.g. a SHAP TreeExplainer).
        top_n: Number of features to report, ranked by absolute SHAP value.
        threshold: Probability at or above which the prediction is 'FRAUD'.
            Pass the tuned threshold from the ensemble config to reproduce
            the notebook's decisions.

    Returns:
        dict with fraud_probability, prediction, risk_level and top_risk_factors.
    """
    # Predict
    fraud_proba = model.predict_proba(transaction_data)[:, 1][0]
    
    # Get SHAP values for class 1 (fraud). Depending on the SHAP version the
    # explainer returns a list [class0, class1], a 3-D array
    # (rows, features, classes), or already a 2-D array.
    shap_vals = explainer.shap_values(transaction_data)
    if isinstance(shap_vals, list):
        shap_vals = shap_vals[1]
    elif np.ndim(shap_vals) == 3:
        shap_vals = shap_vals[:, :, 1]
    
    # Get top contributing features
    feature_contrib = pd.DataFrame({
        'feature': transaction_data.columns,
        'value': transaction_data.iloc[0].values,
        'shap_value': shap_vals[0]
    }).sort_values('shap_value', key=abs, ascending=False).head(top_n)
    
    # Format explanation
    risk_factors = []
    for _, row in feature_contrib.iterrows():
        risk_factors.append({
            'feature': row['feature'],
            'value': float(row['value']),
            'impact': 'increase' if row['shap_value'] > 0 else 'decrease',
            'shap_value': float(row['shap_value'])
        })
    
    return {
        'fraud_probability': float(fraud_proba),
        'prediction': 'FRAUD' if fraud_proba >= threshold else 'LEGITIMATE',
        'risk_level': 'HIGH' if fraud_proba >= 0.7 else 'MEDIUM' if fraud_proba >= 0.3 else 'LOW',
        'top_risk_factors': risk_factors
    }
''')