In [1]:
!pip install pandas numpy scikit-learn xgboost shap imbalanced-learn matplotlib seaborn joblib



In [2]:
# === Import libraries ===

In [3]:
import sys, pandas, numpy, sklearn, xgboost, shap, imblearn, matplotlib, seaborn, joblib
# Report the interpreter and library versions in use for this run.
print("Python:", sys.version)
for label, module in (
    ("pandas", pandas),
    ("numpy", numpy),
    ("scikit-learn", sklearn),
    ("xgboost", xgboost),
    ("shap", shap),
    ("imbalanced-learn", imblearn),
):
    print(f"{label}:", module.__version__)

ImportError: cannot import name 'parse_version' from 'sklearn.utils' (/usr/local/lib/python3.11/dist-packages/sklearn/utils/__init__.py)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [None]:
from google.colab import drive
# Mount Google Drive; the dataset and artifact paths used below live under it.
drive_root = '/content/drive'
drive.mount(drive_root)

In [None]:
# === Load dataset ===

In [None]:
# Read the transactions CSV from the mounted Drive folder into a DataFrame.
dataset_path = '/content/drive/MyDrive/Python Project/creditcard_2023.csv'
df = pd.read_csv(dataset_path)

In [None]:
# === Drop ID column ===

In [None]:
# Remove the 'id' column when the file has one; otherwise leave df untouched.
has_id_column = 'id' in df.columns
if has_id_column:
    df = df.drop(columns=['id'])

In [None]:
# === EDA (SAMPLE ONLY 5000 ROWS) ===

In [None]:
# Work on a fixed 5000-row sample so the EDA plots render quickly and reproducibly.
eda_df = df.sample(5000, random_state=42)

print("Visualizing class imbalance and transaction distribution to understand data characteristics.")

# Bar chart of how many sampled rows fall in each class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Class', hue='Class', data=eda_df, palette="viridis", legend=False, ax=ax)
ax.set_title("Class Distribution (Sample 5000)")
plt.show()

# Histogram (with KDE overlay) of the transaction amounts in the sample.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(eda_df['Amount'], bins=50, kde=True, color='blue', ax=ax)
ax.set_title("Transaction Amount Distribution (Sample 5000)")
plt.show()

In [None]:
# === Correlation Heatmap ===

In [None]:
# Correlation matrix

In [None]:
# Pairwise correlations of the numeric columns in the EDA sample, drawn as a heatmap.
sample_corr = eda_df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(sample_corr, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Correlation Heatmap (Sample 5000)")
plt.show()

In [None]:
# 1. Top Correlated Features with Fraud (Correlation of all features with the target)

In [None]:
# Top Correlated Features with Fraud
# Correlation of every numeric column with the target, sorted from most positive to
# most negative. The 'Class' entry (correlation 1.0 with itself) stays in this Series
# because a later cell drops it by name.
corr_with_class = df.corr(numeric_only=True)['Class'].sort_values(ascending=False)

# Display top 5 positively and negatively correlated features in a single table.
# The target is excluded here so the table lists features only.
feature_corr = corr_with_class.drop('Class')
top_corr = pd.concat([feature_corr.head(5), feature_corr.tail(5)])
display(top_corr.to_frame(name='Correlation with Class'))

In [None]:
# Ten features most positively correlated with the target. 'Class' itself is removed
# first: it correlates 1.0 with itself and would otherwise take the top bar.
top_positive = corr_with_class.drop('Class', errors='ignore').head(10)

plt.figure(figsize=(8,5))
sns.barplot(
    x=top_positive.values,
    y=top_positive.index,
    hue=top_positive.index,  # assign y to hue so each bar gets its own palette color
    dodge=False,
    palette="magma",
    legend=False
)
plt.title("Top 10 Features Positively Correlated with Fraud")
plt.xlabel("Correlation with Class")
plt.show()

In [None]:
# 2. Distribution Differences between Fraud / Non-Fraud (Select top 5 correlated features for visualization)

In [None]:
# First six entries of the sorted correlation ranking, with the target label removed.
leading_entries = corr_with_class.iloc[:6]
top_features = leading_entries.index.drop('Class')

In [None]:
# Split the rows by class once, then overlay the two densities for each selected feature.
non_fraud_rows = df[df['Class'] == 0]
fraud_rows = df[df['Class'] == 1]

for feat in top_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.kdeplot(non_fraud_rows[feat], label='Non-Fraud', fill=True, ax=ax)
    sns.kdeplot(fraud_rows[feat], label='Fraud', fill=True, ax=ax)
    ax.set_title(f"Distribution of {feat} for Fraud vs Non-Fraud")
    ax.set_xlabel(feat)
    ax.set_ylabel("Density")
    ax.legend()
    plt.show()

In [None]:
# 3. Feature Importance Beyond Correlation (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Target vector and feature matrix (every column except the target).
y = df['Class']
X = df.drop(columns=['Class'])

In [None]:
# Fit a seeded 100-tree forest on all rows; it is used below only to rank features.
rf_settings = dict(n_estimators=100, random_state=42, n_jobs=-1)
rf = RandomForestClassifier(**rf_settings)
rf.fit(X, y)

In [None]:
# Label each importance score with its column name and rank from highest to lowest.
raw_importances = pd.Series(data=rf.feature_importances_, index=X.columns)
importances = raw_importances.sort_values(ascending=False)

In [None]:
# Horizontal bars for the 15 highest-ranked importances.
top_importances = importances.head(15)

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x=top_importances.values,
    y=top_importances.index,
    hue=top_importances.index,
    dodge=False,
    palette="tab20",
    legend=False,
    ax=ax,
)
ax.set_title("Top 15 Feature Importances (Random Forest)")
ax.set_xlabel("Importance")

# Write each bar's share as a percentage just to the right of the bar end.
for i, v in enumerate(top_importances):
    ax.text(v + 0.001, i, f"{v:.2%}", color='black', va='center')

plt.show()

In [None]:
# === Feature Scaling & SMOTE === (applied to the full dataset; no 50k-sample limit is enforced in the code below)

In [None]:
# Rebuild the feature matrix and target vector for the modelling stage.
X, y = df.drop(columns=['Class']), df['Class']

In [None]:
# Learn per-column scaling statistics from X, then apply them to X.
# NOTE(review): the scaler is fit on every row before the train/test split further
# down, so test rows contribute to the scaling statistics — confirm this is acceptable.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Apply SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
from collections import Counter
# Count rows per class and pick out the least and most frequent labels.
cnt = Counter(y)
minority = min(cnt, key=cnt.get)
majority = max(cnt, key=cnt.get)

# Oversample with SMOTE only when the two counts differ; otherwise pass the data through.
# NOTE(review): resampling happens before the train/test split further down, so any
# synthetic rows could end up in the test set — confirm this is intended.
needs_resampling = cnt[minority] != cnt[majority]
if needs_resampling:
    smote = SMOTE(random_state=42, sampling_strategy=1.0)
    X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
else:
    print("Classes already balanced, skipping SMOTE.")
    X_resampled, y_resampled = X_scaled, y

In [None]:
# Show the share of each class after the balancing step.
class_shares = y_resampled.value_counts(normalize=True)
print(class_shares)

In [None]:
# === Train-Test Split ===

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the label so train
# and test keep the same class ratio, matching the stratified KNN subsample drawn
# from the training data in the next cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_resampled,
)

In [None]:
# Keep a stratified 10% of the training rows for KNN; the discarded 90% goes to `_`.
knn_split_options = dict(test_size=0.9, random_state=42, stratify=y_train)
X_train_knn, _, y_train_knn, _ = train_test_split(X_train, y_train, **knn_split_options)

In [None]:
# === Model Training (Single Pass) ===

In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
# The tree-based models are seeded (random_state=42, the seed used throughout this
# notebook) so repeated runs produce the same fitted models and metrics.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, min_samples_split=20, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=20, n_jobs=-1, max_depth=10, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', n_estimators=50, tree_method='hist', random_state=42),
    "KNN": KNeighborsClassifier(n_jobs=-1, n_neighbors=5)
}

In [None]:
# List the display names of the models about to be trained.
print("Models to train:", [*models])

In [None]:
# Fitted estimators, filled in by the training loop and keyed by display name.
trained_models = dict()

In [None]:
# One metrics record per model, appended by the training loop.
results_list = list()

In [None]:
# === Train models & store metrics ===

In [None]:
import time

In [None]:
# Fit every candidate, predict the test set, and record its metrics.
for name, model in models.items():
    print(f"\n🔹 Training {name}...")
    start = time.time()

    # KNN is fit on the 10% subset; every other model sees the full training split.
    use_subset = name == "KNN"
    fit_X = X_train_knn if use_subset else X_train
    fit_y = y_train_knn if use_subset else y_train
    model.fit(fit_X, fit_y)

    # The elapsed time reported below covers both fitting and test-set prediction.
    y_pred = model.predict(X_test)
    trained_models[name] = model
    end = time.time()

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Save results
    record = {"Model": name, "Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1}
    results_list.append(record)
    print(f"✅ {name} done in {end - start:.2f} sec | F1 = {f1:.4f}")

In [None]:
# results dataframe

In [None]:
# Leaderboard of the recorded metrics: highest F1 first, rows renumbered from 0.
results_df_models = pd.DataFrame(results_list).sort_values(
    by="F1", ascending=False, ignore_index=True
)

In [None]:
# Render the leaderboard with a colour gradient over its columns.
print("\n=== Model Comparison (sorted by F1) ===")
leaderboard_style = results_df_models.style.background_gradient(cmap='viridis')
display(leaderboard_style)

In [None]:
# === Confusion Matrix (Random Forest) ===

In [None]:
# Confusion matrix of the trained Random Forest on the test split.
# `rf` is rebound here to the model from the training loop.
rf = trained_models["Random Forest"]
rf_test_predictions = rf.predict(X_test)
cm = confusion_matrix(y_test, rf_test_predictions)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Random Forest Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# === SHAP Explainability (Sample: only 200 rows) ===

In [None]:
# SHAP values for the trained XGBoost model on the first 200 test rows.
plt.title("SHAP Feature Impact on XGBoost Predictions (Top 200 samples)")
xgb = trained_models["XGBoost"]
explainer = shap.TreeExplainer(xgb)

shap_rows = X_test[:200]
shap_columns = df.drop(columns=['Class']).columns
shap_values = explainer.shap_values(shap_rows)
shap.summary_plot(shap_values, shap_rows, feature_names=shap_columns)

In [None]:
# === Hyperparameter Tuning (Tuning key parameters to improve F1 while keeping runtime reasonable - subset used for speed)===

In [None]:
# 1. For KNN

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
# Search space for KNN: three neighbourhood sizes crossed with two weighting schemes.
print("\nTuning KNN with small grid (faster)...")
param_grid = dict(
    n_neighbors=[3, 5, 7],
    weights=["uniform", "distance"],
)

In [None]:
# Seeded, shuffled 3-fold stratified splitter used by the KNN grid search.
cv = StratifiedKFold(shuffle=True, n_splits=3, random_state=42)

In [None]:
# Grid search over param_grid for KNN, scored by F1 on the folds defined by `cv`.
knn_search_options = dict(scoring="f1", cv=cv, n_jobs=-1, verbose=1)
grid = GridSearchCV(KNeighborsClassifier(), param_grid, **knn_search_options)

In [None]:
# Run the search on the first 10000 training rows and keep the refit best estimator.
tuning_X = X_train[:10000]
tuning_y = y_train[:10000]
grid.fit(tuning_X, tuning_y)
best_model = grid.best_estimator_

print("\nBest Params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)

In [None]:
# Predict the test split with best_model (the tuned KNN from the grid-search cell
# above, when cells are run top to bottom).
y_pred = best_model.predict(X_test)

In [None]:
# Per-class precision/recall/F1 of the y_pred computed in the previous cell.
print("\n=== Tuned KNN on Test Set ===")
tuned_knn_report = classification_report(y_test, y_pred, digits=4)
print(tuned_knn_report)

In [None]:
# 2. For Random Forest

In [None]:
# Search space for the Random Forest: 2 x 2 x 2 combinations.
rf_param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[5, 10],
    min_samples_split=[10, 20],
)

# Seeded base estimator and seeded 3-fold stratified splitter, scored by F1.
rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
rf_cv = GridSearchCV(
    estimator=rf_base,
    param_grid=rf_param_grid,
    scoring='f1',
    cv=rf_folds,
    n_jobs=-1,
    verbose=1,
)

# Search on the first 10000 training rows only.
rf_cv.fit(X_train[:10000], y_train[:10000])

print("Random Forest Best Params:", rf_cv.best_params_)
print("Best CV F1:", rf_cv.best_score_)

# Keep the refit best estimator for later use.
rf_best = rf_cv.best_estimator_

In [None]:
# 3. For XGBoost

In [None]:
# Search space for XGBoost: 2 x 2 x 2 x 2 combinations.
xgb_param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5],
    learning_rate=[0.1, 0.3],
    subsample=[0.8, 1.0],
)

# Seeded base estimator and seeded 3-fold stratified splitter, scored by F1.
xgb_base = XGBClassifier(eval_metric='logloss', tree_method='hist', random_state=42, n_jobs=-1)
xgb_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
xgb_cv = GridSearchCV(
    estimator=xgb_base,
    param_grid=xgb_param_grid,
    scoring='f1',
    cv=xgb_folds,
    n_jobs=-1,
    verbose=1,
)

# Search on the first 10000 training rows only.
xgb_cv.fit(X_train[:10000], y_train[:10000])

print("XGBoost Best Params:", xgb_cv.best_params_)
print("Best CV F1:", xgb_cv.best_score_)

# Keep the refit best estimator; later cells use it for the threshold plot and rates.
xgb_best = xgb_cv.best_estimator_

In [None]:
# List to hold tuning results

In [None]:
# One summary record per grid search, in the order KNN, Random Forest, XGBoost.
finished_searches = (
    ("KNN", grid),
    ("Random Forest", rf_cv),
    ("XGBoost", xgb_cv),
)
tuning_results = [
    {
        "Model": label,
        "Best Params": search.best_params_,
        "Best CV F1": search.best_score_,
    }
    for label, search in finished_searches
]

In [None]:
# Tabulate the tuning records.
tuning_summary_df = pd.DataFrame(tuning_results)

# Show the table with a colour gradient applied.
tuning_summary_style = tuning_summary_df.style.background_gradient(cmap='viridis')
display(tuning_summary_style)

In [None]:
# === Probability for fraud class ===

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [None]:
# Probability of class 1 from best_model on the test split, and its ROC AUC.
# Both names are reassigned inside the ROC loop in the next cell.
class_probabilities = best_model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]
auc = roc_auc_score(y_test, y_proba)

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# One ROC curve per trained model that can produce a continuous score.
for name, model in trained_models.items():
    has_proba = hasattr(model, "predict_proba")
    if not has_proba and not hasattr(model, "decision_function"):
        continue  # Skip models without probability scores

    if has_proba:
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        # (Optional) fallback for models like SVM
        y_proba = model.decision_function(X_test)

    auc = roc_auc_score(y_test, y_proba)
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC = {auc:.4f})")

# Random baseline
ax.plot([0, 1], [0, 1], "--", color="gray")

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves for All Models")
ax.legend()
plt.show()

In [None]:
# === Save Final Model ===

In [None]:
# Column names of the model inputs (every column except the target), as a plain list.
feature_columns = df.drop(columns=["Class"]).columns
feature_names = feature_columns.tolist()

In [None]:
# Bundle everything needed to score new rows: scaler, model, input columns, target name.
# NOTE(review): when cells run top to bottom, best_model here is the tuned KNN from the
# grid search; the F1 leaderboard winner is only assigned to best_model in a later
# cell — confirm this is the model that should be saved.
artifact = dict(
    scaler=scaler,
    model=best_model,
    feature_names=feature_names,
    target_name="Class",
)

In [None]:
# Drive location of the pickled artifact; the load cell further down reads from this path.
artifact_path = '/content/drive/MyDrive/Python Project/artifacts/fraud_detection_artifacts.pkl'

In [None]:
import os

# Write the artifact to artifact_path — the location the following cells report and
# load from — instead of a same-named file in the current working directory.
# The target folder is created first so the dump does not fail on a fresh Drive.
os.makedirs(os.path.dirname(artifact_path), exist_ok=True)
joblib.dump(artifact, artifact_path, compress=3)

In [None]:
# Report the path held in artifact_path rather than a second hard-coded copy of it,
# so the message cannot drift from the location the load cell actually uses.
print(f"Saved: {artifact_path}")

In [None]:
# === Load & Predict Example ===

In [None]:
# Read the artifact dict back from artifact_path. joblib.load unpickles the file,
# so only load artifacts that come from a trusted source.
loaded = joblib.load(artifact_path)

In [None]:
# === Function to test random samples ===

In [None]:
def test_random_samples(df, n=10):
    sample_df = df.sample(n=n, random_state=42)   # random rows
    X_new = sample_df[feature_names].copy()
    y_true = sample_df["Class"].values

    # Scale features
    X_new_scaled = loaded["scaler"].transform(X_new)

    # Predictions + probabilities
    preds = loaded["model"].predict(X_new_scaled)
    proba = loaded["model"].predict_proba(X_new_scaled)

    # Build results DataFrame
    results_df = pd.DataFrame({
        "True_Label": y_true,
        "Prediction": preds,
        "Prob_0": proba[:, 0],
        "Prob_1": proba[:, 1]
    }, index=sample_df.index)

    print(f"\n=== Predictions for {n} Random Samples ===")
    print(results_df)

    return results_df, y_true, preds
# Run the spot check on 10 rows; y_true and preds feed the confusion-matrix cell below.
results_df, y_true, preds = test_random_samples(df, n=10)

In [None]:
# Confusion matrix for the sampled rows. labels=[0, 1] pins the matrix to 2x2 even
# when the small sample (or its predictions) contains only one class, so it always
# lines up with the two tick labels below.
cm = confusion_matrix(y_true, preds, labels=[0, 1])
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=["Non-Fraud (0)", "Fraud (1)"],
            yticklabels=["Non-Fraud (0)", "Fraud (1)"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title(f"Confusion Matrix for {len(y_true)} Random Samples")
plt.show()

In [None]:
# === Model Performance Discussion ===

In [None]:
# 1. Summary of results

In [None]:
# Print the per-model metrics table. results_df_models is the leaderboard built from
# the training loop; results_df holds the random-sample predictions from the spot
# check and is not a metrics summary.
print("\n=== Model Metrics Summary ===")
print(results_df_models)

In [None]:
# The leaderboard is sorted by F1 with the index reset, so row 0 is the top model.
# best_model is rebound here to that model's fitted estimator from the training loop.
best_model_name = results_df_models.at[0, "Model"]
best_model = trained_models[best_model_name]
print(f"\nBest performing model (by F1 score): {best_model_name}")

In [None]:
# 2. Why XGBoost performs better

In [None]:
# Static narrative text, printed as written; nothing in it is computed from the
# metrics above.
# NOTE(review): the claims about XGBoost only hold if the leaderboard agrees — confirm.
performance_discussion = """
XGBoost outperforms simpler models like Logistic Regression or Decision Tree
because it is a gradient boosting ensemble method:

1. XGBoost iteratively builds trees, correcting errors from previous iterations,
   optimizing for fraud detection with high imbalance.
2. It captures complex, non-linear relationships between features and fraud occurrence,
   which simpler models may miss.
3. Regularization in XGBoost helps prevent overfitting while maintaining high recall.
"""
print(performance_discussion)

In [None]:
# 3. Precision vs Recall Trade-Off

In [None]:
# Static narrative text on the precision/recall trade-off, printed as written;
# nothing in it is computed from the metrics above.
precision_recall_discussion = """
In fraud detection, recall is critical because we want to identify as many fraudulent transactions
as possible (minimize False Negatives). Missing a fraud can have high financial risk.

Precision measures how many predicted frauds are actually fraud.
A low precision leads to many False Positives, which can annoy customers or waste resources.

The goal is to maximize recall while keeping precision reasonably high, balancing business cost vs detection rate.
XGBoost provides a good balance, with higher recall than simpler models, making it suitable for real-world fraud detection.
"""
print(precision_recall_discussion)

In [None]:
# 4. Visualizing Precision-Recall vs Decision Threshold for XGBoost

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
# Class-1 probabilities from the tuned XGBoost model, swept over every decision threshold.
y_proba = xgb_best.predict_proba(X_test)[:,1]  # Use XGBoost probabilities
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

plt.figure(figsize=(7,5))
# precision and recall each have one more entry than thresholds, hence the [:-1].
plt.plot(thresholds, precision[:-1], label='Precision', color='blue')
plt.plot(thresholds, recall[:-1], label='Recall', color='red')

# Vertical line at default threshold
plt.axvline(x=0.5, color='gray', linestyle='--', label='Default Threshold 0.5')

# Find nearest threshold index for 0.5 and annotate precision/recall there
idx = np.argmin(np.abs(thresholds - 0.5))
plt.scatter(0.5, precision[idx], color='blue')
plt.scatter(0.5, recall[idx], color='red')
plt.text(0.52, precision[idx], f"{precision[idx]:.2f}", color='blue')
plt.text(0.52, recall[idx], f"{recall[idx]:.2f}", color='red')

plt.xlabel("Decision Threshold")
plt.ylabel("Score")
# The curves come from xgb_best, so the title names that model rather than
# best_model_name (the F1 leaderboard winner, which may be a different model).
plt.title("Precision-Recall vs Threshold (Tuned XGBoost)")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# === Business Insights ===

In [None]:
# 1. Calculate Detection Rate & False Positive Rate

In [None]:
y_pred = xgb_best.predict(X_test) # Using the tuned XGBoost model

# Boolean masks over the test rows; vectorised counting avoids iterating the
# labels one element at a time with the built-in sum().
actual = np.asarray(y_test)
flagged = np.asarray(y_pred) == 1
is_fraud = actual == 1
is_non_fraud = actual == 0

# Total fraud transactions in test set, and how many of them were flagged
total_frauds = int(is_fraud.sum())
detected_frauds = int((is_fraud & flagged).sum())

# Detection rate (recall), in percent; NaN when the test set holds no fraud rows
detection_rate = detected_frauds / total_frauds * 100 if total_frauds else float("nan")

# False positives: non-fraud rows that were flagged as fraud
false_positives = int((is_non_fraud & flagged).sum())
total_non_frauds = int(is_non_fraud.sum())
# False positive rate, in percent; NaN when the test set holds no non-fraud rows
false_positive_rate = false_positives / total_non_frauds * 100 if total_non_frauds else float("nan")

print(f"Fraud Detection Rate (Recall): {detection_rate:.2f}%")
print(f"False Positive Rate: {false_positive_rate:.2f}%")

In [None]:
# 2. Estimate Real-World Impact

In [None]:
# Assumed (hypothetical) average loss per fraudulent transaction, in USD.
avg_loss_per_fraud = 100

# Savings estimate: every detected fraud counts as one avoided average loss.
# The "per month" wording assumes the test set is representative of a month.
potential_savings = avg_loss_per_fraud * detected_frauds

print(f"Estimated potential savings per month: ${potential_savings:,.0f}")

In [None]:
# 3. Summary Statement

In [None]:
# Summary text built from the values computed in the cells above (detection_rate,
# false_positive_rate, avg_loss_per_fraud, potential_savings), so the statement
# always matches the current run instead of numbers copied from an earlier one.
business_insight = f"""
Using XGBoost, we can detect approximately **{detection_rate:.2f}% of fraudulent transactions**
while keeping the false positive rate **very low at {false_positive_rate:.2f}%**.
Based on an average loss of ${avg_loss_per_fraud:,.0f} per fraudulent transaction,
this model could potentially save around **${potential_savings:,.0f} per month**.
"""
print(business_insight)

In [None]:
# Reinstall numpy and scikit-learn to fix potential version conflicts
!pip install --upgrade numpy scikit-learn

In [None]:
# Install a specific version of numpy compatible with numba
!pip install numpy==1.26.4