In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def resolve_root_dir(start, subdirs=("notebooks", "ccfraud")):
    """Return the project root by walking up past trailing notebook folders.

    The notebook may be launched from the project root or from a nested
    ``notebooks`` / ``ccfraud`` directory (in either nesting order). Every
    trailing path component whose name is in ``subdirs`` is stripped, and each
    directory reached while walking up is appended to ``sys.path`` (once) so
    that project packages can be imported.

    Args:
        start: Directory to start from (``str`` or ``Path``).
        subdirs: Directory names that are considered "inside" the project.

    Returns:
        ``Path`` of the first ancestor (or ``start`` itself) whose name is not
        in ``subdirs``.
    """
    current = Path(start)
    while current.name in subdirs:
        current = current.parent
        parent_str = str(current)
        # Avoid piling up duplicate entries when the cell is re-run.
        if parent_str not in sys.path:
            sys.path.append(parent_str)
    return current


# Strip trailing notebooks/ccfraud components if the notebook was started in
# one of these subdirectories, so root_dir always points at the project root.
root_dir = str(resolve_root_dir(Path().absolute()))

print(f"Root dir: {root_dir}")

# Build the project settings object from the .env file located at <root_dir>/.env
from mlfs import config

env_file = f"{root_dir}/.env"
settings = config.HopsworksSettings(_env_file=env_file)

Root dir: /home/jdowling/Projects/mlfs-book
HopsworksSettings initialized!


In [2]:
import hopsworks
import pandas as pd

# Log in to Hopsworks, then obtain handles to the project's feature store
# and model registry (both are used by later cells).
proj = hopsworks.login()
fs, mr = proj.get_feature_store(), proj.get_model_registry()

2026-01-05 12:20:05,059 INFO: Initializing external client
2026-01-05 12:20:05,061 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-05 12:20:05,863 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/120


In [3]:
# Fetch version 1 of every feature group used to build the training query.
# The names are looked up in the same order as the variables they are bound to.
(
    merchant_fg,
    account_fg,
    bank_fg,
    card_fg,
    cc_trans_aggs_fg,
    cc_trans_fg,
) = [
    fs.get_feature_group(fg_name, version=1)
    for fg_name in (
        "merchant_details",
        "account_details",
        "bank_details",
        "card_details",
        "cc_trans_aggs_fg",
        "cc_trans_fg",
    )
]

In [4]:
# Card-level aggregates enriched with one account feature and three bank
# features; both joins are inner joins.
account_features = account_fg.select(['debt_end_prev_month'])
bank_features = bank_fg.select(['credit_rating', 'days_since_bank_cr_changed', 'country'])

subtree1 = (
    cc_trans_aggs_fg.select_all()
    .join(account_features, on="account_id", join_type="inner")
    .join(bank_features, on="bank_id", join_type="inner")
)

In [5]:
# df = subtree1.read()
# df

In [6]:
# Transaction columns left out of the selection (identifiers, raw IP, timestamp).
excluded_trans_columns = ['t_id', 'cc_num', 'merchant_id', 'account_id', 'ip_address', 'ts']

# Transactions joined with merchant features (prefixed "merchant_") and with
# the aggregate/account/bank subtree built above.
selection = (
    cc_trans_fg.select_except(excluded_trans_columns)
    .join(merchant_fg.select_features(), prefix="merchant_", on="merchant_id")
    .join(subtree1, on="cc_num")
)

2026-01-05 12:20:11,006 INFO: Using ['category', 'country', 'cnt_chrgeback_prev_day', 'cnt_chrgeback_prev_week', 'cnt_chrgeback_prev_month'] from feature group `merchant_details` as features for the query. To include primary key and event time use `select_all`.


In [7]:
# df = selection.read()
# df

In [None]:
# Feature view over the joined query; `is_fraud` is declared as the label column.
fv = fs.get_or_create_feature_view(
    name="cc_fraud_fv",
    version=1,
    description="features for a credit card fraud prediction model",
    query=selection,
    labels=['is_fraud'],
)

In [None]:
# Split the feature view into train/test sets; `test_start` is passed to
# fv.train_test_split as the start of the test period.
test_start = "2025-09-30 00:00"
splits = fv.train_test_split(test_start=test_start)
X_train, X_test, y_train, y_test = splits
X_train

In [None]:
# Display the test features returned by fv.train_test_split.
X_test

In [None]:
# Count how often each label value occurs in the training labels.
y_train.value_counts()

In [None]:
# Feature Type Analysis
# Report the dtype of every training column and which columns contain nulls,
# to understand what preprocessing is needed.

print("=" * 80, "FEATURE DATA TYPES", "=" * 80, sep="\n")
print(X_train.dtypes)

print("\n" + "=" * 80, "MISSING VALUES SUMMARY", "=" * 80, sep="\n")
# Per-column null count and the same figure as a percentage of all rows.
missing_counts = X_train.isnull().sum()
missing_pct = (missing_counts / len(X_train) * 100).round(2)
missing_df = pd.concat(
    [missing_counts, missing_pct],
    axis=1,
    keys=['Missing Count', 'Missing %'],
)
has_missing = missing_df['Missing Count'] > 0
print(missing_df[has_missing])

print("\n" + "=" * 80, "CATEGORICAL FEATURES IDENTIFIED", "=" * 80, sep="\n")
# Hard-coded list (not derived from the dtypes printed above).
categorical_features = [
    'merchant_category',
    'merchant_country',
    'country',
    'cc_num',
]
print(categorical_features)

In [None]:
# Verify Preprocessing Pipeline
# Check that all transformations were applied correctly.
# Requires X_train_processed and preprocessor from the
# "Create Preprocessing Pipeline" cell.

print("=" * 80)
print("PREPROCESSING PIPELINE VERIFICATION")
print("=" * 80)

# Check data types
print("Data types after preprocessing:")
print(X_train_processed.dtypes.value_counts())

# Check for any remaining missing values
remaining_missing = int(X_train_processed.isnull().sum().sum())
print(f"\nMissing values: {remaining_missing}")

# Display sample of transformed data
print("\nSample of transformed data:")
print(X_train_processed.head())

# Show pipeline structure
print("\nPreprocessing Pipeline Structure:")
print(preprocessor)

# Columns that are still neither numeric nor boolean were not encoded.
non_numeric_cols = X_train_processed.select_dtypes(exclude=["number", "bool"]).columns.tolist()

# Report each check from the actual state of the data instead of printing
# success unconditionally.
checks = [
    ("Missing values imputed", remaining_missing == 0),
    ("Categorical features encoded", not non_numeric_cols),
]
all_passed = all(passed for _, passed in checks)

if all_passed:
    print("\nAll preprocessing complete!")
else:
    print("\nPreprocessing verification FAILED:")
for label, passed in checks:
    print(f"  {'✓' if passed else '✗'} {label}")
if all_passed:
    print("  ✓ Pipeline can be saved and reused for inference")
else:
    if non_numeric_cols:
        print(f"  Non-numeric columns remaining: {non_numeric_cols}")

In [None]:
# Generate Predictions
# Predict on the test set.
# Requires xgb_classifier ("Train XGBoost Model" cell) and X_test_processed
# ("Create Preprocessing Pipeline" cell).

y_pred = xgb_classifier.predict(X_test_processed)
y_pred_proba = xgb_classifier.predict_proba(X_test_processed)[:, 1]

# Reduce to plain Python ints before formatting: y_test.sum() on a label
# DataFrame returns a Series, which cannot be formatted with ":," / ":.2f".
n_test = len(y_test)
n_pred_fraud = int(y_pred.sum())
n_actual_fraud = int(y_test.to_numpy().sum())

print("=" * 80)
print("PREDICTION SUMMARY")
print("=" * 80)
print(f"Test set size:     {n_test:,}")
print(f"Predicted frauds:  {n_pred_fraud:,} ({n_pred_fraud/n_test*100:.2f}%)")
print(f"Actual frauds:     {n_actual_fraud:,} ({n_actual_fraud/n_test*100:.2f}%)")
print(f"\nPrediction probability range: [{y_pred_proba.min():.4f}, {y_pred_proba.max():.4f}]")

In [None]:
# Feature Importance
# Analyze which features are most important for fraud detection.
# Requires xgb_classifier ("Train XGBoost Model" cell) and X_train_processed
# ("Create Preprocessing Pipeline" cell).

# Imported here so the cell does not depend on the confusion-matrix cell
# having been run first.
import matplotlib.pyplot as plt

# Pair every model input column with its importance, most important first
feature_importance = pd.DataFrame({
    'feature': X_train_processed.columns,
    'importance': xgb_classifier.feature_importances_
}).sort_values('importance', ascending=False)

# Display top 15 features
print("=" * 80)
print("TOP 15 MOST IMPORTANT FEATURES")
print("=" * 80)
print(feature_importance.head(15).to_string(index=False))

# Visualize feature importance
fig, ax = plt.subplots(figsize=(10, 8))
top_features = feature_importance.head(15)
bars = ax.barh(top_features['feature'], top_features['importance'], color='steelblue')
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_title('Top 15 Feature Importances - XGBoost Fraud Classifier', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # most important feature at the top

# Add value labels at the end of each bar
for bar, val in zip(bars, top_features['importance']):
    ax.text(val, bar.get_y() + bar.get_height()/2, f'{val:.4f}',
            va='center', ha='left', fontsize=9, color='black')

plt.tight_layout()
feature_imp_fig = fig  # Store for saving later
plt.show()

print("\nInterpretation:")
print("Features with higher importance scores have more influence on fraud predictions.")
print("Geographic indicators, transaction patterns, and merchant history are key fraud signals.")

In [None]:
# Register Model in Hopsworks
# Upload model to Hopsworks Model Registry for versioning and deployment.
# Requires metrics_dict, scale_pos_weight, n_positive, X_train_processed, fv,
# mr and a populated model_dir from the other cells of this notebook.

MODEL_NAME = "cc_fraud_xgboost_model"

print("=" * 80)
print("REGISTERING MODEL IN HOPSWORKS")
print("=" * 80)

# Format metrics for the model registry. Every value must be parseable as a
# number, so the imbalance ratio is stored as a plain number rather than an
# "x:1" string (NOTE(review): the registry is believed to validate each value
# with float() -- confirm against the installed hopsworks version).
metrics_for_registry = {
    'roc_auc': f"{metrics_dict['roc_auc']:.4f}",
    'pr_auc': f"{metrics_dict['pr_auc']:.4f}",
    'precision': f"{metrics_dict['precision']:.4f}",
    'recall': f"{metrics_dict['recall']:.4f}",
    'f1_score': f"{metrics_dict['f1_score']:.4f}",
    'accuracy': f"{metrics_dict['accuracy']:.4f}",
    'scale_pos_weight': f"{scale_pos_weight:.2f}",
    'n_train_samples': str(len(y_train)),
    'n_fraud_train': str(n_positive),
    'imbalance_ratio': f"{scale_pos_weight:.2f}"  # negatives per positive
}

print("Model metadata:")
for key, value in metrics_for_registry.items():
    print(f"  {key:20s}: {value}")

# Create model in registry
cc_fraud_model = mr.python.create_model(
    name=MODEL_NAME,
    metrics=metrics_for_registry,
    feature_view=fv,
    description="Credit Card Fraud Detection - XGBoost Binary Classifier with scale_pos_weight for class imbalance. "
                f"Trained on {len(y_train):,} samples with {n_positive} fraud cases. "
                f"Uses {len(X_train_processed.columns)} features after preprocessing."
)

# Upload model directory to registry
cc_fraud_model.save(model_dir)

print("\n" + "=" * 80)
print("MODEL REGISTRATION COMPLETE")
print("=" * 80)
print(f"Model name: {MODEL_NAME}")
print(f"Version: {cc_fraud_model.version}")
print(f"\nExplore at: {cc_fraud_model._get_url()}")

In [None]:
# Save Model Artifacts
# Persist the trained model, the fitted preprocessor, both figures and the
# feature lists under model_dir / images_dir.

import joblib

print("=" * 80, "SAVING MODEL ARTIFACTS", "=" * 80, sep="\n")

# Trained classifier
model_path = f"{model_dir}/cc_fraud_xgboost.pkl"
joblib.dump(xgb_classifier, model_path)
print(f"✓ Model saved to: {model_path}")

# Fitted preprocessing pipeline (imputation + encoding)
preprocessor_path = f"{model_dir}/preprocessor.pkl"
joblib.dump(preprocessor, preprocessor_path)
print(f"✓ Preprocessor pipeline saved to: {preprocessor_path}")

# Confusion matrix figure
confusion_matrix_png = f"{images_dir}/confusion_matrix.png"
cm_fig.savefig(confusion_matrix_png, dpi=100, bbox_inches='tight')
print(f"✓ Confusion matrix saved to: {confusion_matrix_png}")

# Feature importance figure
feature_importance_png = f"{images_dir}/feature_importance.png"
feature_imp_fig.savefig(feature_importance_png, dpi=100, bbox_inches='tight')
print(f"✓ Feature importance saved to: {feature_importance_png}")

# Column names of the processed training frame, for use at inference time
feature_names_path = f"{model_dir}/feature_names.pkl"
joblib.dump(list(X_train_processed.columns), feature_names_path)
print(f"✓ Feature names saved to: {feature_names_path}")

# Names of the columns removed before preprocessing
features_to_drop_path = f"{model_dir}/features_to_drop.pkl"
joblib.dump(features_to_drop, features_to_drop_path)
print(f"✓ Features to drop list saved to: {features_to_drop_path}")

print("\n" + "=" * 80, "ALL ARTIFACTS SAVED SUCCESSFULLY", "=" * 80, sep="\n")
print(
    "Total files: 6",
    "  - Model: cc_fraud_xgboost.pkl",
    "  - Preprocessor: preprocessor.pkl (imputation + encoding)",
    "  - Feature schema: feature_names.pkl",
    "  - Feature selection: features_to_drop.pkl",
    "  - Visualizations: 2 PNG files",
    sep="\n",
)
print("\nFor inference, load: preprocessor.pkl → transform data → model.pkl → predict")

In [None]:
# Create Model Directory
# Make sure the artifact directory and its images sub-directory exist.

import os

model_dir = "cc_fraud_model"
images_dir = f"{model_dir}/images"

# Parent first, then the nested images directory; report what happened to each.
for directory in (model_dir, images_dir):
    if os.path.exists(directory):
        print(f"Directory already exists: {directory}")
    else:
        os.mkdir(directory)
        print(f"Created directory: {directory}")

print("\nModel artifacts will be saved to:", model_dir)

In [None]:
# Confusion Matrix
# Visualize model performance showing true/false positives and negatives.
# Requires y_test and y_pred ("Generate Predictions" cell).

import seaborn as sns
import matplotlib.pyplot as plt
# Imported here so this cell does not rely on the metrics cell having run.
from sklearn.metrics import confusion_matrix

# Rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)

# Visualize confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))
df_cm = pd.DataFrame(cm,
                     index=['True Non-Fraud', 'True Fraud'],
                     columns=['Pred Non-Fraud', 'Pred Fraud'])

sns.heatmap(df_cm, annot=True, fmt='d', cmap='Blues', ax=ax, cbar_kws={'label': 'Count'})
ax.set_title('Confusion Matrix - Credit Card Fraud Detection', fontsize=14, fontweight='bold')
ax.set_ylabel('Actual', fontsize=12)
ax.set_xlabel('Predicted', fontsize=12)

plt.tight_layout()
cm_fig = fig  # Store for saving later
plt.show()

# Name the four cells once instead of indexing cm repeatedly
tn, fp, fn, tp = cm.ravel()

print("=" * 80)
print("CONFUSION MATRIX BREAKDOWN")
print("=" * 80)
print(f"True Negatives:  {tn:5,} (correctly identified non-fraud)")
print(f"False Positives: {fp:5,} (non-fraud flagged as fraud)")
print(f"False Negatives: {fn:5,} (fraud missed - CRITICAL)")
print(f"True Positives:  {tp:5,} (correctly identified fraud)")

print("\nKey Insights:")
# Each rate is only computed when its numerator is positive, so the
# denominators below cannot be zero.
if fn > 0:
    print(f"  WARNING: {fn} fraudulent transactions were missed!")
    print(f"  This represents {fn/(fn+tp)*100:.1f}% of all actual frauds.")
if fp > 0:
    print(f"  {fp} legitimate transactions were flagged as fraud (false alarms).")
    print(f"  This is {fp/(tn+fp)*100:.2f}% of all legitimate transactions.")

In [None]:
# Classification Metrics
# Score the test-set predictions with metrics suited to imbalanced classification

from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, precision_recall_curve, auc,
    precision_score, recall_score, f1_score
)

class_names = ['Non-Fraud', 'Fraud']

# Per-class report: once as a dict (for the accuracy value), once as text
print("=" * 80, "CLASSIFICATION REPORT", "=" * 80, sep="\n")
report_dict = classification_report(
    y_test, y_pred, target_names=class_names, output_dict=True
)
print(classification_report(y_test, y_pred, target_names=class_names))

# Metrics computed from predicted probabilities
roc_auc = roc_auc_score(y_test, y_pred_proba)
# Area under the precision-recall curve
precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall_curve, precision_curve)

# Metrics computed from the predicted class labels
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("=" * 80, "KEY METRICS SUMMARY", "=" * 80, sep="\n")
print(
    f"ROC-AUC Score:        {roc_auc:.4f}",
    f"PR-AUC Score:         {pr_auc:.4f}  <- More important for imbalanced data",
    f"Precision (Fraud):    {precision:.4f}",
    f"Recall (Fraud):       {recall:.4f}",
    f"F1-Score (Fraud):     {f1:.4f}",
    sep="\n",
)

# Collected for the model-registry cell
metrics_dict = dict(
    roc_auc=roc_auc,
    pr_auc=pr_auc,
    precision=precision,
    recall=recall,
    f1_score=f1,
    accuracy=report_dict['accuracy'],
)

print("\nInterpretation:")
print(
    f"  - Precision: {precision*100:.1f}% of predicted frauds are actually fraudulent",
    f"  - Recall: {recall*100:.1f}% of actual frauds were detected",
    f"  - PR-AUC: {pr_auc:.4f} measures precision-recall tradeoff (higher is better)",
    sep="\n",
)

In [None]:
# Train XGBoost Model
# Configure XGBoost for imbalanced binary classification.
# Requires X_train_processed ("Create Preprocessing Pipeline" cell), y_train
# and scale_pos_weight ("Calculate scale_pos_weight" cell).

import xgboost as xgb
import warnings
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# Hold out part of the TRAINING data for early stopping. The test set must not
# be used to choose the stopping round: doing so leaks test information into
# model selection and makes the test metrics reported later optimistic.
y_train_flat = y_train.values.ravel()
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_processed,
    y_train_flat,
    test_size=0.2,
    stratify=y_train_flat,   # keep the fraud ratio equal in both parts
    random_state=42,
)

# Configure XGBoost with scale_pos_weight for class imbalance.
# `use_label_encoder` is no longer passed: it is deprecated in the XGBoost
# releases that accept `early_stopping_rounds` in the constructor.
xgb_classifier = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,  # Handle class imbalance
    max_depth=6,                        # Control overfitting
    learning_rate=0.1,                  # Standard learning rate
    n_estimators=100,                   # Number of boosting rounds
    eval_metric='aucpr',                # PR-AUC: best metric for imbalanced data
    early_stopping_rounds=10,           # Stop if no improvement for 10 rounds
    random_state=42,                    # Reproducibility
    enable_categorical=False            # We pre-encoded categoricals
)

# The last entry of eval_set is the one XGBoost monitors for early stopping,
# so the validation split goes last.
eval_set = [(X_fit, y_fit), (X_val, y_val)]

print("=" * 80)
print("TRAINING XGBOOST MODEL")
print("=" * 80)
print("Model configuration:")
print(f"  scale_pos_weight:      {scale_pos_weight:.2f}")
print(f"  max_depth:             {xgb_classifier.max_depth}")
print(f"  learning_rate:         {xgb_classifier.learning_rate}")
print(f"  n_estimators:          {xgb_classifier.n_estimators}")
print(f"  eval_metric:           {xgb_classifier.eval_metric}")
print(f"  early_stopping_rounds: {xgb_classifier.early_stopping_rounds}")
print("\nTraining in progress...")

# Train the model on the fitting split only
xgb_classifier.fit(
    X_fit,
    y_fit,
    eval_set=eval_set,
    verbose=False
)

print("\nTraining complete!")
print(f"Best iteration: {xgb_classifier.best_iteration}")
print(f"Best score (AUCPR): {xgb_classifier.best_score:.4f}")

In [None]:
# Calculate scale_pos_weight
# This is the key parameter for handling class imbalance in XGBoost:
# the ratio of negative to positive training samples.

# Flatten the labels to a 1-D array first. Summing a comparison on a label
# DataFrame returns a Series, which cannot be formatted with ":," / ":.2f"
# and is not a valid scalar for XGBoost.
train_labels = y_train.to_numpy().ravel()
n_negative = int((train_labels == False).sum())  # noqa: E712 (element-wise compare)
n_positive = int((train_labels == True).sum())   # noqa: E712 (element-wise compare)

if n_positive == 0:
    raise ValueError(
        "y_train contains no positive (fraud) samples; scale_pos_weight is undefined"
    )
scale_pos_weight = n_negative / n_positive

print("=" * 80)
print("CLASS IMBALANCE ANALYSIS")
print("=" * 80)
print(f"Negative samples (non-fraud): {n_negative:,}")
print(f"Positive samples (fraud):     {n_positive:,}")
print(f"Imbalance ratio:              {scale_pos_weight:.2f}:1")
print(f"\nscale_pos_weight parameter:   {scale_pos_weight:.2f}")
print(f"\nThis parameter tells XGBoost to give ~{scale_pos_weight:.0f}x more weight to fraud cases")
print("during training to compensate for the severe class imbalance.")

In [None]:
# Create Preprocessing Pipeline
# Use sklearn Pipeline for imputation and encoding (production-ready for inference).
# Requires X_train, X_test and features_to_drop ("Feature Selection" cell).

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
import numpy as np

# Rebuild the un-transformed inputs from the raw split instead of reading
# X_train_processed. This cell overwrites X_train_processed below, so reading
# it here would make a second run fit the imputers/encoder on already-encoded
# data. On a first run these frames equal what the Feature Selection cell made.
X_train_selected = X_train.drop(columns=features_to_drop)
X_test_selected = X_test.drop(columns=features_to_drop)

print("=" * 80)
print("CREATING PREPROCESSING PIPELINE")
print("=" * 80)
print("Missing values before preprocessing:")
missing_before = X_train_selected.isnull().sum()
print(missing_before[missing_before > 0])

# Identify numeric and categorical columns: every column that is not listed
# as categorical is treated as numeric.
categorical_features = ['merchant_category', 'merchant_country', 'country', 'prev_ip_transaction']
numeric_features = [col for col in X_train_selected.columns if col not in categorical_features]

print(f"\nNumeric features ({len(numeric_features)}): {numeric_features[:5]}...")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# Numeric pipeline: just impute missing values with the column median
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Categorical pipeline: fill missing values with a sentinel, then map each
# category to an integer code (categories unseen during fit become -1)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='UNKNOWN')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Combine into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    verbose_feature_names_out=False
)

# Fit on the training data only, then apply the fitted transform to the test data
X_train_transformed = preprocessor.fit_transform(X_train_selected)
X_test_transformed = preprocessor.transform(X_test_selected)

# Column order after transformation (numeric block first, then categorical)
feature_names_out = preprocessor.get_feature_names_out()

# Convert back to DataFrame, keeping the original row index
X_train_processed = pd.DataFrame(X_train_transformed, columns=feature_names_out, index=X_train_selected.index)
X_test_processed = pd.DataFrame(X_test_transformed, columns=feature_names_out, index=X_test_selected.index)

print("\nAfter preprocessing pipeline:")
print(f"  Missing values: {X_train_processed.isnull().sum().sum()}")
print(f"  Train shape: {X_train_processed.shape}")
print(f"  Test shape: {X_test_processed.shape}")
print("\nPreprocessing pipeline (imputation + encoding) ready for inference!")

In [None]:
# Feature Selection
# Remove the columns that must not be fed to the model, for both splits.

features_to_drop = [
    'event_time',           # Timestamp - would cause data leakage
    'prev_ts_transaction',  # Timestamp - not useful for tree models
    'cc_num',               # High cardinality ID - would overfit
]

# Apply the same column removal to the train and test frames.
X_train_processed, X_test_processed = (
    frame.drop(columns=features_to_drop) for frame in (X_train, X_test)
)

print("=" * 80, "FEATURE SELECTION", "=" * 80, sep="\n")
print(f"Original features: {X_train.shape[1]}")
print(f"After dropping: {X_train_processed.shape[1]}")
print(f"\nDropped features: {features_to_drop}")

remaining_columns = X_train_processed.columns
print(f"\nRemaining features ({len(remaining_columns)}):")
# Numbered listing, starting at 1
for position, column in enumerate(remaining_columns, 1):
    print(f"  {position:2d}. {column}")