In [None]:
import sys
from pathlib import Path

# Work out the repository root from the current working directory.
# If the working directory ends in "notebooks" and/or "ccfraud", each of those
# trailing components is peeled off (in that order) and the resulting parent is
# appended to sys.path so that project packages can be imported.
repo_root = Path().absolute()
for trailing_dir in ("notebooks", "ccfraud"):
    if repo_root.name == trailing_dir:
        repo_root = repo_root.parent
        sys.path.append(str(repo_root))

# Downstream cells use root_dir as a plain string.
root_dir = str(repo_root)

print(f"Root dir: {root_dir}")

# Load settings from <root_dir>/.env (import deferred until sys.path is set up).
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

In [None]:
import hopsworks
import pandas as pd
import shutil

# Log in to Hopsworks; every handle below is obtained from this project object.
proj = hopsworks.login()

# Feature store and model registry handles.
fs, mr = proj.get_feature_store(), proj.get_model_registry()

# API handles for Python environments and project datasets (used by the deployment cell).
env_api, ds_api = proj.get_environment_api(), proj.get_dataset_api()

In [None]:
# Fetch version 1 of every feature group used by this notebook.
# The targets on the left are unpacked in the same order as the names on the right.
(
    merchant_fg,
    account_fg,
    bank_fg,
    card_fg,
    cc_trans_aggs_fg,
    cc_trans_fg,
) = [
    fs.get_feature_group(fg_name, version=1)
    for fg_name in (
        "merchant_details",
        "account_details",
        "bank_details",
        "card_details",
        "cc_trans_aggs_fg",
        "cc_trans_fg",
    )
]

In [None]:
# Sub-query rooted at the transaction aggregates: every aggregate column except
# the listed ones, joined with one account feature and three bank features
# (the bank features are prefixed with "bank_").
aggs_features = cc_trans_aggs_fg.select_except(['t_id','cc_num','account_id','bank_id','event_time'])
account_features = account_fg.select(['debt_end_prev_month'])
bank_features = bank_fg.select(['credit_rating', 'days_since_bank_cr_changed', 'country'])

subtree1 = (
    aggs_features
    .join(account_features, on="account_id")
    .join(bank_features, prefix="bank_", on="bank_id")
)

In [None]:
# df = subtree1.read()
# df.isna().sum().sort_values(ascending=False)
# df

In [None]:
# Root of the feature view query: all transaction columns except the listed ones.
trans_features = cc_trans_fg.select_except(
    ['t_id', 'cc_num', 'merchant_id', 'account_id', 'ip_address', 'ts']
)
merchant_features = merchant_fg.select_features()

# Inner-join the merchant features on merchant_id, then attach the aggregates
# sub-query on cc_num. Neither join uses a prefix (the "merchant_" and "aggs_"
# prefix options were tried and are left disabled).
selection = (
    trans_features
    .join(merchant_features, on="merchant_id", join_type="inner")
    .join(subtree1, on="cc_num")
)

In [None]:
# df = selection.read()
# df.isna().sum().sort_values(ascending=False)

In [None]:
# Feature view definition: the joined query, with is_fraud as the label and three
# "prev_*" columns registered as inference helper columns.
feature_view_spec = dict(
    name="cc_fraud_fv",
    version=1,
    description="features for a credit card fraud prediction model",
    query=selection,
    labels=['is_fraud'],
    inference_helper_columns=['prev_card_present','prev_ip_address','prev_ts'],
)

# Reuse the feature view if it already exists, otherwise create it.
fv = fs.get_or_create_feature_view(**feature_view_spec)

In [None]:
# Parameters
# Start of the test window; passed as test_start to fv.train_test_split() in the next cell.
# NOTE(review): the "# Parameters" header looks like a papermill-style parameters cell,
# i.e. this value may be overridden at execution time — confirm.
test_start = "2026-01-27 00:00"


In [None]:
# Split the feature view data into train and test sets at test_start.
# Features (X_*) and labels (y_*) are returned as separate objects.
X_train, X_test, y_train, y_test = fv.train_test_split(test_start=test_start)
# Show the training features as the cell output.
X_train

In [None]:
# Show the test features as the cell output.
X_test

In [None]:
# Label distribution in the training split (used to judge class imbalance).
y_train.value_counts()

In [None]:
# Report the size of each split.
# Nothing is transformed here: the raw frames go straight into the unified
# Pipeline, which performs its own preprocessing during fit/predict.
train_rows, train_cols = X_train.shape
test_rows, test_cols = X_test.shape

print(f"Training data: {train_rows:,} samples, {train_cols} features")
print(f"Test data: {test_rows:,} samples, {test_cols} features")
print("\nRaw features will be preprocessed by the unified Pipeline during fit/predict.")

In [None]:
# Define Preprocessing Components for Pipeline
# These will be combined with the model into a unified sklearn Pipeline

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
import numpy as np

print("DEFINING PREPROCESSING COMPONENTS")
print("Missing values in training data:")
# Count nulls once and show only the columns that actually have some.
null_counts = X_train.isnull().sum()
print(null_counts[null_counts > 0])

# Column groups: three named categorical columns; everything else is treated as numeric.
categorical_features = ['category', 'country', 'bank_country']
numeric_features = list(
    filter(lambda column: column not in categorical_features, X_train.columns)
)

print(f"\nNumeric features ({len(numeric_features)}): {numeric_features[:5]}...")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# Numeric branch: fill missing values with the column median.
median_imputer = SimpleImputer(strategy='median')
numeric_transformer = Pipeline(steps=[('imputer', median_imputer)])

# Categorical branch: fill missing values with 'UNKNOWN', then ordinal-encode.
# Categories unseen at fit time are encoded as -1.
unknown_imputer = SimpleImputer(strategy='constant', fill_value='UNKNOWN')
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
categorical_transformer = Pipeline(steps=[
    ('imputer', unknown_imputer),
    ('encoder', ordinal_encoder),
])

# Route each column group through its branch; output names keep the original
# column names (no "num__" / "cat__" prefixes).
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=column_routes,
    verbose_feature_names_out=False,
)

print("\nPreprocessing components defined (will be fitted as part of unified Pipeline)")
print("  - Numeric: median imputation")
print("  - Categorical: constant imputation ('UNKNOWN') + ordinal encoding")

In [None]:
# Preview of the (still unfitted) preprocessor and a plain-text summary of its steps.
rule = "=" * 80
print(rule)
print("PREPROCESSING PIPELINE STRUCTURE")
print(rule)

print("\nPreprocessing Pipeline Structure:")
print(preprocessor)

for summary_line in (
    "\nPreprocessing steps:",
    "  1. Numeric features: Impute missing values with median",
    "  2. Categorical features: Impute missing with 'UNKNOWN', then ordinal encode",
    "\nThis preprocessor will be combined with XGBoost into a single Pipeline",
    "and fitted together in the next step.",
):
    print(summary_line)

In [None]:
# Calculate scale_pos_weight
# The ratio of negative to positive training labels, passed to XGBoost as
# scale_pos_weight to compensate for class imbalance.

fraud_flags = y_train["is_fraud"]

# Plain Python ints: a zero positive count then fails loudly below instead of
# numpy silently producing inf, and the values serialize cleanly later on.
n_negative = int((fraud_flags == False).sum())
n_positive = int((fraud_flags == True).sum())

# Without any fraud rows the ratio is undefined and the model cannot learn the
# positive class; stop here rather than train with an infinite weight.
if n_positive == 0:
    raise ValueError(
        "Training split contains no rows with is_fraud == True; "
        "scale_pos_weight is undefined. Check test_start and the feature view data."
    )

scale_pos_weight = n_negative / n_positive

print("=" * 80)
print("CLASS IMBALANCE ANALYSIS")
print("=" * 80)
print(f"Negative samples (non-fraud): {n_negative:,}")
print(f"Positive samples (fraud):     {n_positive:,}")
print(f"Imbalance ratio:              {scale_pos_weight:.2f}:1")
print(f"\nscale_pos_weight parameter:   {scale_pos_weight:.2f}")
print("\nThis parameter tells XGBoost to give ~{:.0f}x more weight to fraud cases".format(scale_pos_weight))
print("during training to compensate for the severe class imbalance.")

In [None]:
# Train Unified sklearn Pipeline (Preprocessor + XGBoost)
# Single Pipeline for preprocessing and prediction

import xgboost as xgb
import warnings
# Suppress all Python warnings from this point on (this is a global filter,
# not limited to this cell).
warnings.filterwarnings('ignore')

rule = "=" * 80
print(rule)
print("TRAINING UNIFIED SKLEARN PIPELINE")
print(rule)

# Classifier configuration. The pipeline is fitted without an eval_set, so there
# is no early stopping: the number of boosting rounds is fixed via n_estimators.
fraud_classifier = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,  # weight positives by the class-imbalance ratio
    max_depth=6,                        # limit tree depth
    learning_rate=0.1,
    n_estimators=100,                   # fixed number of boosting rounds
    eval_metric='aucpr',                # area under the precision-recall curve
    random_state=42,                    # reproducibility
    use_label_encoder=False,
    enable_categorical=False,           # categoricals are ordinal-encoded by the preprocessor
)

# One estimator that preprocesses raw features and then classifies them.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', fraud_classifier),
])

print("Pipeline configuration:")
print(f"  Preprocessor: ColumnTransformer (imputation + encoding)")
print(f"  Model: XGBClassifier")
print(f"    - scale_pos_weight: {scale_pos_weight:.2f}")
print(f"    - max_depth:        6")
print(f"    - learning_rate:    0.1")
print(f"    - n_estimators:     100 (fixed, no early stopping)")
print(f"    - eval_metric:      aucpr")

print("\nTraining unified pipeline on RAW data...")

# Fit on the untransformed training frame; labels are flattened to a 1-D array.
train_labels = y_train.values.ravel()
full_pipeline.fit(X_train, train_labels)

print("\nTraining complete!")
print("Pipeline steps: preprocessor -> model")
print("Ready for inference with raw feature data.")

In [None]:
# Feature Importance
# Analyze which features are most important for fraud detection
import matplotlib.pyplot as plt

# Pull the fitted steps back out of the pipeline.
fitted_preprocessor = full_pipeline.named_steps['preprocessor']
xgb_model = full_pipeline.named_steps['model']

# Column names as they leave the preprocessor, i.e. as the model sees them.
feature_names = fitted_preprocessor.get_feature_names_out()

# One row per feature, most important first.
feature_importance = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': xgb_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
top_features = feature_importance.head(15)

rule = "=" * 80
print(rule)
print("TOP 15 MOST IMPORTANT FEATURES")
print(rule)
print(top_features.to_string(index=False))

# Horizontal bar chart of the same top 15, largest at the top.
fig, ax = plt.subplots(figsize=(10, 8))
bars = ax.barh(top_features['feature'], top_features['importance'], color='steelblue')
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_title('Top 15 Feature Importances - XGBoost Fraud Classifier', fontsize=14, fontweight='bold')
ax.invert_yaxis()

# Annotate every bar with its importance value, vertically centred on the bar.
for bar, val in zip(bars, top_features['importance']):
    bar_mid_y = bar.get_y() + bar.get_height()/2
    ax.text(val, bar_mid_y, f'{val:.4f}',
            va='center', ha='left', fontsize=9, color='black')

plt.tight_layout()
feature_imp_fig = fig  # kept so the figure can be saved with the model artifacts
plt.show()

print("\nInterpretation:")
print("Features with higher importance scores have more influence on fraud predictions.")
print("Geographic indicators, transaction patterns, and merchant history are key fraud signals.")

In [None]:
# Confusion Matrix
# Visualize model performance showing true/false positives and negatives

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Predict on the raw test features; the pipeline preprocesses them internally.
y_pred = full_pipeline.predict(X_test)

# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)

# Heatmap of the matrix with readable row/column labels.
fig, ax = plt.subplots(figsize=(8, 6))
actual_labels = ['True Non-Fraud', 'True Fraud']
predicted_labels = ['Pred Non-Fraud', 'Pred Fraud']
df_cm = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

sns.heatmap(df_cm, annot=True, fmt='d', cmap='Blues', ax=ax, cbar_kws={'label': 'Count'})
ax.set_title('Confusion Matrix - Credit Card Fraud Detection', fontsize=14, fontweight='bold')
ax.set_ylabel('Actual', fontsize=12)
ax.set_xlabel('Predicted', fontsize=12)

plt.tight_layout()
cm_fig = fig  # kept so the figure can be saved with the model artifacts
plt.show()

# Name the four cells of the 2x2 matrix (row-major order).
tn, fp, fn, tp = cm.ravel()

rule = "=" * 80
print(rule)
print("CONFUSION MATRIX BREAKDOWN")
print(rule)
print(f"True Negatives:  {tn:5,} (correctly identified non-fraud)")
print(f"False Positives: {fp:5,} (non-fraud flagged as fraud)")
print(f"False Negatives: {fn:5,} (fraud missed - CRITICAL)")
print(f"True Positives:  {tp:5,} (correctly identified fraud)")

print("\nKey Insights:")
# Missed frauds, as a share of all actual frauds.
if fn > 0:
    print(f"  WARNING: {fn} fraudulent transactions were missed!")
    print(f"  This represents {fn/(fn+tp)*100:.1f}% of all actual frauds.")
# False alarms, as a share of all legitimate transactions.
if fp > 0:
    print(f"  {fp} legitimate transactions were flagged as fraud (false alarms).")
    print(f"  This is {fp/(tn+fp)*100:.2f}% of all legitimate transactions.")

In [None]:
# Classification Metrics
# Comprehensive evaluation with metrics appropriate for imbalanced classification

from sklearn.metrics import (
    classification_report, confusion_matrix, 
    roc_auc_score, precision_recall_curve, auc,
    precision_score, recall_score, f1_score
)

# y_pred (hard 0/1 predictions) was already computed with full_pipeline.predict(X_test)
# in the confusion matrix cell; it is used for the threshold-dependent metrics below.

# Classification report: once as a dict (for the accuracy value) and once as text.
print("=" * 80)
print("CLASSIFICATION REPORT")
print("=" * 80)
report_dict = classification_report(y_test, y_pred, 
                                   target_names=['Non-Fraud', 'Fraud'],
                                   output_dict=True)
print(classification_report(y_test, y_pred, target_names=['Non-Fraud', 'Fraud']))

# Threshold-dependent metrics for the positive (fraud) class.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Precision-Recall AUC (more informative than ROC-AUC for imbalanced data).
# The curve must be built from continuous scores, not hard 0/1 predictions:
# with binary predictions it collapses to a single threshold and the area is
# just an interpolation through that one point. Use the predicted probability
# of the positive class (second column of predict_proba).
y_scores = full_pipeline.predict_proba(X_test)[:, 1]
precision_curve, recall_curve, _thresholds = precision_recall_curve(y_test, y_scores)
pr_auc = auc(recall_curve, precision_curve)

print("=" * 80)
print("KEY METRICS SUMMARY")
print("=" * 80)
print(f"PR-AUC Score:         {pr_auc:.4f}  <- More important for imbalanced data")
print(f"Precision (Fraud):    {precision:.4f}")
print(f"Recall (Fraud):       {recall:.4f}")
print(f"F1-Score (Fraud):     {f1:.4f}")

# Collected for the model registry cell.
metrics_dict = {
    'pr_auc': pr_auc,
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'accuracy': report_dict['accuracy']
}

print("\nInterpretation:")
print(f"  - Precision: {precision*100:.1f}% of predicted frauds are actually fraudulent")
print(f"  - Recall: {recall*100:.1f}% of actual frauds were detected")
print(f"  - PR-AUC: {pr_auc:.4f} measures precision-recall tradeoff (higher is better)")

In [None]:
# Create Model Directory
# Setup directory structure for saving model artifacts
import os

# Local directory that collects everything uploaded with the model;
# figures go into its "images" subdirectory.
model_dir = "cc_fraud_model"
images_dir = f"{model_dir}/images"

# Creates model_dir as well, since it is the parent of images_dir.
os.makedirs(images_dir, exist_ok=True)

print("\nModel artifacts will be saved to:", model_dir)

In [None]:
# Save Model Artifacts
# Save the unified Pipeline (preprocessor + model) as a single file

import joblib

rule = "=" * 80
print(rule)
print("SAVING MODEL ARTIFACTS")
print(rule)

# Serialize the fitted pipeline (preprocessor + XGBoost) into a single file.
pipeline_path = f"{model_dir}/cc_fraud_pipeline.pkl"
joblib.dump(full_pipeline, pipeline_path)
print(f"Unified pipeline saved to: {pipeline_path}")

# Write the two figures kept from earlier cells into the images directory.
for figure, file_stem, label in (
    (cm_fig, "confusion_matrix", "Confusion matrix"),
    (feature_imp_fig, "feature_importance", "Feature importance"),
):
    figure.savefig(images_dir + f"/{file_stem}.png", dpi=100, bbox_inches='tight')
    print(f"{label} saved to: {images_dir}/{file_stem}.png")

print("\n" + rule)
print("ALL ARTIFACTS SAVED SUCCESSFULLY")
print(rule)
print("Total files: 3")
print("  - Pipeline: cc_fraud_pipeline.pkl (preprocessor + model combined)")
print("  - Visualizations: 2 PNG files")
print("\nFor inference: load cc_fraud_pipeline.pkl -> call pipeline.predict(raw_features)")

In [None]:
# Add the predictor script to the model's directory so it is uploaded with the model.
predictor_script="ccfraud-predictor.py"
dst_dir = Path(model_dir)

# The script is looked up relative to the working directory: first under
# "notebooks/", then next to the notebook itself. Picking the first existing
# candidate (instead of catching every exception from the first copy) means
# real copy errors, such as a missing destination or bad permissions, are not
# masked by a retry with a different source.
candidate_sources = (Path(f"notebooks/{predictor_script}"), Path(predictor_script))
src = next((candidate for candidate in candidate_sources if candidate.is_file()), None)
if src is None:
    searched = ", ".join(str(candidate) for candidate in candidate_sources)
    raise FileNotFoundError(
        f"Predictor script '{predictor_script}' not found; looked in: {searched}"
    )

shutil.copy(src, dst_dir / src.name)

In [None]:
# Register the model in Hopsworks:
# create a model registry entry with metrics and upload the local model directory.

rule = "=" * 80
print(rule)
print("REGISTERING MODEL IN HOPSWORKS")
print(rule)

# Registry metadata is passed as strings: evaluation metrics with 4 decimals,
# followed by training-set facts.
metrics_for_registry = {
    metric_name: f"{metrics_dict[metric_name]:.4f}"
    for metric_name in ('pr_auc', 'precision', 'recall', 'f1_score', 'accuracy')
}
imbalance_text = f"{scale_pos_weight:.2f}"
metrics_for_registry.update({
    'scale_pos_weight': imbalance_text,
    'n_train_samples': str(len(y_train)),
    'n_fraud_train': str(n_positive),
    'imbalance_ratio': imbalance_text,
})

print("Model metadata:")
for key, value in metrics_for_registry.items():
    print(f"  {key:20s}: {value}")

model_name = "cc_fraud_xgboost_model"
model_description = (
    "Credit Card Fraud Detection - Unified sklearn Pipeline with XGBoost. "
    "Single artifact contains preprocessor (imputation + encoding) and classifier. "
    f"Trained on {len(y_train):,} samples with {n_positive} fraud cases. "
    f"Uses {len(X_train.columns)} raw features, preprocessed internally by pipeline."
)

# Create the registry entry, linked to the feature view the data came from.
cc_fraud_model = mr.python.create_model(
    name=model_name,
    metrics=metrics_for_registry,
    feature_view=fv,
    description=model_description,
)

# Upload the whole model directory (pipeline, images, predictor script).
cc_fraud_model.save(model_dir)

print("\n" + rule)
print("MODEL REGISTRATION COMPLETE")
print(rule)
print(f"Model name: {model_name}")
print(f"Version: {cc_fraud_model.version}")
print("Artifact: cc_fraud_pipeline.pkl (unified preprocessor + model)")

In [None]:
# Deploy the newly registered model, but only if it is the best version by f1_score.
ms = proj.get_model_serving()
best_model = mr.get_best_model(name=model_name, metric="f1_score", direction="max")

env_name = "ccfraud-inference-pipeline"

# Create the inference environment on first use and install the project requirements into it.
if not env_api.get_environment(env_name):
    env = env_api.create_environment(env_name, base_environment_name="torch-inference-pipeline")
    requirements_path = ds_api.upload(f"{root_dir}/ccfraud/requirements.txt", "Resources", overwrite=True)
    env.install_requirements(requirements_path, await_installation=True)

# If the model I trained is better than the existing model deployment, replace it with this one
if best_model.version == cc_fraud_model.version:
    print(f"This is the best model version at: {best_model.version_path}")
    predictor_path = os.path.join(best_model.version_path, f"Files/{predictor_script}")
    deployment_name = "ccfraud"

    # Look up any existing deployment. A lookup that fails or returns None is
    # treated as "nothing to replace". Only Exception is caught, so Ctrl-C and
    # SystemExit still propagate.
    try:
        existing_deployment = ms.get_deployment(deployment_name)
    except Exception as lookup_error:
        existing_deployment = None
        print(f"Deployment lookup failed: {lookup_error}")

    if existing_deployment is None:
        print("Deployment not running")
    else:
        # A failing delete is a real error and is allowed to surface, rather
        # than being reported as "Deployment not running".
        existing_deployment.delete(force=True)
        print(f"Deleted deployment {deployment_name}")

    deployment = best_model.deploy(name=deployment_name, 
                                   script_file=predictor_path, 
                                   environment=env_name
                                  )
    # Wait up to 180 seconds for the deployment to reach the running state.
    deployment.start(await_running=180)
    deployment_state = deployment.get_state().describe()
else:
    print("Not deploying this model, as its performance is worse than the existing deployment")