In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
import joblib
import os

# --- 1. Data Loading ---
print("Loading the dataset...")
# The dataset is expected at <project root>/data/creditcard.csv.
# The relative path below climbs one level above the current working directory
# and then enters 'data', so it resolves when the script is launched from a
# sibling folder of 'data' (e.g. the 'model' folder).
data_path = os.path.join("..", "data", "creditcard.csv")
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: File not found at '{data_path}'.")
    print("Please download the dataset from Kaggle and place it in the 'data/' directory.")
    # Stop with a non-zero status so shells, CI jobs and schedulers can tell the
    # run failed. The bare `exit()` helper is injected by the `site` module for
    # interactive sessions and is not guaranteed to exist (e.g. `python -S`),
    # so the exception is raised directly instead.
    raise SystemExit(1)

# --- 2. Quick Exploratory Data Analysis (EDA) and Preprocessing ---
print("\nAnalyzing class imbalance:")
class_distribution = df['Class'].value_counts()
print(class_distribution)
# Report frauds as a share of ALL transactions. Dividing the fraud count by the
# non-fraud count instead would give the fraud-to-legitimate odds, which
# overstates the proportion and raises KeyError when either class is absent.
fraud_share = (df['Class'] == 1).mean()
print(f"Fraud proportion: {fraud_share * 100:.4f}%")

# The 'Time' feature might not be directly useful without feature engineering
# (e.g., a cyclical encoding), so it is dropped here.
# Every other non-label column is kept as a model input; per the dataset notes
# these are the V* features and 'Amount'. 'Amount' has a very different scale
# from the V* features, which is why the features are standardized later on.
print("\nPreparing data for training...")
X = df.drop(columns=['Time', 'Class'])
y = df['Class']

# --- 3. Train-Test Split ---
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# Stratifying on the label keeps the fraud/non-fraud ratio the same in both
# partitions, which is essential when one class is this rare.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# --- 4. Data Normalization (Scaling) ---
# Standardize every feature to zero mean and unit standard deviation.
# The scaler learns its statistics from the training partition only and is then
# reused unchanged on the test partition, so no information about the held-out
# rows leaks into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data normalized and ready.")

# --- 5. Imbalance Handling with XGBoost ---
# XGBoost's `scale_pos_weight` parameter up-weights the positive (fraud) class
# during training. The usual recommendation is the ratio
#   (number of negative samples) / (number of positive samples),
# measured on the training partition.
train_class_counts = y_train.value_counts()
negative_count = train_class_counts[0]
positive_count = train_class_counts[1]
scale_pos_weight = negative_count / positive_count
print(f"\nCalculated scale_pos_weight: {scale_pos_weight:.2f}")

# --- 6. XGBoost Model Training ---
print("Starting XGBoost model training...")
# Hyper-parameters are gathered in one mapping so they are easy to inspect.
xgb_params = {
    "objective": "binary:logistic",
    # Area Under the Precision-Recall Curve, well suited to imbalanced data.
    "eval_metric": "aucpr",
    # NOTE(review): this flag appears to be deprecated in recent XGBoost
    # releases - confirm against the pinned version before removing it.
    "use_label_encoder": False,
    "scale_pos_weight": scale_pos_weight,
    "random_state": 42,
}
model = XGBClassifier(**xgb_params)

model.fit(X_train_scaled, y_train)
print("Training complete.")

# --- 7. Model Evaluation ---
print("\nEvaluating the model on the test set...")
# Hard class predictions feed the report and confusion matrix; the positive
# class probability (column 1) feeds the ROC AUC score.
y_pred = model.predict(X_test_scaled)
positive_class_column = 1
y_pred_proba = model.predict_proba(X_test_scaled)[:, positive_class_column]

print("\nClassification Report:")
# Pay attention to the recall of the 'Fraud' class: the higher, the better.
class_labels = ['Non-Fraud', 'Fraud']
report_text = classification_report(y_test, y_pred, target_names=class_labels)
print(report_text)

print("\nConfusion Matrix:")
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC AUC Score: {roc_auc:.4f}")

# --- 8. Serialization (Saving the Model and Scaler) ---
# Both fitted objects are written to an 'artifacts' folder (relative to the
# current working directory) for later use by the API.
output_dir = "artifacts"
os.makedirs(output_dir, exist_ok=True)

model_path = os.path.join(output_dir, "fraud_model.joblib")
scaler_path = os.path.join(output_dir, "scaler.joblib")

# Persist the model first, then the scaler; the confirmation messages are only
# printed once both files have been written.
for artifact, destination in ((model, model_path), (scaler, scaler_path)):
    joblib.dump(artifact, destination)

print(f"\nModel saved to: {model_path}")
print(f"Scaler saved to: {scaler_path}")