In [1]:
# train_pipeline.py

import pandas as pd
import joblib

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

from imblearn.over_sampling import SMOTE
# Must stay after the sklearn Pipeline import above: this line rebinds the
# name `Pipeline` to the imblearn variant used for the SMOTE step.
from imblearn.pipeline import Pipeline

from xgboost import XGBClassifier


In [2]:
# Name of the label column the classifier is trained to predict.
TARGET = "Fraud_Label"

# Raw dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("data.csv")


In [3]:
# Identifier columns removed from the feature matrix (not used for training).
DROP_COLS = ["Transaction_ID", "User_ID"]

# Categorical feature columns that are label-encoded before modelling.
CAT_COLS = [
    "Transaction_Type",
    "Device_Type",
    "Location",
    "Merchant_Category",
    "Authentication_Method",
]


In [4]:
# Echo the categorical column list so the cell output records the configuration.
CAT_COLS

['Transaction_Type',
 'Device_Type',
 'Location',
 'Merchant_Category',
 'Authentication_Method']

In [5]:
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode several DataFrame columns, one ``LabelEncoder`` per column.

    Every configured column is cast to ``str`` before fitting/encoding, so
    mixed-type columns are encoded by their string representation.

    Parameters
    ----------
    columns : list of str
        Names of the columns in ``X`` to encode.
    handle_unknown : {"error", "encode"}, default="error"
        What ``transform`` does with a value that was not seen during ``fit``.
        ``"error"`` keeps the original behaviour (``LabelEncoder.transform``
        raises ``ValueError``). ``"encode"`` maps every unseen value to ``-1``
        so inference does not fail on new categories.

    Attributes
    ----------
    encoders : dict
        Maps column name -> fitted ``LabelEncoder``. Rebuilt on every ``fit``.
    """

    _UNKNOWN_MODES = ("error", "encode")

    def __init__(self, columns, handle_unknown="error"):
        self.columns = columns
        self.handle_unknown = handle_unknown
        self.encoders = {}

    def _unknown_mode(self):
        """Return the validated ``handle_unknown`` setting."""
        # getattr keeps instances pickled before this parameter existed usable.
        mode = getattr(self, "handle_unknown", "error")
        if mode not in self._UNKNOWN_MODES:
            raise ValueError(
                f"handle_unknown must be one of {self._UNKNOWN_MODES}, got {mode!r}"
            )
        return mode

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per configured column.

        Parameters
        ----------
        X : DataFrame-like
            Must contain every name in ``self.columns``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self._unknown_mode()
        # Build a fresh mapping so a refit (e.g. after `columns` was changed)
        # never leaves encoders for columns that are no longer configured.
        fitted = {}
        for col in self.columns:
            le = LabelEncoder()
            le.fit(X[col].astype(str))
            fitted[col] = le
        self.encoders = fitted
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by integer codes.

        Raises
        ------
        ValueError
            If ``handle_unknown="error"`` and a column holds a value that was
            not present during ``fit`` (raised by ``LabelEncoder.transform``),
            or if ``handle_unknown`` has an unsupported value.
        """
        mode = self._unknown_mode()
        X = X.copy()  # never mutate the caller's frame
        for col, le in self.encoders.items():
            values = X[col].astype(str)
            if mode == "error":
                X[col] = le.transform(values)
            else:
                # LabelEncoder codes are positions in `classes_`; anything not
                # in that table is unseen and gets the sentinel code -1.
                lookup = {label: code for code, label in enumerate(le.classes_)}
                X[col] = values.map(lookup).fillna(-1).astype(int)
        return X


In [6]:
# Label vector, and the feature matrix with the label and dropped columns removed.
y = df[TARGET]
X = df.drop(columns=[TARGET, *DROP_COLS])

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)


In [7]:
# Class counts in the training labels; their ratio (class 0 / class 1) is
# handed to XGBoost as scale_pos_weight.
n_negative = len(y_train[y_train == 0])
n_positive = len(y_train[y_train == 1])

# NOTE(review): the pipeline below also oversamples with SMOTE, so class
# imbalance is compensated for twice (resampling + scale_pos_weight).
# Confirm this is intended before tuning further.
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=n_negative / n_positive,
    eval_metric="logloss",
    random_state=42,
)

# Training pipeline: encode categoricals -> standardise -> SMOTE -> classifier.
# `Pipeline` here is whichever import ran last in the imports cell.
training_steps = [
    ("label_encode", MultiColumnLabelEncoder(CAT_COLS)),
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("model", xgb_model),
]
train_pipeline = Pipeline(steps=training_steps)



In [8]:
# Fit all steps on the training split (fit returns the pipeline itself),
# then predict the hold-out split and print per-class metrics.
y_pred = train_pipeline.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      1.00      0.92      6787
           1       1.00      0.61      0.76      3213

    accuracy                           0.88     10000
   macro avg       0.92      0.81      0.84     10000
weighted avg       0.89      0.88      0.87     10000



In [9]:
# Assemble the inference pipeline from the already-fitted training steps,
# leaving out the "smote" step.
inference_steps = [
    (step_name, train_pipeline.named_steps[step_name])
    for step_name in ("label_encode", "scaler", "model")
]
inference_pipeline = Pipeline(steps=inference_steps)

# Persist the pipeline and print where it was written.
artifact_path = "model.pkl"
joblib.dump(inference_pipeline, artifact_path)
print(artifact_path)


model.pkl


In [10]:
# Evaluate the exported inference pipeline on the hold-out split to confirm it
# reproduces the metrics of the training pipeline.
# (classification_report and roc_auc_score come from the imports cell at the
# top of the notebook; no imports are needed here.)
y_test_pred = inference_pipeline.predict(X_test)

# Column 1 of predict_proba is taken as the class-1 score used for ROC AUC
# (assumes the label classes are ordered [0, 1] — verify via the model's classes_).
y_test_prob = inference_pipeline.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_test_pred))
print("ROC AUC:", roc_auc_score(y_test, y_test_prob))


              precision    recall  f1-score   support

           0       0.85      1.00      0.92      6787
           1       1.00      0.61      0.76      3213

    accuracy                           0.88     10000
   macro avg       0.92      0.81      0.84     10000
weighted avg       0.89      0.88      0.87     10000

ROC AUC: 0.8044611063487982


In [13]:
# Inspect the column-1 probabilities taken from predict_proba on the test split.
y_test_prob

array([0.3373156 , 0.27864915, 0.32808462, ..., 0.30769268, 0.18739198,
       0.20740213], dtype=float32)