In [117]:
# train_pipeline.py

import pandas as pd
import joblib

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline 

from xgboost import XGBClassifier


In [142]:

df = pd.read_csv('data.csv')

In [113]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

df['Hour'] = df['Timestamp'].dt.hour
df['Day'] = df['Timestamp'].dt.day
df['Month'] = df['Timestamp'].dt.month
df['DayOfWeek'] = df['Timestamp'].dt.dayofweek

df = df.sort_values(['User_ID'])
df.drop(columns=['Timestamp'],inplace=True)

In [114]:
# # Transactions per user
# user_txn_count = df.groupby('User_ID')['Transaction_ID'].transform('count')
# df['user_transaction_count'] = user_txn_count

# # Average amount per user
# df['user_avg_amount'] = df.groupby('User_ID')['Transaction_Amount'].transform('mean')

# #standard deviation in transaction amount
# df['user_amt_std'] = (
#     df.groupby('User_ID')['Transaction_Amount'].transform('std')
#     .fillna(0)
# )

# # Time gap from previous transaction (seconds)
# df['time_gap_prev_txn'] = (
#     df.groupby('User_ID')['Timestamp']
#       .diff()
#       .dt.total_seconds()
#       .fillna(0)
# )

In [115]:
df.drop(columns=['Card_Type','Card_Age','Risk_Score','Is_Weekend','IP_Address_Flag'], inplace=True)
df.columns

Index(['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Type',
       'Account_Balance', 'Device_Type', 'Location', 'Merchant_Category',
       'Previous_Fraudulent_Activity', 'Daily_Transaction_Count',
       'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d',
       'Transaction_Distance', 'Authentication_Method', 'Fraud_Label', 'Hour',
       'Day', 'Month', 'DayOfWeek'],
      dtype='object')

In [116]:
df.to_csv("data.csv", index=False)

In [143]:
# Numerical
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical
cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = df[cat_cols].fillna("Unknown")


In [134]:

le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])



In [121]:
# Numeric correlation only
corr_with_target = df.corr()['Fraud_Label']
corr_with_target


Transaction_ID                  0.010258
User_ID                        -0.004051
Transaction_Amount              0.001901
Transaction_Type               -0.004592
Account_Balance                -0.003153
Device_Type                     0.005368
Location                        0.004680
Merchant_Category               0.005734
Previous_Fraudulent_Activity   -0.000718
Daily_Transaction_Count        -0.007065
Avg_Transaction_Amount_7d       0.000703
Failed_Transaction_Count_7d     0.509871
Transaction_Distance           -0.000116
Authentication_Method          -0.000163
Fraud_Label                     1.000000
Hour                            0.005822
Day                             0.005262
Month                          -0.001213
DayOfWeek                       0.005272
Name: Fraud_Label, dtype: float64

In [122]:
df

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Account_Balance,Device_Type,Location,Merchant_Category,Previous_Fraudulent_Activity,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Transaction_Distance,Authentication_Method,Fraud_Label,Hour,Day,Month,DayOfWeek
0,28957,0,69.02,2,36077.57,2,0,2,0,6,225.91,4,2983.85,3,1,10,21,2,1
1,12926,0,39.41,1,70995.83,0,1,0,0,2,120.49,1,4851.37,2,0,6,3,12,6
2,20725,0,191.87,2,4201.83,0,4,1,0,7,357.91,2,3616.72,1,0,23,26,4,2
3,4092,0,383.60,1,53558.88,0,1,0,0,8,441.82,1,567.59,1,0,3,29,10,6
4,33467,0,64.78,2,25291.69,2,0,0,0,10,106.00,2,2042.18,3,1,20,1,9,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,7827,8962,184.21,2,87760.42,0,0,0,0,1,93.51,2,2970.61,1,0,11,23,2,3
49996,42484,8962,21.80,1,92348.22,2,2,3,0,2,276.15,1,2429.41,2,0,17,22,8,1
49997,1296,8962,49.53,1,81626.81,2,2,1,0,12,220.06,1,4727.77,1,0,4,4,11,5
49998,22611,8962,5.74,3,17164.18,0,3,0,0,12,343.24,4,111.45,1,1,3,21,1,5


In [135]:
X = df.drop(columns=['Fraud_Label','User_ID','Transaction_ID'])
y = df['Fraud_Label']


In [124]:

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [136]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [137]:

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_resampled
)


In [138]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    min_samples_split=10,
    class_weight='balanced',
    random_state=42
)

rf.fit(X_train, y_train)


In [139]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))


              precision    recall  f1-score   support

           0       0.81      1.00      0.90      6787
           1       1.00      0.77      0.87      6787

    accuracy                           0.88     13574
   macro avg       0.91      0.88      0.88     13574
weighted avg       0.91      0.88      0.88     13574

ROC-AUC: 0.9182458191453605


In [140]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]),
    eval_metric='logloss',
    random_state=42
)

xgb.fit(X_train, y_train)


In [141]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred_xgb = xgb.predict(X_test)
y_prob_xgb = xgb.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_xgb))


              precision    recall  f1-score   support

           0       0.85      1.00      0.92      6787
           1       1.00      0.82      0.90      6787

    accuracy                           0.91     13574
   macro avg       0.92      0.91      0.91     13574
weighted avg       0.92      0.91      0.91     13574

ROC-AUC: 0.9139063189233945


In [131]:
y_train.value_counts()

Fraud_Label
0    27146
1    27146
Name: count, dtype: int64

In [None]:
train_pipeline = ImbPipeline([
    ("user_features", UserFeatureEngineer()),
    ("preprocessing", preprocessor),

    ("model", xgb)
])

In [75]:
from sklearn.pipeline import Pipeline

In [None]:
TARGET = "Fraud_Label"

NUM_FEATURES = [
    "Transaction_Amount",
    "Account_Balance",
    "Daily_Transaction_Count",
    "Avg_Transaction_Amount_7d",
    "Failed_Transaction_Count_7d",
    "Transaction_Distance",
    "Hour",
    "Day",
    "Month",
    "DayOfWeek"
]

CAT_FEATURES = [
    "Transaction_Type",
    "Device_Type",
    "Merchant_Category",
    "Authentication_Method",
    "IP_Address_Flag"
]

X = df[NUM_FEATURES + CAT_FEATURES]
y = df[TARGET]


In [None]:
class UserFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, user_col="User_ID", txn_col="Transaction_ID", amount_col="Transaction_Amount", time_col="Timestamp"):
        self.user_col = user_col
        self.txn_col = txn_col
        self.amount_col = amount_col
        self.time_col = time_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        
        # Ensure Timestamp is datetime
        df[self.time_col] = pd.to_datetime(df[self.time_col])

        # Number of transactions per user
        df['user_transaction_count'] = df.groupby(self.user_col)[self.txn_col].transform('count')
        
        # Average transaction amount per user
        df['user_avg_amount'] = df.groupby(self.user_col)[self.amount_col].transform('mean')
        
        # Std of transaction amount per user
        df['user_amt_std'] = df.groupby(self.user_col)[self.amount_col].transform('std').fillna(0)
        
        # Time gap from previous transaction per user (seconds)
        df['time_gap_prev_txn'] = df.groupby(self.user_col)[self.time_col].diff().dt.total_seconds().fillna(0)
        
        # Drop Transaction_ID and User_ID
        df = df.drop(columns=[self.txn_col, self.user_col])
        return df


In [None]:
class LabelEncodeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoders = {}

    def fit(self, X, y=None):
        for col in X.columns:
            le = LabelEncoder()
            X[col] = X[col].astype(str)
            le.fit(X[col])
            self.encoders[col] = le
        return self

    def transform(self, X):
        X_out = X.copy()
        for col, le in self.encoders.items():
            X_out[col] = le.transform(X_out[col].astype(str))
        return X_out


In [None]:
class UserFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, user_col="User_ID", txn_col="Transaction_ID", amount_col="Transaction_Amount"):
        self.user_col = user_col
        self.txn_col = txn_col
        self.amount_col = amount_col
        # self.time_col = time_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df[self.time_col] = pd.to_datetime(df[self.time_col])

        # Transactions per user
        df['user_transaction_count'] = df.groupby(self.user_col)[self.txn_col].transform('count')

        # Average transaction amount per user
        df['user_avg_amount'] = df.groupby(self.user_col)[self.amount_col].transform('mean')

        # Std of transaction amount per user
        df['user_amt_std'] = df.groupby(self.user_col)[self.amount_col].transform('std').fillna(0)

        # # Time gap from previous transaction
        # df['time_gap_prev_txn'] = df.groupby(self.user_col)[self.time_col].diff().dt.total_seconds().fillna(0)

        # Drop Transaction_ID and User_ID
        df = df.drop(columns=[self.txn_col, self.user_col])
        return df

# ----------------------------
# 2️⃣ Custom transformer for label encoding categorical columns
# ----------------------------
class LabelEncodeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoders = {}

    def fit(self, X, y=None):
        for col in X.columns:
            le = LabelEncoder()
            X[col] = X[col].astype(str)
            le.fit(X[col])
            self.encoders[col] = le
        return self

    def transform(self, X):
        X_out = X.copy()
        for col, le in self.encoders.items():
            X_out[col] = le.transform(X_out[col].astype(str))
        return X_out

# ----------------------------
# 3️⃣ Load dataset
# ----------------------------


TARGET = "Fraud_Label"

NUM_FEATURES = [
    "Transaction_Amount", "Account_Balance", "Daily_Transaction_Count",
    "Avg_Transaction_Amount_7d", "Failed_Transaction_Count_7d",
    "Transaction_Distance", "Hour", "Day", "Month", "DayOfWeek",
    "user_transaction_count", "user_avg_amount", "user_amt_std", "time_gap_prev_txn"
]

CAT_FEATURES = [
    "Transaction_Type", "Device_Type", "Location",
    "Merchant_Category", "IP_Address_Flag", "Authentication_Method"
]

X = df.drop(columns=[TARGET])
y = df[TARGET]

# ----------------------------
# 4️⃣ Train-test split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ----------------------------
# 5️⃣ Preprocessing pipeline
# ----------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), NUM_FEATURES),
        ("cat", LabelEncodeTransformer(), CAT_FEATURES)
    ]
)

# ----------------------------
# 6️⃣ Full pipeline with SMOTE (training only)
# ----------------------------
from imblearn.pipeline import Pipeline as ImbPipeline

pipeline_train = ImbPipeline([
    ("user_features", UserFeatureEngineer()),
    ("preprocessing", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", XGBClassifier(
        n_estimators=500, max_depth=5, learning_rate=0.1, scale_pos_weight=1, use_label_encoder=False, eval_metric='logloss'
    ))
])

# ----------------------------
# 7️⃣ Train model
# ----------------------------
pipeline_train.fit(X_train, y_train)

# ----------------------------
# 8️⃣ Evaluate
# ----------------------------
y_pred = pipeline_train.predict(X_test)
print(classification_report(y_test, y_pred))

# ----------------------------
# 9️⃣ Save pipeline components for inference (no SMOTE)
# ----------------------------
joblib.dump(pipeline_train.named_steps['user_features'], "user_feature_engineer.pkl")
joblib.dump(pipeline_train.named_steps['preprocessing'], "preprocessor.pkl")
joblib.dump(pipeline_train.named_steps['classifier'], "xgb_model.pkl")
print("Saved user_feature_engineer.pkl, preprocessor.pkl, xgb_model.pkl")
