In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report
from xgboost import XGBClassifier

# 1. Synthetic data generation
def generate_synthetic_demo(n_samples=100_000, random_state=42):
    """Generate a synthetic card-transaction data set with a binary label.

    Eleven numeric columns are produced from five shared standard-normal
    latent factors (random loadings plus Gaussian noise) so that they are
    mutually correlated; several are then reshaped into plausible ranges.
    Five categorical/binary columns are sampled independently.  The ``Class``
    label is drawn from a logistic model over a handful of hand-written risk
    indicators.

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed for the legacy NumPy generator.  The same seed
            always yields the same frame.

    Returns:
        pandas.DataFrame with columns ``Time`` (sorted second offsets in
        ``[0, 172800)``), the numeric features, ``Hour`` (hour of day derived
        from ``Time``), the categorical features, and ``Class`` (0/1).  Rows
        containing NaN are dropped before returning.
    """
    # A private legacy generator yields exactly the stream that
    # ``np.random.seed(random_state)`` followed by ``np.random.*`` calls
    # would, but without reseeding the process-wide NumPy RNG behind the
    # caller's back.
    rng = np.random.RandomState(random_state)

    # NOTE: the order of the random draws below determines the output for a
    # given seed; do not reorder them.
    Z = rng.normal(0, 1, (n_samples, 5))

    # Second offsets within a 48-hour window, sorted so rows are in time order.
    times = rng.choice(np.arange(0, 172800), n_samples, replace=True)
    times.sort()
    Hour = (times // 3600) % 24

    numeric_cols = [
        "Amount", "Age", "Tenure", "MerchantRisk", "DeviceTrust",
        "Txn24h", "Avg30d", "IPReputation", "Latitude", "Longitude", "DistFromHome"
    ]

    # Mix the latent factors into every numeric column, then add noise.
    loadings = rng.uniform(-1, 1, (5, len(numeric_cols)))
    num_data = Z.dot(loadings) + rng.normal(0, 0.5, (n_samples, len(numeric_cols)))
    df_num = pd.DataFrame(num_data, columns=numeric_cols)

    # Reshape selected columns into domain-looking ranges.
    df_num["Amount"] = np.exp(df_num["Amount"] * 0.5 + 3.5)
    df_num["Age"] = np.clip(df_num["Age"] * 5 + 40, 18, 90)
    df_num["Tenure"] = np.abs(df_num["Tenure"] * 10).astype(int)
    df_num["Txn24h"] = np.abs(df_num["Txn24h"].round()).astype(int)
    df_num["Latitude"] = np.clip(37 + df_num["Latitude"]*5, 25, 50)
    df_num["Longitude"] = np.clip(-95 + df_num["Longitude"]*10, -125, -67)

    # Categorical columns are sampled independently of the latent factors.
    cat_data = {
        "TxType": rng.choice(["purchase", "withdrawal", "transfer", "payment"], n_samples, p=[0.7, 0.1, 0.1, 0.1]),
        "DeviceType": rng.choice(["mobile", "desktop", "ATM", "POS", "web"], n_samples, p=[0.5, 0.2, 0.05, 0.2, 0.05]),
        "MerchantCat": rng.choice(["grocery", "electronics", "travel", "entertainment", "gas", "restaurant", "utilities", "clothing"], n_samples),
        "Channel": rng.choice(["online", "in-store", "contactless", "chip"], n_samples, p=[0.4, 0.4, 0.1, 0.1]),
        "CardPresent": rng.choice([0, 1], n_samples, p=[0.3, 0.7])
    }

    df_cat = pd.DataFrame(cat_data)

    df = pd.concat([pd.Series(times, name="Time"), df_num, pd.Series(Hour, name="Hour"), df_cat], axis=1)

    # Weighted sum of binary risk indicators plus a little Gaussian jitter.
    fraud_signal = (
        (df["Amount"] > 2000).astype(float)*8 +
        (df["CardPresent"] == 0).astype(float)*3 +
        (df["MerchantRisk"] > 3).astype(float)*2.5 +
        (df["DeviceType"] == "web").astype(float)*2 +
        (df["Channel"] == "online").astype(float)*2 +
        (df["TxType"] == "withdrawal").astype(float)*1.5 +
        (df["Hour"].isin([0,1,2,3,4,23])).astype(float)*1.5 +
        (df["Txn24h"] > 10).astype(float)*1.5 +
        (df["IPReputation"] > 2).astype(float)*1.5
    ) + rng.normal(0, 0.2, n_samples)

    # Logistic link with an intercept of -6, then a Bernoulli draw per row.
    fraud_prob = 1 / (1 + np.exp(-(-6 + fraud_signal)))
    df["Class"] = (rng.rand(n_samples) < fraud_prob).astype(int)

    return df.dropna()

# 2. Preprocessing
def preprocess(df):
    """Split *df* into a transformed feature matrix and its label column.

    ``Class`` becomes the target; ``Class``, ``Time`` and ``Hour`` are removed
    from the features.  Remaining numeric columns are standardised and
    object/category columns are one-hot encoded (unknown categories are
    ignored at transform time).

    NOTE: the transformer is fitted on every row of *df*.  If the result is
    split into train/test afterwards, the scaling statistics and category
    vocabulary have already seen the test rows.

    Returns:
        ``(X_processed, y, preprocessor)`` - the transformed features, the
        ``Class`` series, and the fitted ``ColumnTransformer``.
    """
    target = df["Class"]
    features = df.drop(columns=["Class", "Time", "Hour"])

    # Route columns to a transformer by dtype.
    num_cols = list(features.select_dtypes(include=[np.number]).columns)
    cat_cols = list(features.select_dtypes(include=[object, "category"]).columns)

    scale_step = ("num", StandardScaler(), num_cols)
    encode_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    preprocessor = ColumnTransformer(transformers=[scale_step, encode_step])

    transformed = preprocessor.fit_transform(features)
    return transformed, target, preprocessor

# 3. Train an XGBoost classifier and report hold-out AUC and per-class metrics
def train_model(X, y):
    """Fit an XGBoost classifier and print its hold-out metrics.

    The data is divided with a stratified 80/20 split (seed 42).  After
    fitting on the training part, the ROC AUC of the positive-class
    probabilities and a classification report of the predicted labels are
    printed for the held-out part.

    Args:
        X: Feature matrix.
        y: Binary labels aligned with the rows of *X*.

    Returns:
        The fitted ``XGBClassifier``.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    hyperparams = {
        "n_estimators": 500,
        "learning_rate": 0.05,
        "max_depth": 5,
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "use_label_encoder": False,
        "eval_metric": "auc",
        "random_state": 42,
    }
    model = XGBClassifier(**hyperparams)
    model.fit(X_fit, y_fit)

    # Column 1 of predict_proba holds the positive-class probability.
    positive_scores = model.predict_proba(X_holdout)[:, 1]
    holdout_auc = roc_auc_score(y_holdout, positive_scores)
    print(f"AUC: {holdout_auc:.4f}")

    predicted_labels = model.predict(X_holdout)
    print(classification_report(y_holdout, predicted_labels))

    return model

# Main execution
if __name__ == "__main__":
    # Build the synthetic data set with the generator's default size and seed.
    df = generate_synthetic_demo()
    # Transform the features; the fitted transformer is kept alongside them.
    X_processed, y, preprocessor = preprocess(df)
    # Fit the classifier; AUC and the classification report are printed.
    model = train_model(X_processed, y)


AUC: 0.8762
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     18042
           1       0.70      0.22      0.34      1958

    accuracy                           0.91     20000
   macro avg       0.81      0.61      0.65     20000
weighted avg       0.90      0.91      0.89     20000

