In [1]:
import pandas as pd
import glob as glob
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support, average_precision_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
files = glob.glob("dataset/data/*.pkl")

data = pd.concat([pd.read_pickle(f) for f in files], ignore_index=True)
print(f"Total rows loaded: {len(data):,}")
data.to_csv('output.csv', index=False)
data.head()

Total rows loaded: 1,754,155


Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
0,0,2018-04-01 00:00:31,596,3156,57.16,31,0,0,0
1,1,2018-04-01 00:02:10,4961,3412,81.51,130,0,0,0
2,2,2018-04-01 00:07:56,2,1365,146.0,476,0,0,0
3,3,2018-04-01 00:09:29,4128,8737,64.49,569,0,0,0
4,4,2018-04-01 00:10:34,927,9906,50.99,634,0,0,0


In [3]:
# Feature engineering

data['TX_DATETIME'] = pd.to_datetime(data['TX_DATETIME'])

data['hour'] = data['TX_DATETIME'].dt.hour

data['day_of_week'] = data['TX_DATETIME'].dt.dayofweek

data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)

data['customer_tx_count'] = data.groupby('CUSTOMER_ID')['TRANSACTION_ID'].transform('count')

data['terminal_tx_count'] = data.groupby('TERMINAL_ID')['TRANSACTION_ID'].transform('count')

data['log_tx_amount'] = np.log1p(data['TX_AMOUNT'])

data = data.sort_values(['CUSTOMER_ID', 'TX_DATETIME'])

data['time_since_last_tx'] = data.groupby('CUSTOMER_ID')['TX_DATETIME'].diff().dt.total_seconds().fillna(0)

data['avg_customer_tx_amount'] = data.groupby('CUSTOMER_ID')['TX_AMOUNT'].transform('mean')

data['amount_deviation'] = data['TX_AMOUNT'] / (data['avg_customer_tx_amount'] + 1e-6)

terminal_fraud_rate = data.groupby('TERMINAL_ID')['TX_FRAUD'].mean()
data['terminal_fraud_rate'] = data['TERMINAL_ID'].map(terminal_fraud_rate)

customer_fraud_rate = data.groupby('CUSTOMER_ID')['TX_FRAUD'].mean()
data['customer_fraud_rate'] = data['CUSTOMER_ID'].map(customer_fraud_rate)

customer_std = data.groupby('CUSTOMER_ID')['TX_AMOUNT'].transform('std')

data['amount_outlier'] = ((data['TX_AMOUNT'] - data['avg_customer_tx_amount']).abs() > 2 * customer_std).astype(int)

def compute_tx_count_10min(group):
    group = group.set_index('TX_DATETIME')
    group['tx_count_10min'] = group['TRANSACTION_ID'].rolling('10min', closed='both').count()
    return group.reset_index()
train_data = data.groupby('CUSTOMER_ID').apply(compute_tx_count_10min).reset_index(drop=True)
data['tx_count_10min'] = train_data['tx_count_10min'].fillna(1)

data['amount_change'] = train_data.groupby('CUSTOMER_ID')['TX_AMOUNT'].transform(
    lambda x: x.diff().fillna(0)
)

median_terminal_amount = train_data.groupby('TERMINAL_ID')['TX_AMOUNT'].median()
data['terminal_amount_dev_median'] = train_data['TX_AMOUNT'] / (train_data['TERMINAL_ID'].map(median_terminal_amount) + 1e-6)

customer_avg_time = train_data.groupby('CUSTOMER_ID')['time_since_last_tx'].mean()
data['tx_interval_deviation'] = train_data['time_since_last_tx'] / (train_data['CUSTOMER_ID'].map(customer_avg_time) + 1e-6)

high_risk_hours = train_data.groupby('hour')['TX_FRAUD'].mean().nlargest(6).index
data['is_high_risk_hour'] = train_data['hour'].isin(high_risk_hours).astype(int)

In [4]:
X = data.drop(['TRANSACTION_ID', 'TX_DATETIME', 'TX_TIME_SECONDS', 'TX_FRAUD', 'TX_FRAUD_SCENARIO','CUSTOMER_ID', 'TERMINAL_ID'], axis=1)
y = data['TX_FRAUD']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
model = XGBClassifier(scale_pos_weight=3,
                      random_state=42,
                      n_estimators=450,
                      max_depth=7,
                      subsample=0.78,
                      learning_rate=0.26,
                      colsample_bytree=1,
                      eval_metric='logloss')

model.fit(X_train_scaled, y_train)

In [9]:
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve, roc_auc_score


precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)


precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = np.trapz(precision_vals, recall_vals)


roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {f1:.4f}")
print(f"PR-AUC: {pr_auc:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

Precision: 0.7938
Recall: 0.5613
F1: 0.6576
PR-AUC: -0.7621
ROC-AUC: 0.9950


In [11]:
import joblib


joblib.dump(model, 'xgb_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [12]:
# Precompute label-dependent features from training data
terminal_fraud_rate = data.groupby('TERMINAL_ID')['TX_FRAUD'].mean().to_dict()
customer_fraud_rate = data.groupby('CUSTOMER_ID')['TX_FRAUD'].mean().to_dict()
high_risk_hours = data.groupby('hour')['TX_FRAUD'].mean().nlargest(6).index.tolist()

# Precompute other aggregates (label-independent) for lookups
customer_tx_count_dict = data.groupby('CUSTOMER_ID')['TRANSACTION_ID'].count().to_dict()
terminal_tx_count_dict = data.groupby('TERMINAL_ID')['TRANSACTION_ID'].count().to_dict()
avg_customer_tx_amount_dict = data.groupby('CUSTOMER_ID')['TX_AMOUNT'].mean().to_dict()
customer_std_dict = data.groupby('CUSTOMER_ID')['TX_AMOUNT'].std().fillna(0).to_dict()
median_terminal_amount_dict = data.groupby('TERMINAL_ID')['TX_AMOUNT'].median().to_dict()
customer_avg_time_dict = data.groupby('CUSTOMER_ID')['time_since_last_tx'].mean().fillna(0).to_dict()

# Save all precomputed lookups
joblib.dump(terminal_fraud_rate, 'terminal_fraud_rate.pkl')
joblib.dump(customer_fraud_rate, 'customer_fraud_rate.pkl')
joblib.dump(high_risk_hours, 'high_risk_hours.pkl')
joblib.dump(customer_tx_count_dict, 'customer_tx_count_dict.pkl')
joblib.dump(terminal_tx_count_dict, 'terminal_tx_count_dict.pkl')
joblib.dump(avg_customer_tx_amount_dict, 'avg_customer_tx_amount_dict.pkl')
joblib.dump(customer_std_dict, 'customer_std_dict.pkl')
joblib.dump(median_terminal_amount_dict, 'median_terminal_amount_dict.pkl')
joblib.dump(customer_avg_time_dict, 'customer_avg_time_dict.pkl')

# Also save feature names for reference (from X.columns)
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'feature_names.pkl')

['feature_names.pkl']