In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb 
import os
import numpy as np

In [None]:
# --- Notebook configuration ---
# Directory (relative to the notebook) that holds the daily .pkl files.
DATA_DIR = "data/"
# Column of the loaded frames that is used as the label.
TARGET_COLUMN = "TX_FRAUD"
# Manual probability cut-off: a transaction is flagged as fraud only when its
# predicted fraud probability is at least this value. A high cut-off trades
# recall for precision (fewer false positives).
CLASSIFICATION_THRESHOLD = 0.9

In [None]:
# One pickle per calendar day: 2018-09-01.pkl ... 2018-09-30.pkl, in date order.
file_names = [f"2018-09-{day:02d}.pkl" for day in range(1, 31)]

# Prefix each file name with the data directory to obtain a relative path.
file_paths = []
for name in file_names:
    file_paths.append(os.path.join(DATA_DIR, name))

# Accumulator for the per-day DataFrames filled in by the loading cell.
list_of_dfs = []
print("Loading Pickle files...")

In [None]:
# Read every daily pickle into a DataFrame and collect the results.
#
# The accumulator is reset at the top of this cell so that running the cell
# more than once does not append the same days again (which would silently
# duplicate transactions in the combined frame).
#
# NOTE(security): pd.read_pickle can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
list_of_dfs = []
for filepath in file_paths:
    try:
        daily_df = pd.read_pickle(filepath)
    except Exception as e:
        # Best-effort load: report the unreadable file and carry on with the rest.
        print(f"ERROR reading {filepath}: {e}.")
    else:
        list_of_dfs.append(daily_df)

In [None]:
# Stack the daily frames into a single frame with a fresh 0..n-1 index.
# pd.concat raises ValueError when given an empty list, so fall back to an
# empty DataFrame when no file could be loaded; the `df.empty` check that
# follows can then report the failure instead of this line crashing first.
df = pd.concat(list_of_dfs, ignore_index=True) if list_of_dfs else pd.DataFrame()

In [None]:
# Abort when no transactions were loaded: nothing downstream can run.
# An exception is raised instead of calling exit(); inside a Jupyter kernel
# exit() requests a kernel shutdown, whereas an exception simply halts
# execution (including "Run All") and leaves the kernel state inspectable.
if df.empty:
    print("\nFatal Error: Could not load any data. Exiting.")
    raise RuntimeError("Could not load any data from the pickle files.")

In [None]:
# Report the outcome of the load: banner, row count, then a separator rule.
summary_lines = [
    "All data loaded and combined successfully.",
    f"Total transactions loaded: {len(df)}",
    "-" * 30,
]
print("\n".join(summary_lines))

In [None]:
# Columns that must be numeric before they are handed to XGBoost.
COLS_TO_CONVERT = ['CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS']

# Coerce each listed column to a numeric dtype (unparseable values become NaN),
# then replace every missing value in the whole frame with 0.
numeric_columns = {
    col: pd.to_numeric(df[col], errors='coerce') for col in COLS_TO_CONVERT
}
df = df.assign(**numeric_columns).fillna(0)

In [None]:
# Target (y): the label column cast to integers.
y = df[TARGET_COLUMN].astype(int)

# Features (X): everything except the label and the listed non-feature columns.
# errors='ignore' lets the drop succeed even if one of these columns is absent.
columns_to_drop = [
    TARGET_COLUMN,
    'TRANSACTION_ID',
    'TX_DATETIME',
    'TX_FRAUD_SCENARIO',
]
X = df.drop(columns_to_drop, axis=1, errors='ignore')

In [None]:
# Hold out 30% of the rows for evaluation. The split is seeded for
# reproducibility and stratified on y so both parts keep the same class mix.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Ratio of negative (label 0) to positive (label 1) training samples, later
# passed to XGBoost as scale_pos_weight to up-weight the fraud class.
n_negative = int(np.sum(y_train == 0))
n_positive = int(np.sum(y_train == 1))
if n_positive == 0:
    # Without this guard the division yields inf (with only a numpy warning)
    # and that value would be handed to the model as a class weight.
    raise ValueError(
        "No fraud samples in the training split; cannot compute the class weight ratio."
    )
ratio = n_negative / n_positive

In [None]:
# Hyper-parameters for the XGBoost classifier, gathered in one place.
xgb_params = {
    'n_estimators': 100,
    'random_state': 42,
    'eval_metric': 'logloss',
    'scale_pos_weight': ratio,   # up-weights the fraud class by the neg/pos ratio
    'n_jobs': -1,                # use all available CPU cores
    'tree_method': 'hist',       # histogram-based tree construction
    'learning_rate': 0.1,
    'max_depth': 5,
}
model = xgb.XGBClassifier(**xgb_params)

# MODEL TRAINING: fit on the training split and report progress.
print("Training the XGBoost model...")
model.fit(X_train, y_train)
print("Training complete! XGBoost model fitted successfully.")
print("-" * 30)

In [None]:
# Class probabilities for the held-out rows; column 1 is the fraud class.
test_class_probabilities = model.predict_proba(X_test)
y_proba = test_class_probabilities[:, 1]

In [None]:
# Turn probabilities into 0/1 labels: 1 where the fraud probability reaches
# the manual classification threshold, 0 otherwise.
meets_threshold = np.greater_equal(y_proba, CLASSIFICATION_THRESHOLD)
y_pred_tuned = meets_threshold.astype(int)

In [None]:
# Evaluate the thresholded predictions against the held-out labels.
class_labels = ['Not Fraud (0)', 'Fraud (1)']
report_text = classification_report(y_test, y_pred_tuned, target_names=class_labels)

print(f" Model Evaluation (XGBoost) - THRESHOLD: {CLASSIFICATION_THRESHOLD}")
print("Classification Report:\n", report_text)
print("-" * 30)

In [None]:
# PREDICTION ON NEW DATA
# Four hand-written example transactions, one tuple per row; the column list
# below gives the meaning of each position.
new_data = pd.DataFrame(
    [
        (1001, 50, 6500.00, 50.0, 11),
        (1002, 12, 50.50, 3600.0, 11),
        (1003, 99, 15000.00, 10.0, 11),
        (1004, 50, 95.00, 7500.0, 11),
    ],
    columns=['CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS'],
)

In [None]:
# Apply the same numeric coercion to the new rows: unparseable values become
# NaN and are then replaced by 0, column by column.
coerced_new_columns = {
    col: pd.to_numeric(new_data[col], errors='coerce').fillna(0)
    for col in COLS_TO_CONVERT
}
new_data = new_data.assign(**coerced_new_columns)

In [None]:
# Fraud probability (column 1 of predict_proba) for each new transaction.
# Only the columns the model was trained on are passed, in training order.
# A later cell adds 'Predicted_Fraud' / 'Fraud_Probability' to new_data, so
# passing the whole frame would fail with a feature mismatch if this cell
# were run again afterwards.
feature_columns = list(X_train.columns)
new_probabilities = model.predict_proba(new_data[feature_columns])[:, 1]

In [None]:
# Final 0/1 decision for the new rows: 1 where the fraud probability reaches
# the manual classification threshold, 0 otherwise.
new_meets_threshold = np.greater_equal(new_probabilities, CLASSIFICATION_THRESHOLD)
new_predictions_tuned = new_meets_threshold.astype(int)

In [None]:
# Attach the thresholded label and the raw fraud probability to the new rows.
new_data = new_data.assign(
    Predicted_Fraud=new_predictions_tuned,
    Fraud_Probability=new_probabilities,
)

In [None]:
# Show the amount next to the model's decision and probability for each row.
result_columns = ['TX_AMOUNT', 'Predicted_Fraud', 'Fraud_Probability']
print("Prediction Results (XGBoost Tuned)")
print(new_data.loc[:, result_columns])

In [None]:
# Spell out the verdict for the first and third example transactions.
print(f"\nInterpretation (Threshold {CLASSIFICATION_THRESHOLD}):")
for position in (0, 2):
    amount = new_data.iloc[position]['TX_AMOUNT']
    verdict = 'FRAUD' if new_predictions_tuned[position] == 1 else 'NOT FRAUD'
    probability = new_probabilities[position]
    print(f"Transaction {position + 1} (Amount {amount:.2f}): Predicted {verdict} with a probability of {probability:.2f}")