# Stratified train/test split for transactions

This notebook reads `dataset/transactions.csv`, sets `X` and `y` (where `y = df["isFraud"]`), and creates a stratified train/test split that preserves the class percentages of `isFraud`.

It prints shapes and class distributions for verification.

In [14]:
# Imports
import os
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score, confusion_matrix

# Path to CSV (relative to this notebook)
csv_path = 'dataset/transactions.csv'

if not os.path.exists(csv_path):
    raise FileNotFoundError(f'File not found: {csv_path} - make sure you run the notebook from the repository root or adjust the path.')

# Read the dataset
df = pd.read_csv(csv_path)
print('Loaded dataset with shape:', df.shape)

# Drop all null columns
df = df.drop(["Unnamed: 0", "echoBuffer", "merchantCity", "merchantState", "merchantZip", "posOnPremises", "recurringAuthInd"], axis=1)

df.head()
df.info()
df.nunique()

Loaded dataset with shape: (786363, 30)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786363 entries, 0 to 786362
Data columns (total 23 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accountNumber             786363 non-null  int64  
 1   customerId                786363 non-null  int64  
 2   creditLimit               786363 non-null  int64  
 3   availableMoney            786363 non-null  float64
 4   transactionDateTime       786363 non-null  object 
 5   transactionAmount         786363 non-null  float64
 6   merchantName              786363 non-null  object 
 7   acqCountry                781801 non-null  object 
 8   merchantCountryCode       785639 non-null  object 
 9   posEntryMode              782309 non-null  float64
 10  posConditionCode          785954 non-null  float64
 11  merchantCategoryCode      786363 non-null  object 
 12  currentExpDate            786363 non-null  object 
 13  acco

accountNumber                 5000
customerId                    5000
creditLimit                     10
availableMoney              521861
transactionDateTime         776637
transactionAmount            66038
merchantName                  2490
acqCountry                       4
merchantCountryCode              4
posEntryMode                     5
posConditionCode                 3
merchantCategoryCode            19
currentExpDate                 165
accountOpenDate               1820
dateOfLastAddressChange       2184
cardCVV                        899
enteredCVV                     976
cardLast4Digits               5245
transactionType                  3
currentBalance              487318
cardPresent                      2
expirationDateKeyInMatch         2
isFraud                          2
dtype: int64

In [5]:
# Prepare X and y
if 'isFraud' not in df.columns:
    raise KeyError(
        "Column 'isFraud' not found in dataframe. Check the CSV or column name casing."
    )

y = df['isFraud']
X = df.drop(columns=['isFraud'])

print('X shape:', X.shape)
print('y shape:', y.shape)

# Overall class distribution (percentage)
print('Overall class distribution (%):')
print((y.value_counts(normalize=True) * 100).round(4))

X shape: (786363, 29)
y shape: (786363,)
Overall class distribution (%):
isFraud
False    98.421
True      1.579
Name: proportion, dtype: float64


In [None]:
# Stratified train/test split
# Keep class proportions by using `stratify=y`
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print('Train shapes -> X:', X_train.shape, 'y:', y_train.shape)
print('Test shapes  -> X:', X_test.shape, 'y:', y_test.shape)

print('Train class distribution (%):')
print((y_train.value_counts(normalize=True) * 100).round(4))

print('Test class distribution (%):')
print((y_test.value_counts(normalize=True) * 100).round(4))

Train shapes -> X: (629090, 29) y: (629090,)
Test shapes  -> X: (157273, 29) y: (157273,)
Train class distribution (%):
isFraud
False    98.4209
True      1.5791
Name: proportion, dtype: float64
Test class distribution (%):
isFraud
False    98.4212
True      1.5788
Name: proportion, dtype: float64


In [7]:
# Quick numeric check that proportions are close to overall proportions
print('Per-class proportions (overall vs train vs test):')
for cls in sorted(y.unique()):
    overall = (y == cls).mean()
    train = (y_train == cls).mean()
    test = (y_test == cls).mean()
    print(f'Class {cls}: overall={overall:.6f}, train={train:.6f}, test={test:.6f}')

# Optional: save the splits (uncomment to write)
# X_train.to_csv('dataset/X_train.csv', index=False)
# X_test.to_csv('dataset/X_test.csv', index=False)
# y_train.to_csv('dataset/y_train.csv', index=False)
# y_test.to_csv('dataset/y_test.csv', index=False)

print('Done. If you want the splits written to disk, uncomment the save lines above.')

Per-class proportions (overall vs train vs test):
Class False: overall=0.984210, train=0.984209, test=0.984212
Class True: overall=0.015790, train=0.015791, test=0.015788
Done. If you want the splits written to disk, uncomment the save lines above.


In [None]:
# Train XGBoost with focus on recall
# Calculate scale_pos_weight to handle class imbalance (improves recall for minority class)
neg_count = (y_train == 0).sum()
pos_count = (y_train == 1).sum()
scale_pos_weight = neg_count / pos_count if pos_count > 0 else 1

print(f'Class imbalance ratio (neg/pos): {scale_pos_weight:.2f}')
print(f'Using scale_pos_weight={scale_pos_weight:.2f} to boost recall for fraud class\n')

# Create XGBoost model optimized for recall
# - scale_pos_weight: handles class imbalance
# - max_depth: controls tree depth (lower = simpler, higher = more complex)
# - learning_rate: step size (lower = slower but more precise)
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,  # Key parameter for recall
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

print('Training XGBoost model...')
model.fit(X_train, y_train)
print('Training complete!\n')

# Predictions on train set
y_train_pred = model.predict(X_train)
print('=== TRAIN SET PERFORMANCE ===')
print(f'Recall (fraud class): {recall_score(y_train, y_train_pred):.4f}')
print(f'Precision (fraud class): {precision_score(y_train, y_train_pred):.4f}')
print(f'F1-Score (fraud class): {f1_score(y_train, y_train_pred):.4f}')
print('\nConfusion Matrix (Train):')
print(confusion_matrix(y_train, y_train_pred))
print('\nClassification Report (Train):')
print(classification_report(y_train, y_train_pred))

# Predictions on test set
y_test_pred = model.predict(X_test)
print('\n=== TEST SET PERFORMANCE ===')
print(f'Recall (fraud class): {recall_score(y_test, y_test_pred):.4f}')
print(f'Precision (fraud class): {precision_score(y_test, y_test_pred):.4f}')
print(f'F1-Score (fraud class): {f1_score(y_test, y_test_pred):.4f}')
print('\nConfusion Matrix (Test):')
print(confusion_matrix(y_test, y_test_pred))
print('\nClassification Report (Test):')
print(classification_report(y_test, y_test_pred))

# Feature importances
if hasattr(X_train, 'columns'):
    import pandas as _pd
    fi = _pd.Series(model.feature_importances_, index=X_train.columns)
    print('\nTop 10 Most Important Features:')
    print(fi.nlargest(10))


Class imbalance ratio (neg/pos): 62.33
Using scale_pos_weight=62.33 to boost recall for fraud class

Training XGBoost model...


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:transactionDateTime: object, merchantName: object, acqCountry: object, merchantCountryCode: object, merchantCategoryCode: object, currentExpDate: object, accountOpenDate: object, dateOfLastAddressChange: object, transactionType: object