In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Load the processed dataset
df = pd.read_csv('Dataset/train_processed.csv')
test_df = pd.read_csv('Dataset/test_processed.csv')

bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)
bool_cols_test = test_df.select_dtypes(include='bool').columns
test_df[bool_cols_test] = test_df[bool_cols_test].astype(int)

In [97]:
# Compute pair frequency (A → B)
account_pairs = df.groupby(['From Account', 'To Account']).size().reset_index(name='pair_frequency')

# Compute reverse pair frequency (B → A)
reverse_pairs = account_pairs.copy()
reverse_pairs.columns = ['To Account', 'From Account', 'reverse_pair_frequency']  # flip column names

# Merge both into the original dataframe
df = df.merge(account_pairs, on=['From Account', 'To Account'], how='left')
df = df.merge(reverse_pairs, on=['From Account', 'To Account'], how='left')  # correct merge

# Fill missing frequencies
df['pair_frequency'].fillna(1, inplace=True)
df['reverse_pair_frequency'].fillna(0, inplace=True)

# Flag circular transactions
df['is_circular'] = (df['reverse_pair_frequency'] > 0).astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['pair_frequency'].fillna(1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['reverse_pair_frequency'].fillna(0, inplace=True)


In [98]:
# Compute pair frequency (A → B)
account_pairs = test_df.groupby(['From Account', 'To Account']).size().reset_index(name='pair_frequency')

# Compute reverse pair frequency (B → A)
reverse_pairs = account_pairs.copy()
reverse_pairs.columns = ['To Account', 'From Account', 'reverse_pair_frequency']  # flip column names

# Merge both into the original dataframe
test_df = test_df.merge(account_pairs, on=['From Account', 'To Account'], how='left')
test_df = test_df.merge(reverse_pairs, on=['From Account', 'To Account'], how='left')  # correct merge

# Fill missing frequencies
test_df['pair_frequency'].fillna(1, inplace=True)
test_df['reverse_pair_frequency'].fillna(0, inplace=True)

# Flag circular transactions
test_df['is_circular'] = (test_df['reverse_pair_frequency'] > 0).astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['pair_frequency'].fillna(1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['reverse_pair_frequency'].fillna(0, inplace=True)


In [99]:
df = df.drop(columns=['From Account', 'To Account'], errors='ignore')


# Separate features and target
X = df.drop(columns=['Is Laundering'])
y = df['Is Laundering']

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [100]:
#!pip install optuna

In [101]:
test_df = test_df.drop(columns=['From Account', 'To Account'], errors='ignore')

In [None]:
from sklearn.metrics import roc_auc_score, confusion_matrix

# Custom metric functions
def calculate_balanced_accuracy(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
    return (tpr + tnr) / 2

def calculate_fraud_capture_rate(y_true, y_prob, N=485):
    sorted_indices = np.argsort(y_prob)[::-1]
    top_N_indices = sorted_indices[:N]
    frauds_in_top_N = np.sum(y_true.iloc[top_N_indices] if hasattr(y_true, 'iloc') else y_true[top_N_indices])
    total_frauds = np.sum(y_true)
    return frauds_in_top_N / total_frauds if total_frauds > 0 else 0

def calculate_composite_score(y_true, y_pred, y_prob, N=485):
    auc_score = roc_auc_score(y_true, y_prob)
    balanced_acc = calculate_balanced_accuracy(y_true, y_pred)
    fraud_capture = calculate_fraud_capture_rate(y_true, y_prob, N)
    composite = (auc_score + balanced_acc + fraud_capture) / 3
    return composite

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold


# Optuna objective
def objective(trial):
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
    
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'scale_pos_weight': scale_pos_weight,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'verbosity': 0
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(X_fold_train, y_fold_train)

        y_prob = model.predict_proba(X_fold_val)[:, 1]
        y_pred = (y_prob >= 0.5).astype(int)  # default threshold for BA

        composite_score = calculate_composite_score(y_fold_val, y_pred, y_prob, N=485)
        scores.append(composite_score)

    return np.mean(scores)


In [113]:
import optuna

study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

print("Best Hyperparameters:", study.best_params)
print("Best Composite Score:", round(study.best_value, 4))


[I 2025-06-16 03:32:58,484] A new study created in memory with name: no-name-7c41f310-0a8e-4f07-8905-be74ae8ce6a9
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
[I 2025-06-16 03:32:59,735] Trial 0 finished with value: 0.9972583394897313 and parameters: {'n_estimators': 175, 'learning_rate': 0.1540359659501924, 'max_depth': 13, 'min_child_weight': 6, 'gamma': 0.7800932022121826, 'subsample': 0.662397808134481, 'colsample_bytree': 0.6232334448672797, 'reg_alpha': 0.8661761457749352, 'reg_lambda': 0.6011150117432088}. Best is trial 0 with value: 0.9972583394897313.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
[I 2025-06-16 03:33:02,825] Trial 1 finished with value: 0.9969054833232794 and parameters: {'n_estimators': 242, 'learning_rate': 0.0011152328125494347, 'max_depth': 15, 'min_child_weight': 9, 'gamma': 1.0616955533913808, 'subsample': 0.6727299868828402, 'colsample_bytree': 0.6733618039413735, 'reg_alpha': 0.3042422429595377, '

Best Hyperparameters: {'n_estimators': 281, 'learning_rate': 0.13011978474557598, 'max_depth': 6, 'min_child_weight': 9, 'gamma': 4.1747211925729495, 'subsample': 0.7897682171143787, 'colsample_bytree': 0.949562507267186, 'reg_alpha': 0.7590018501969391, 'reg_lambda': 0.7653776085769688}
Best Composite Score: 0.9974


In [115]:
import xgboost as xgb
import numpy as np
import pandas as pd

# Final params with added classification essentials
best_params = {
    'n_estimators': 281,
    'learning_rate': 0.13011978474557598,
    'max_depth': 6,
    'min_child_weight': 9,
    'gamma': 4.1747211925729495,
    'subsample': 0.7897682171143787,
    'colsample_bytree': 0.949562507267186,
    'reg_alpha': 0.7590018501969391,
    'reg_lambda': 0.7653776085769688,
    'scale_pos_weight': len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'verbosity': 0
}

# Train on full training data
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Predict on test
test_prob = final_model.predict_proba(test_df)[:, 1]

# Optional: set a threshold (e.g. 0.2 or tuned one)
threshold = 0.2
test_pred = (test_prob >= threshold).astype(int)


In [116]:
# count how many 1
print("Number of predicted fraud cases:", test_pred.sum())

Number of predicted fraud cases: 307


In [117]:
with open("submission.txt", "w") as f:
    for prob, pred in zip(test_prob, test_pred):
        f.write(f"{prob} {pred}\n")


In [None]:
def objective(trial):
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 40),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'max_bin': trial.suggest_int('max_bin', 200, 300),
        'scale_pos_weight': scale_pos_weight,
        'force_col_wise': True,
        'verbosity': -1
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(X_fold_train, y_fold_train)

        y_prob = model.predict_proba(X_fold_val)[:, 1]
        y_pred = (y_prob >= 0.5).astype(int)

        composite_score = calculate_composite_score(y_fold_val, y_pred, y_prob, N=485)
        
        scores.append(composite_score)

    return np.mean(scores)


In [119]:
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50)

print("✅ Best Hyperparameters:", study.best_params)
print("🎯 Best Composite Score:", round(study.best_value, 4))


[I 2025-06-16 03:37:46,762] A new study created in memory with name: no-name-8de7c679-25d7-4505-8de8-0150a6a6e04a
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
[I 2025-06-16 03:37:47,654] Trial 0 finished with value: 0.9968926892825566 and parameters: {'n_estimators': 175, 'learning_rate': 0.1540359659501924, 'max_depth': 13, 'num_leaves': 32, 'min_child_samples': 16, 'subsample': 0.7467983561008608, 'subsample_freq': 1, 'colsample_bytree': 0.9464704583099741, 'reg_alpha': 0.6011150117432088, 'reg_lambda': 0.7080725777960455, 'max_bin': 202}. Best is trial 0 with value: 0.9968926892825566.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
[I 2025-06-16 03:37:49,094] Trial 1 finished with value: 0.997132727987303 and parameters: {'n_estimators': 294, 'learning_rate': 0.0823143373099555, 'max_depth': 7, 'num_leaves': 23, 'min_child_samples': 17, 'subsample': 0.7912726728878613, 'subsample_freq': 3, 'colsample_bytree': 0.7727780074568463

✅ Best Hyperparameters: {'n_estimators': 163, 'learning_rate': 0.13196806161154936, 'max_depth': 13, 'num_leaves': 34, 'min_child_samples': 44, 'subsample': 0.9135120652889412, 'subsample_freq': 2, 'colsample_bytree': 0.6719726711354403, 'reg_alpha': 0.9831168605749027, 'reg_lambda': 0.7761326578394343, 'max_bin': 269}
🎯 Best Composite Score: 0.9976


In [120]:
best_params

{'n_estimators': 281,
 'learning_rate': 0.13011978474557598,
 'max_depth': 6,
 'min_child_weight': 9,
 'gamma': 4.1747211925729495,
 'subsample': 0.7897682171143787,
 'colsample_bytree': 0.949562507267186,
 'reg_alpha': 0.7590018501969391,
 'reg_lambda': 0.7653776085769688,
 'scale_pos_weight': 79.591985428051,
 'objective': 'binary:logistic',
 'eval_metric': 'logloss',
 'use_label_encoder': False,
 'verbosity': 0}

In [122]:
best_params_lgb = {
    'n_estimators': 281,
    'learning_rate': 0.13011978474557598,
    'max_depth': 6,
    'min_child_weight': 9,  # mapped from XGBoost; not used by LGBM directly
    'subsample': 0.7897682171143787,
    'colsample_bytree': 0.949562507267186,
    'reg_alpha': 0.7590018501969391,
    'reg_lambda': 0.7653776085769688,
    'scale_pos_weight': len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
    'max_bin': 255,  # default, you can tune it if needed
    'subsample_freq': 1,  # recommended for subsample < 1.0
    'force_col_wise': True,
    'verbosity': -1
}

final_lgb_model = lgb.LGBMClassifier(**best_params_lgb)
final_lgb_model.fit(X_train, y_train)


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,6
,learning_rate,0.13011978474557598
,n_estimators,281
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,9


In [None]:
test_prob_lgb = final_lgb_model.predict_proba(test_df)[:, 1]

# Threshold: choose based on your tuning or fixed value (e.g. 0.2)
threshold = 0.2
test_pred_lgb = (test_prob_lgb >= threshold).astype(int)