In [2]:
print("Starting setups...")
import sys
import os

# Resolve the working directory and the two directories above it.
dir_path = os.getcwd()
parent_dir = os.path.split(dir_path)[0]
home_dir = os.path.split(parent_dir)[0]
for label, location in (
    ("dir_path", dir_path),
    ("parent_dir", parent_dir),
    ("home_dir", home_dir),
):
    print(f"{label} is", location)

import yaml

# params.yaml under home_dir: its 'data_location' entry is used below.
root_params_path = f"{home_dir}/params.yaml"
print(root_params_path)
with open(root_params_path, 'r') as root_params_file:
    params = yaml.safe_load(root_params_file)
print('params:', params)

# 'data_location' is appended to home_dir as-is (plain string concatenation).
data_folder = home_dir + params['data_location']
print('Data is stored at', data_folder)

# A second params.yaml is read from the working directory into `config`.
with open(f"{dir_path}/params.yaml", "r") as tuning_params_file:
    config = yaml.safe_load(tuning_params_file)

print(f"Current configuration is: {config}")

Starting setups...
dir_path is /Users/zoe/Documents/Bank-account-fraud/code/model_tuning
parent_dir is /Users/zoe/Documents/Bank-account-fraud/code
home_dir is /Users/zoe/Documents/Bank-account-fraud
/Users/zoe/Documents/Bank-account-fraud/params.yaml
params: {'data_location': '/data', 'output_location': '/output', 'code_location': '/code'}
Data is stored at /Users/zoe/Documents/Bank-account-fraud/data
Current configuration is: {'tuning': {'method': 'random_search', 'random_search': {'param_distributions': {'n_estimators': [50, 100, 200, 500], 'learning_rate': [0.01, 0.05, 0.1, 0.2], 'max_depth': [3, 5, 7, 9], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0]}, 'n_iter': 50, 'cv': 3, 'scoring': 'roc_auc', 'random_state': 42}, 'grid_search': {'param_grid': {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.05, 0.1], 'max_depth': [3, 6, 9], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0]}, 'cv': 5, 'scoring': 'roc_auc'}, 'bayesian_optimization':

In [4]:
import pandas as pd
import numpy as np
# Allow up to 500 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 500)
import warnings as wr
# Installs a blanket 'ignore' filter: warnings raised after this point are not shown.
wr.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import xgboost as xgb
from xgboost import XGBClassifier

# NOTE(review): later cells call `fit_model` and `evaluate_model_performance`, but neither
# is defined or imported in the visible cells. They presumably live in the `functions`
# module referenced below — confirm, and restore an explicit import so the notebook
# survives Restart Kernel -> Run All.
# sys.path.append(os.path.abspath("model_tuning"))
# from functions import *

In [5]:
print("Reading training and test data...")
# Read from `data_folder` (home_dir + params['data_location'], resolved in the setup
# cell) instead of a hard-coded "<home_dir>/data" path, so that changing
# `data_location` in params.yaml is actually honoured here.
# The first CSV column is used as the index for every file.
X_train = pd.read_csv(f"{data_folder}/x_train_data.csv", index_col=0)
y_train = pd.read_csv(f"{data_folder}/y_train_data.csv", index_col=0)
X_test = pd.read_csv(f"{data_folder}/x_test_data.csv", index_col=0)
y_test = pd.read_csv(f"{data_folder}/y_test_data.csv", index_col=0)

# Disabled: SMOTE-resampled training set (note these paths point under parent_dir).
# X_train_smote = pd.read_csv(f"{parent_dir}/data/x_train_data_smote.csv", index_col=0)
# y_train_smote = pd.read_csv(f"{parent_dir}/data/y_train_data_smote.csv", index_col=0)

Reading training and test data...


In [11]:
# Wrap the train and test frames, with their labels, in xgboost DMatrix objects.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Random search

In [36]:
import csv

# Start a fresh trial log: opening in 'w' mode truncates any existing file,
# then only the header row is written.
out_file = 'random_search_trials.csv'
headers = ['score', 'params', 'iterations']
with open(out_file, 'w', newline='') as trial_log:
    csv.writer(trial_log).writerow(headers)

In [37]:
def objective(hyperparameters, dtrain, iteration, nfold=10,
              num_boost_round=10000, early_stopping_rounds=3,
              seed=111, stratified=False, verbose_eval=2):
    """Cross-validate one hyperparameter set and return its best mean test AUC.

    Used as the objective function for grid and random search.

    Args:
        hyperparameters: dict passed to ``xgb.cv`` as ``params``.
        dtrain: training data passed to ``xgb.cv`` as ``dtrain``.
        iteration: identifier of the current trial; returned unchanged so the
            caller can log it next to the score.
        nfold: number of cross-validation folds (default 10).
        num_boost_round: upper bound on boosting rounds (default 10000).
        early_stopping_rounds: value forwarded to ``xgb.cv`` as
            ``early_stopping_rounds`` (default 3).
        seed: seed forwarded to ``xgb.cv`` (default 111).
        stratified: forwarded to ``xgb.cv``; the default ``False`` matches the
            previous behaviour, where the argument was not passed at all.
        verbose_eval: progress-printing setting forwarded to ``xgb.cv``
            (default 2).

    Returns:
        list: ``[cv_results, score, hyperparameters, iteration]`` where
        ``score`` is the maximum of ``cv_results['test-auc-mean']`` and
        ``hyperparameters`` is the same dict object that was passed in.
    """
    # The defaults reproduce the values that were previously hard-coded here.
    cv_results = xgb.cv(params=hyperparameters,
                        dtrain=dtrain,
                        num_boost_round=num_boost_round,
                        nfold=nfold,
                        stratified=stratified,
                        early_stopping_rounds=early_stopping_rounds,
                        metrics='auc',
                        seed=seed,
                        verbose_eval=verbose_eval,
                        maximize=True)

    # Results to return: best mean test AUC across the evaluated rounds.
    score = cv_results['test-auc-mean'].max()
    # NOTE(review): the number of boosting rounds is not written back into
    # `hyperparameters`, so callers do not learn how many rounds produced `score`.

    return [cv_results, score, hyperparameters, iteration]

In [38]:
def random_search(dtrain, out_file, iteration=1, seed=None):
    """Run a random hyperparameter search scored by ``objective``.

    Each trial samples one hyperparameter set, evaluates it with
    ``objective(param_grid, dtrain, iteration=i)``, and appends
    ``[score, params, trial number]`` as a row to ``out_file`` right away, so
    finished trials are on disk even if a later trial fails.

    Args:
        dtrain: training data forwarded to ``objective``.
        out_file: path of the CSV file that trial rows are appended to. The
            file is opened in append mode; no header is written here.
        iteration: number of trials to run (default 1).
        seed: optional seed for the sampling generator. ``None`` (default)
            draws fresh entropy, so runs differ as before; pass an int to make
            the sampled hyperparameters reproducible.

    Returns:
        tuple: ``(results, best_params, best_auc)``.
        ``results`` is a DataFrame with columns
        ``['index', 'score', 'params', 'iterations']`` sorted by descending
        score (``index`` is the pre-sort row position, kept for compatibility
        with existing callers). ``best_params`` is the dict of the
        highest-scoring trial (``{}`` if no trial scored above 0) and
        ``best_auc`` is that score (``0`` if none).
    """
    # One generator for the whole search; re-seeding inside the loop made the
    # search impossible to reproduce.
    rng = np.random.default_rng(seed)

    best_auc = 0
    best_params = {}
    records = []

    for i in range(iteration):
        # Cast to built-in float/int so the logged dict repr does not depend
        # on the numpy scalar repr of the installed numpy version.
        param_grid = {
            'eta': float(rng.uniform(0.01, 0.6)),
            'lambda': float(rng.uniform(0.01, 0.2)),
            'alpha': float(rng.uniform(0.01, 0.2)),
            'gamma': float(rng.uniform(0, 20)),
            'max_depth': int(rng.integers(3, 15)),  # upper bound exclusive: 3..14
            'subsample': float(rng.uniform(0.5, 1)),
            'colsample_bytree': float(rng.uniform(0.5, 1)),
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'min_child_weight': float(rng.uniform(0.8, 1.2))
        }

        print(f"Iteration {i} - Params: {param_grid}")

        _, auc_score, used_params, trial_id = objective(param_grid, dtrain, iteration=i)
        records.append({
            "score": auc_score,
            "params": used_params,
            "iterations": trial_id
        })

        # Persist the trial immediately rather than at the end of the search.
        with open(out_file, 'a', newline='') as of_connection:
            csv.writer(of_connection).writerow([auc_score, used_params, trial_id])

        if auc_score > best_auc:
            best_auc = auc_score
            best_params = used_params

    # Build the frame once from the collected records instead of assigning
    # row-by-row into a pre-allocated object frame.
    results = pd.DataFrame(records, columns=['score', 'params', 'iterations'])
    results = results.sort_values('score', ascending=False).reset_index()

    return results, best_params, best_auc

In [None]:
# Run 100 random-search trials; each trial's row is appended to `out_file`.
# NOTE(review): `randome_search_result` is misspelled, but a later cell displays it
# under this exact name — rename both together.
randome_search_result, random_best_params, random_best_auc = random_search(dtrain, out_file, iteration=100)

Iteration 0 - Params: {'eta': 0.46721805598925914, 'lambda': 0.09510100434212998, 'alpha': 0.17309968933620054, 'gamma': 11.608830146061887, 'max_depth': 13, 'subsample': 0.5565887710699626, 'colsample_bytree': 0.8306399829150326, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'min_child_weight': 0.9100921662931908}
[0]	train-auc:0.98577+0.00023	test-auc:0.98518+0.00057
[2]	train-auc:0.99240+0.00016	test-auc:0.99193+0.00023
[4]	train-auc:0.99502+0.00021	test-auc:0.99463+0.00024
[6]	train-auc:0.99642+0.00013	test-auc:0.99602+0.00024
[8]	train-auc:0.99705+0.00008	test-auc:0.99666+0.00018
[10]	train-auc:0.99748+0.00008	test-auc:0.99709+0.00016
[12]	train-auc:0.99773+0.00006	test-auc:0.99733+0.00012
[14]	train-auc:0.99791+0.00005	test-auc:0.99750+0.00010
[16]	train-auc:0.99801+0.00005	test-auc:0.99760+0.00011
[18]	train-auc:0.99810+0.00004	test-auc:0.99767+0.00011
[20]	train-auc:0.99817+0.00003	test-auc:0.99773+0.00012
[22]	train-auc:0.99822+0.00002	test-auc:0.99778+0.00011
[24]	tra

In [34]:
# Display the trial table returned by random_search (sorted by descending score).
randome_search_result

Unnamed: 0,index,score,params,iterations
0,3,0.995739,"{'eta': 0.4962342211782816, 'lambda': 0.036389...",3
1,8,0.993137,"{'eta': 0.014528320923566096, 'lambda': 0.1416...",8
2,4,0.991287,"{'eta': 0.28930043606416267, 'lambda': 0.14398...",4
3,6,0.991038,"{'eta': 0.45219840496816394, 'lambda': 0.16377...",6
4,0,0.988727,"{'eta': 0.4631824758162296, 'lambda': 0.112091...",0
5,7,0.985885,"{'eta': 0.42609232676231074, 'lambda': 0.06385...",7
6,5,0.984544,"{'eta': 0.5053953493001191, 'lambda': 0.038656...",5
7,9,0.982758,"{'eta': 0.3687536141200702, 'lambda': 0.090088...",9
8,2,0.95117,"{'eta': 0.14273993470346705, 'lambda': 0.08265...",2
9,1,0.931302,"{'eta': 0.13571387896278483, 'lambda': 0.18839...",1


In [41]:
# Show the best hyperparameter dict and its cross-validated AUC from the search.
print(random_best_params, random_best_auc)

{'eta': 0.1293721954492453, 'lambda': 0.10185353251947578, 'alpha': 0.19993814693211895, 'gamma': 1.1654553061408257, 'max_depth': 7, 'subsample': 0.6695264268351777, 'colsample_bytree': 0.8312292702925602, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'min_child_weight': 1.0577427635367276} 0.9984456032747424


In [83]:
# Fit a model with the best random-search hyperparameters.
# NOTE(review): `fit_model` is not defined or imported in the visible cells; it
# presumably comes from the commented-out `functions` import — confirm, otherwise
# this cell raises NameError on a fresh kernel.
final_random_search_model = fit_model(random_best_params, dtrain, dtest)

[0]	training-auc:0.94961	testing-auc:0.79071
[2]	training-auc:0.96019	testing-auc:0.80338
[4]	training-auc:0.96548	testing-auc:0.80305
[6]	training-auc:0.96936	testing-auc:0.80759
[8]	training-auc:0.96892	testing-auc:0.80711
[10]	training-auc:0.97048	testing-auc:0.81218
[12]	training-auc:0.97067	testing-auc:0.81332
[14]	training-auc:0.97261	testing-auc:0.81505
[16]	training-auc:0.97340	testing-auc:0.81648
[18]	training-auc:0.97428	testing-auc:0.81798
[20]	training-auc:0.97565	testing-auc:0.82004
[22]	training-auc:0.97672	testing-auc:0.82116
[24]	training-auc:0.97896	testing-auc:0.82451
[26]	training-auc:0.97966	testing-auc:0.82451
[28]	training-auc:0.98073	testing-auc:0.82586
[30]	training-auc:0.98174	testing-auc:0.82602
[32]	training-auc:0.98289	testing-auc:0.82744
[34]	training-auc:0.98399	testing-auc:0.82844
[36]	training-auc:0.98483	testing-auc:0.82904
[38]	training-auc:0.98554	testing-auc:0.82975
[40]	training-auc:0.98616	testing-auc:0.83081
[42]	training-auc:0.98716	testing-auc:0

In [95]:
# Evaluate the fitted model on the test set at a 0.5 threshold, requesting the ROC
# curve and confusion-matrix plots.
# NOTE(review): `evaluate_model_performance` is not defined or imported in the visible
# cells; it presumably comes from the commented-out `functions` import — confirm.
random_search_perfomance = evaluate_model_performance(final_random_search_model, 
                                              X_test, y_test, 
                                              threshold=0.5, 
                                              plot_roc_curve=True, plot_confusion_matrix=True)
print(random_search_perfomance)

Recall (TPR): 0.2475
False Positive Rate (FPR): 0.0213
ROC AUC: 0.84091
