In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import class_weight
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, confusion_matrix 
import joblib
from imblearn.over_sampling import SMOTE

In [2]:
# read in data 
df = pd.read_csv('../../../data/model_data.csv')
df.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,1,0
1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,1,0
2,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,1
3,181.0,181.0,0.0,21182.0,0.0,1,0,1,0,0,0
4,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,1,0


In [3]:
# split into label (isFraud) and data, drop label from data 
y = df['isFraud'].to_numpy()

df.drop(columns=['isFraud'], inplace=True)
X = df.to_numpy()

In [4]:
# scale values
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
y = scaler.fit_transform(y.reshape(-1,1))

In [5]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# oversample using smote to account for class imbalance
oversample = SMOTE(random_state=1)
X_train, y_train = oversample.fit_resample(X_train,y_train)

In [6]:
# create models
xgb = xgb.XGBClassifier(objective='binary:logistic', random_state=1)
dt = DecisionTreeClassifier(random_state=1)
rf = RandomForestClassifier(random_state=1)

# create parameter grids
xgb_param_grid = {
    'n_estimators': [50, 100],     
    'max_depth': [None, 4],              
    'learning_rate': [0.1, 0.01]  
}

dt_param_grid = {    
    'max_depth': [None, 4],               
}

rf_param_grid = {
    'n_estimators': [50, 100], 
    'max_depth': [None, 4]              
}

scoring = {
    'precision': make_scorer(precision_score),
    'accuracy': make_scorer(accuracy_score),
    'recall': make_scorer(recall_score)
}

# create dictionary of models and their parameter grids
models_params = {xgb: xgb_param_grid, dt: dt_param_grid, rf: rf_param_grid}

In [8]:
# function for training, evaluating, and storing models
def perform_testing(models_params, X_train, y_train, X_test, y_test):
    results = {}

    for model, param_grid in models_params.items():
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, refit='accuracy')
        grid_search.fit(X_train, y_train)

        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        # create best model from best_params
        best_model = model.set_params(**best_params)

        # train the model
        best_model.fit(X_train, y_train)

        # evaluate the model
        y_pred = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        
        # export the model
        model_file = '../../../trained_models/' + model.__class__.__name__ + '.pkl'
        joblib.dump(best_model, model_file)
        
        # store results
        results[model.__class__.__name__] = {'best_params': best_params, 'best_score': best_score,
                                             'accuracy': accuracy, 'precision': precision, 'recall': recall}

    return results

perform_testing(models_params, X_train, y_train, X_test, y_test)

{'XGBClassifier': {'best_params': {'learning_rate': 0.1,
   'max_depth': None,
   'n_estimators': 100},
  'best_score': 0.9956776943874553,
  'accuracy': 0.9933973740377392,
  'precision': 0.16344557348741448,
  'recall': 0.997163695299838},
 'DecisionTreeClassifier': {'best_params': {'max_depth': None},
  'best_score': 0.9791369952396087,
  'accuracy': 0.9992361637187196,
  'precision': 0.6494967436352872,
  'recall': 0.8889789303079416},
 'RandomForestClassifier': {'best_params': {'max_depth': None,
   'n_estimators': 100},
  'best_score': 0.9991723412104591,
  'accuracy': 0.9993723759499493,
  'precision': 0.6984375,
  'recall': 0.9055915721231766}}