In [1]:
import pandas as pd
import numpy as np
import urllib.request
import pickle

import sklearn.model_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score ,confusion_matrix
from imblearn.over_sampling import SMOTE

from tqdm import tqdm

In [2]:
RANDOM_STATE = 42

In [3]:
# load data
data = pickle.load(urllib.request.urlopen('https://github.com/euanbrown247/bank_fraud_project/blob/main/X_y.pkl?raw=true'))
X = data[0]
y = data[1]

In [4]:
model_rf = RandomForestClassifier(
    n_estimators=80,
    max_depth=2,
    n_jobs=-1
)

In [7]:
states = range(0,20)
output = []
for n,seed in enumerate(states):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=seed)


    sm = SMOTE(random_state = seed)
    X_res, y_res = sm.fit_resample(X_train, y_train)

    clf = model_rf

    clf.fit(X_res, y_res)

    #clf.predict(X_test)
    clf.score(X_test, y_test)

    pred = clf.predict(X_test)
    scores = {
        'roundn':n,
        'recall':recall_score(y_test,pred).round(3),  
        'AUC': roc_auc_score(y_test,pred).round(3),  
        'acc': accuracy_score(y_test,pred).round(3),
        'confu':confusion_matrix(y_test,pred)}

    #print(scores)
    output.append(scores)

df = pd.DataFrame(output)

df['model'] = df['modeln'].map({0:'gb',1:'rf',2:'dt','n':'naive'})

print(df.groupby('model')[['recall','AUC']].agg(['mean','var']))

recall    0.6100
AUC       0.7412
acc       0.8696
dtype: float64


array([[431462,  63093],
       [  2124,   3321]])

In [8]:
feature_imp = pd.Series(model_rf.feature_importances_,index=X.columns).sort_values(ascending=False)
feature_imp

housing_status_BA                   0.272014
device_os_windows                   0.186848
keep_alive_session_0.0              0.145603
phone_home_valid_0                  0.110354
current_address_months_count        0.070241
email_is_free_1                     0.061223
payment_type_AC                     0.043924
proposed_credit_limit               0.031016
prev_address_months_count           0.017900
prev_address_bin                    0.016047
under_loan_1                        0.010081
customer_age                        0.009712
credit_risk_score                   0.008888
referred_not referred               0.005801
intended_balcon_amount              0.003921
name_email_similarity               0.003276
bank_months_count                   0.003150
income                              0.000000
month_5                             0.000000
days_since_request                  0.000000
zip_count_4w                        0.000000
velocity_24h                        0.000000
bank_branc