# Random forest classifier introduction
* This is an ensemble bagging method from scikit-learn (`sklearn.ensemble`)
* There is a risk of overfitting
* To avoid overfitting, `max_depth` should not be larger than 15
* Because it is a bagging method, good results are not expected on imbalanced data
* Does not need scaling or normalisation

In [None]:
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# For Kaggle data, use this instead:
# date = ""

# For WL data: source tag and export date used by the cells below.
source = "WL"
date = "20241118"

# Switch tested by the plotting cell to decide whether an image path is used.
saveImage = False

# Identifiers passed to the result-recording helpers.
packageName = "07-sklearn.ensemble.a-bagging"
classifierName = "2-RandomForestClassifier"
extraParameterName = "1-Default"

print("done")

In [None]:
import sys
import os
from importlib import reload

# Make the project's helper scripts importable.
# The path is built from its components (os.path.join with a single argument
# returns it unchanged), and it is only appended when missing so that
# re-running this cell does not pile up duplicate sys.path entries.
fpath = os.path.join('..', 'scripts')
if fpath not in sys.path:
    sys.path.append(fpath)

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Loading internal scripts
import frauddetection as fd
import sourcedata as sd
import dataimport as di
import result as resultMd

# Reload the helper modules so that edits made to them are picked up without
# restarting the kernel.
reload(fd)
reload(sd)
reload(di)
reload(resultMd)

print('done')

In [None]:
# Hyperparameters results
# Values retained for the tuned model; they are read again by the cell that
# fits the final classifier.
max_depthFound = 31
n_estimatorsFound = 37

# Record the retained values through the result helper module.
resultMd.update_hyperparameter_config_result(
    packageName,
    classifierName,
    extraParameterName,
    max_depthFound,
    n_estimatorsFound,
)

print("done")

In [None]:
import pandas as pd

# Cleaned export for the configured source/date; in this notebook it is only
# used to derive the list of predictor columns.
dfTrx0 = pd.read_csv(f"../data/cleaned/{source}export{date}.csv")
predictors = fd.getPredictors(dfTrx0)

# Working frame returned by the scaling helper (presumably standard-scaled,
# going by the helper's name - confirm in frauddetection).
dfTrx = fd.getStandardScaledData(f"export{date}.csv", source, predictors)

# Last expression of the cell: shows the first five rows.
dfTrx.head(5)

# Test with default hyperparameters

In [None]:
%%script false

from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from datetime import datetime

then= datetime.now()
predictors = fd.getPredictors(dfTrx0)

x_train, x_test, y_train, y_test, scaler =fd.split_data(dfTrx,predictors)

modelClf = RandomForestClassifier(random_state=42)
modelClf.fit(x_train, y_train)
predsTrain = modelClf.predict(x_train)
predsTest = modelClf.predict(x_test)

now = datetime.now()
duration= now - then
duration_in_s = duration.total_seconds()
print("Duration ",duration_in_s)
resultMd.update_time_response_result('07-sklearn.ensemble.a-bagging','RandomForest','1-Default', duration_in_s)
f1,mcc,roc=fd.print_scores(y_test, predsTest,'all')
resultMd.update_performance_test_result('07-sklearn.ensemble-a-bagging','RandomForest','1-Default', f1,mcc,roc)

fd.print_scores(y_train, predsTrain,'f1', False)
fd.print_scores(y_test, predsTest,'f1')
fd.show_importance(modelClf, predictors)
fd.show_confusion_matrix(y_test, predsTest)


In [None]:
%%script false
import numpy as np

depth=[]
for tree in modelClf.estimators_:
    depth=np.append(depth, tree.get_depth())
print(np.max(depth))

# Default parameters conclusion
The overfitting is probably due to the default parameter `max_depth=None`, which puts no limit on the depth of the trees.

# Hyperparameters tuning

In [None]:
%%script false

from scipy.stats import randint

modelClf = RandomForestClassifier(random_state=42)
dic_param={
    'n_estimators': randint(35,90),
    'max_depth': randint(17,33)
}
res=fd.hyperparameterSelectionRandomizedSearchCV(modelClf, dic_param, 'f1', dfTrx, predictors, [], None,iter=4)
print(res)

#[CV 5/5; 10/10] END max_depth=20, n_estimators=106;, score=0.121 total time= 2.8min
#{'max_depth': 22, 'n_estimators': 60}
#0.16552081045379466
#score   0.8500551267916208
#scoref1 0.8500551267916208

#{'max_depth': 29, 'n_estimators': 46}
#0.20666197372708545
#score   0.9669135802469135
#scoref1 0.9669135802469135


In [None]:
%%script false

from sklearn.ensemble import RandomForestClassifier

modelClf = RandomForestClassifier(random_state=42)

dic_param={
    'n_estimators':[36,37,38],
    'max_depth':[30,31,32]
}
res=fd.hyperparameterSelectionGridSearchCV(modelClf, dic_param, 'f1', dfTrx, predictors, [], None)
print(res)



#{'max_depth': 32, 'n_estimators': 39}
#0.21652747099234754
#scoref1 0.9776264591439688

#[CV 5/5; 25/25] END max_depth=34, n_estimators=41;, score=0.221 total time= 1.6min
#{'max_depth': 32, 'n_estimators': 37}
#0.21685312600926157
#scoref1 0.9755620723362659

#{'max_depth': 31, 'n_estimators': 37}
#0.21358217338757218
#scoref1 0.9724950884086444


In [None]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from datetime import datetime

# --- Fit the forest with the retained hyperparameters, timing the whole run ---
then = datetime.now()
predictors = fd.getPredictors(dfTrx0)
x_train, x_test, y_train, y_test, scaler = fd.split_data(dfTrx, predictors)

parameters = {'max_depth': max_depthFound, 'n_estimators': n_estimatorsFound}
modelClf = RandomForestClassifier(random_state=42, **parameters)

modelClf.fit(x_train, y_train)
predsTrain = modelClf.predict(x_train)
predsTest = modelClf.predict(x_test)

# Elapsed time covers predictor lookup, split, fit and both predictions.
duration = datetime.now() - then
duration_in_s = duration.total_seconds()
print("Duration ", duration_in_s)

# --- Record timing and test-set scores ---
resultMd.update_time_response_result(
    '07-sklearn.ensemble.a-bagging', 'RandomForest', '2-After tuning', duration_in_s
)
f1, mcc, roc = fd.print_scores(y_test, predsTest, 'all')
resultMd.update_performance_test_result(
    '07-sklearn.ensemble.a-bagging', 'RandomForest', '2-After tuning', f1, mcc, roc
)

# --- Train vs test F1 and diagnostic plots ---
f1Train = fd.print_scores(y_train, predsTrain, 'f1', False)
f1Test = fd.print_scores(y_test, predsTest, 'f1')
fd.show_importance(modelClf, predictors)
fd.show_confusion_matrix(y_test, predsTest)
histogramPath = '../imgs/FD07A-RandomForestClassifierProbaHistogram' if saveImage else None
fd.show_prediction_graph(modelClf, x_test, y_test, histogramPath)

# Gap between train and test F1, reused by the next-files cell.
diffF1 = f1Train - f1Test
print("diffF1", diffF1)

# Importance the fitted forest assigns to the 'trx_amount_log10' feature.
importanceTable = pd.DataFrame(
    {'Feature': predictors, 'Feature importance': modelClf.feature_importances_}
)
isAmountRow = importanceTable['Feature'] == 'trx_amount_log10'
amountImportance = importanceTable.loc[isAmountRow, 'Feature importance'].values[0]
print("amount importance", amountImportance)

In [None]:
# Score the already fitted model on every file returned by the helper and
# record the F1 / ROC values of the first four files.
files = fd.getAllFiles()
predictors = fd.getPredictors(dfTrx0)

# Named fileIndexes rather than `range`, so the builtin range() stays usable
# in every cell executed after this one.
fileIndexes = []
f1s = []
rocs = []
for fileIndex, file in enumerate(files, start=1):
    fileIndexes.append(fileIndex)
    print(file)

    # Kept in its own variable so the frame `dfTrx` loaded for training is not
    # overwritten; re-running the fitting cell afterwards still sees that data.
    dfFile = fd.getStandardScaledData(file, source, predictors)
    preds = modelClf.predict(dfFile[predictors])

    # NOTE(review): the other cells pass 'all' (lower case) to print_scores;
    # confirm the helper treats 'All' the same way.
    f1File, mccFile, rocFile = fd.print_scores(dfFile['Class'], preds, 'All', True)
    fd.show_confusion_matrix(dfFile['Class'], preds)
    f1s.append(f1File)
    rocs.append(rocFile)

fd.plt_train_test(fileIndexes, f1s)

# The result helper is given exactly four F1 and four ROC values; fail with an
# explicit message instead of an opaque IndexError when fewer files exist.
if len(f1s) < 4:
    raise ValueError(
        f"expected at least 4 files to evaluate, got {len(f1s)}"
    )

# NOTE(review): extraParameterName is '1-Default' in the configuration cell,
# while the model scored here is the tuned one (filed as '2-After tuning' by
# the fitting cell) - confirm the intended key.
resultMd.update_performance_nextdays_result(
    packageName, classifierName, extraParameterName,
    f1s[0], f1s[1], f1s[2], f1s[3],
    rocs[0], rocs[1], rocs[2], rocs[3],
    diffF1, amountImportance,
)


# 