# BalancedRandomForestClassifier com target encoder

In [1]:
import pandas as pd
train = pd.read_csv('../../../dataset/german_credit_train.csv')
train = train.drop(columns=['Unnamed: 0'])
train.head()

Unnamed: 0,age,sex,job,housing,savingAccount,checkingAccount,creditAmount,duration,purpose,risk
0,34,male,3,own,moderate,moderate,6850,15,car,bad
1,22,female,2,rent,little,,433,18,radio/TV,bad
2,44,female,3,free,little,moderate,12579,24,car,bad
3,23,male,1,rent,quite rich,,660,6,repairs,good
4,26,female,2,rent,little,,1388,9,furniture/equipment,good


### Transformando dados categóricos em numéricos

In [2]:
cleanup_nums = {"savingAccount": 
                {"little": 1, "moderate": 2, "quite rich": 3, "rich": 4}
               }
train.replace(cleanup_nums, inplace=True)

In [3]:
cleanup_nums = {"checkingAccount": 
                {"little": 1, "moderate": 2, "quite rich": 3, "rich": 4}
               }
train.replace(cleanup_nums, inplace=True)

In [4]:
train.replace({"risk": {"bad": 0, "good": 1}}, inplace=True)

In [5]:
y_train = train['risk']
train = train.drop(columns=['risk'])

### Tratamento dos valores missing - atribuindo valores mais frequentes

In [6]:
train = train.fillna(1)
train.head()

Unnamed: 0,age,sex,job,housing,savingAccount,checkingAccount,creditAmount,duration,purpose
0,34,male,3,own,2.0,2.0,6850,15,car
1,22,female,2,rent,1.0,1.0,433,18,radio/TV
2,44,female,3,free,1.0,2.0,12579,24,car
3,23,male,1,rent,3.0,1.0,660,6,repairs
4,26,female,2,rent,1.0,1.0,1388,9,furniture/equipment


### Definindo a função para aplicar target encorder e outros encoders e executar o cross-validation

É necessário aplicar o target encoder no dataset treinamento duranto o processo de treinamento para evitar data leak. Se aplicarmos no dataset como um todo teremos um problema de data leak.

In [7]:
import warnings
warnings.filterwarnings('ignore')

from imblearn.ensemble import BalancedRandomForestClassifier 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
import numpy as np

def target_encoder_and_cross_validation(df, y, estimators, label):
    random_seed=1234
    kf = StratifiedKFold(n_splits=5, random_state=random_seed)
    cross_val_f1_score_lst = []

    for train_index_ls, validation_index_ls in kf.split(df, y):
        dftrain, validation = df.iloc[train_index_ls], df.iloc[validation_index_ls]
        target_train, target_val = y.iloc[train_index_ls], y.iloc[validation_index_ls]
        
        dftrain = pd.concat([dftrain, target_train], axis=1)

        sex_enc = dftrain.groupby('sex')['risk'].mean().to_dict()
        housing_enc = dftrain.groupby('housing')['risk'].mean().to_dict()
        purpose_enc = dftrain.groupby('purpose')['risk'].mean().to_dict()    
        dftrain['sex'].replace(sex_enc, inplace=True)
        dftrain['housing'].replace(housing_enc, inplace=True)
        dftrain['purpose'].replace(purpose_enc, inplace=True)
        
        dftrain = dftrain.drop(columns=['risk'])
    
        # training the model on oversampled 4 folds of training set
        rf = BalancedRandomForestClassifier(n_estimators=estimators, random_state=random_seed)
        rf.fit(dftrain, target_train)
        
        # testing on 1 fold of validation set

        validation['sex'].replace(sex_enc, inplace=True)
        validation['housing'].replace(housing_enc, inplace=True)
        validation['purpose'].replace(purpose_enc, inplace=True)
        validation_preds = rf.predict(validation)
        
        cross_val_f1_score_lst.append(f1_score(target_val, validation_preds, pos_label=label))
    return np.mean(cross_val_f1_score_lst), rf, sex_enc, housing_enc, purpose_enc

Using TensorFlow backend.


### Escolhendo os melhores hiperparâmetros para o BalancedRandomForestClassifier

In [8]:
f1score, clf, sex_enc, housing_enc, purpose_enc = target_encoder_and_cross_validation(train, y_train, 700, 1)

In [9]:
print(f1score)
print(clf.feature_importances_)

0.6828416844422767
[0.2024247  0.04765052 0.04953979 0.05298884 0.03913805 0.06065622
 0.26613985 0.18377574 0.09768628]


In [None]:
min_estimators = 100
max_estimators = 4000
step = 100
result = []
for i in range(min_estimators, max_estimators+step, step):
    result.append((i, target_encoder_and_cross_validation(df, y_train, i, 1)))

In [None]:
import matplotlib.pyplot as plt
plt.plot(*zip(*result))

In [None]:
max(result,key=lambda item:item[1])

In [19]:
#import warnings
#warnings.filterwarnings('ignore')
#from imblearn.ensemble import BalancedRandomForestClassifier 
#clf = BalancedRandomForestClassifier(n_estimators=600, max_depth=None, random_state=1234)
#s = make_scorer(f1_score, pos_label=1)
#scores = cross_val_score(clf, df, y_train, cv=5, scoring=s)
#y_pred = cross_val_predict(clf, df, y_train, cv=5)
#print("f1_score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

In [None]:
#from sklearn.metrics import confusion_matrix
#print(confusion_matrix(y_train, y_pred))
#from sklearn.metrics import classification_report, plot_confusion_matrix
#print(classification_report(y_train, y_pred))

In [None]:
#train.head()

In [10]:
train = pd.concat([train, y_train], axis=1)
sex_enc = train.groupby('sex')['risk'].mean().to_dict()
housing_enc = train.groupby('housing')['risk'].mean().to_dict()
purpose_enc = train.groupby('purpose')['risk'].mean().to_dict()    
train['sex'].replace(sex_enc, inplace=True)
train['housing'].replace(housing_enc, inplace=True)
train['purpose'].replace(purpose_enc, inplace=True)
train = train.drop(columns=['risk'])

clf = BalancedRandomForestClassifier(n_estimators=700, random_state=1234)
clf.fit(train, y_train)
print(clf.feature_importances_)
print(train.columns)

[0.20863018 0.04253377 0.05694851 0.0514809  0.03970228 0.05427421
 0.2631686  0.18080285 0.10245872]
Index(['age', 'sex', 'job', 'housing', 'savingAccount', 'checkingAccount',
       'creditAmount', 'duration', 'purpose'],
      dtype='object')


In [None]:
#pred = clf.predict(train)
#from sklearn.metrics import confusion_matrix
#print(confusion_matrix(y_train, pred))
#from sklearn.metrics import classification_report, plot_confusion_matrix
#print(classification_report(y_train, pred))

# Validação final no dataset de validação

Considerando os resultados obtidos até então com este pipeline, vamos fazer a validação final considerando este pipeline.

In [11]:
validacao = pd.read_csv('../../../dataset/german_credit_validation.csv')
validacao = validacao.drop(columns=['Unnamed: 0'])
validacao.head()

Unnamed: 0,age,sex,job,housing,savingAccount,checkingAccount,creditAmount,duration,purpose,risk
0,24,male,2,own,little,,1800,18,radio/TV,good
1,33,female,2,own,,,1927,24,education,good
2,30,male,3,free,,,12680,21,car,bad
3,39,female,2,own,little,moderate,1188,21,business,bad
4,44,male,2,free,little,,5507,24,repairs,good


In [12]:
cleanup_nums = {"savingAccount": 
                {"little": 1, "moderate": 2, "quite rich": 3, "rich": 4}
               }
validacao.replace(cleanup_nums, inplace=True)

cleanup_nums = {"checkingAccount": 
                {"little": 1, "moderate": 2, "quite rich": 3, "rich": 4}
               }
validacao.replace(cleanup_nums, inplace=True)

validacao.replace({"risk": {"good": 0, "bad":1}}, inplace=True)

validacao = validacao.fillna(1)

validacao.head()

Unnamed: 0,age,sex,job,housing,savingAccount,checkingAccount,creditAmount,duration,purpose,risk
0,24,male,2,own,1.0,1.0,1800,18,radio/TV,0
1,33,female,2,own,1.0,1.0,1927,24,education,0
2,30,male,3,free,1.0,1.0,12680,21,car,1
3,39,female,2,own,1.0,2.0,1188,21,business,1
4,44,male,2,free,1.0,1.0,5507,24,repairs,0


In [13]:
print(sex_enc)
print(housing_enc)
print(purpose_enc)

{'female': 0.6263736263736264, 'male': 0.7464114832535885}
{'free': 0.5652173913043478, 'own': 0.7413394919168591, 'rent': 0.673469387755102}
{'business': 0.7, 'car': 0.6966824644549763, 'domestic appliances': 0.6666666666666666, 'education': 0.4857142857142857, 'furniture/equipment': 0.7075471698113207, 'radio/TV': 0.7904191616766467, 'repairs': 0.6, 'vacation/others': 0.6}


In [14]:
validacao['sex'].replace(sex_enc, inplace=True)
validacao['housing'].replace(housing_enc, inplace=True)
validacao['purpose'].replace(purpose_enc, inplace=True)
validacao.head()

Unnamed: 0,age,sex,job,housing,savingAccount,checkingAccount,creditAmount,duration,purpose,risk
0,24,0.746411,2,0.741339,1.0,1.0,1800,18,0.790419,0
1,33,0.626374,2,0.741339,1.0,1.0,1927,24,0.485714,0
2,30,0.746411,3,0.565217,1.0,1.0,12680,21,0.696682,1
3,39,0.626374,2,0.741339,1.0,2.0,1188,21,0.7,1
4,44,0.746411,2,0.565217,1.0,1.0,5507,24,0.6,0


In [15]:
y_validacao = validacao['risk']
validacao = validacao.drop(columns=['risk'])
print(validacao.shape, y_validacao.shape)

(400, 9) (400,)


In [16]:
pred_validacao = clf.predict(validacao)

In [17]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_validacao, pred_validacao))
from sklearn.metrics import classification_report, plot_confusion_matrix
print(classification_report(y_validacao, pred_validacao))

[[118 156]
 [ 80  46]]
              precision    recall  f1-score   support

           0       0.60      0.43      0.50       274
           1       0.23      0.37      0.28       126

    accuracy                           0.41       400
   macro avg       0.41      0.40      0.39       400
weighted avg       0.48      0.41      0.43       400

