# Random Forest

#### Importando bibliotecas

In [1]:
%matplotlib inline

import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics  import   accuracy_score 
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold

plt.style.use('ggplot')
pd.set_option('display.max_columns', 500) 

In [2]:
def results_holdout(clf, X_train, y_train, X_test, y_test):
    
    classifier = clf.fit(X_train, np.ravel(y_train))
    metrics = [f1_score, precision_score, recall_score, accuracy_score, roc_auc_score]
    
    pred_train = classifier.predict(X_train)
    pred_test = classifier.predict(X_test)

    prob_train = classifier.predict_proba(X_train)
    prob_train_true = pd.DataFrame(prob_train).iloc[:,1] #prob de ser 1
    prob_test = classifier.predict_proba(X_test)
    prob_test_true = pd.DataFrame(prob_test).iloc[:,1]
    
    results = pd.DataFrame()
    
    for m in metrics:
        if m == roc_auc_score:
                result_train = m(y_train, prob_train_true)
                result_test = m(y_test, prob_test_true)
        else:        
            result_train = m(y_train, pred_train)
            result_test = m(y_test, pred_test)
        
        dict_res = {'Treino': [result_train], 'Teste': [result_test]}
        res_aux = pd.DataFrame(dict_res, index = [str(m.__name__)])
        
        results = pd.concat([results, res_aux], axis = 0)
    
    
    train_res = pd.concat([y_train, pd.DataFrame(prob_train).iloc[:,1]], axis = 1)
    test_res = pd.concat([y_test, pd.DataFrame(prob_test).iloc[:,1]], axis = 1)
    train_res.columns = ['y_train', 'prob']
    test_res.columns = ['y_test', 'prob']
    
    train_prob_True1 = train_res[train_res.y_train == 1]
    train_prob_True0 = train_res[train_res.y_train == 0]
    
    test_prob_True1 = test_res[test_res.y_test == 1]
    test_prob_True0 = test_res[test_res.y_test == 0]
    
    dict_res_ks = {'Treino': [stats.ks_2samp(train_prob_True1.prob, train_prob_True0.prob).statistic], 
                   'Teste': [stats.ks_2samp(test_prob_True1.prob, test_prob_True0.prob).statistic]}
    
    res_aux_ks = pd.DataFrame(dict_res_ks, index = ['KS'])
    results = pd.concat([results, res_aux_ks], axis = 0)
    
    return results

def results_cv(clf, X_train, y_train, kcv):
    
    classifier = clf.fit(X_train, np.ravel(y_train))
    metrics = ['f1', 'precision', 'recall', 'accuracy', 'roc_auc']
    
    results = pd.DataFrame()
    for m in metrics:
        cv_results = cross_validate(clf, X_train, np.ravel(y_train), scoring = m, cv = kcv, return_train_score = True)
        dict_res = {'' + m + '_Treino': cv_results['train_score'], 
                    '' + m + '_Teste': cv_results['test_score']}
        res_aux = pd.DataFrame(dict_res)
        results = pd.concat([results, res_aux], axis = 1)
        
    results_mean = np.transpose(pd.DataFrame(results.mean(), columns=['media']))
    results = pd.concat([results, results_mean], axis = 0)
    return results


def grid_search(clf, X_train, y_train, params, score, cv):    
    grid = GridSearchCV(clf, params, scoring = score, cv = cv, return_train_score=True)
    grid_fitted = grid.fit(X_train, np.ravel(y_train))
    print ("Best score: %.4f" % grid_fitted.best_score_)
    print ("Best parameters: %s" % grid_fitted.best_params_)
    return grid_fitted, grid_fitted.best_estimator_, grid_fitted.cv_results_

In [3]:
def report_teste(predictions, alg_name):

    print('Resultados para o classificador {0}:'.format(alg_name))
    print(classification_report(y_teste, predictions), 
    print ("Acurácia para o treino é ", accuracy_score(y_teste,predictions)))
    
def report_treino(predictions, alg_name):

    print('Resultados para o classificador {0}:'.format(alg_name))
    print(classification_report(y_treino, predictions), 
    print ("Acurácia para o treino é ", accuracy_score(y_treino,predictions)))

#### Dados

In [4]:
dataset = pd.read_csv('C:\\Users\\Fabiel Fernando\\Desktop\\PROVA\\classificacao_Q4.csv')
dataset.columns
names = dataset.columns
names_index = names[0:100]
names_index

Index(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20',
       'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29', 'x30',
       'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39', 'x40',
       'x41', 'x42', 'x43', 'x44', 'x45', 'x46', 'x47', 'x48', 'x49', 'x50',
       'x51', 'x52', 'x53', 'x54', 'x55', 'x56', 'x57', 'x58', 'x59', 'x60',
       'x61', 'x62', 'x63', 'x64', 'x65', 'x66', 'x67', 'x68', 'x69', 'x70',
       'x71', 'x72', 'x73', 'x74', 'x75', 'x76', 'x77', 'x78', 'x79', 'x80',
       'x81', 'x82', 'x83', 'x84', 'x85', 'x86', 'x87', 'x88', 'x89', 'x90',
       'x91', 'x92', 'x93', 'x94', 'x95', 'x96', 'x97', 'x98', 'x99'],
      dtype='object')

In [5]:
#Verificando a existência de missings
#dataset.apply(lambda x: x.isnull().sum())

In [6]:
dataset.head(5)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36,x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49,x50,x51,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61,x62,x63,x64,x65,x66,x67,x68,x69,x70,x71,x72,x73,x74,x75,x76,x77,x78,x79,x80,x81,x82,x83,x84,x85,x86,x87,x88,x89,x90,x91,x92,x93,x94,x95,x96,x97,x98,x99,target
0,1.696199,-0.792598,-0.349427,-0.46456,3.187014,0.035976,1.033274,-1.504968,0.204693,1.691204,-0.148668,-4.074097,-0.032896,-0.663494,-0.386016,-0.237805,-1.510523,-1.570864,-0.368605,0.812503,0.549905,-0.73026,0.761423,1.128273,-1.76375,0.579692,-0.293674,0.2955,-0.427231,-0.295434,-2.626552,-0.888908,0.36011,-3.085644,-0.945316,-0.904486,1.072223,1.778115,-0.148051,0.634574,0.209628,0.561244,-0.586968,-3.702351,-0.649087,0.066648,0.521637,-0.318873,-0.964632,-0.068293,-1.941717,0.0113,-0.030974,1.666534,1.907174,0.454065,0.157899,-1.415378,-0.220428,-1.163591,0.643701,-0.593975,-0.23002,2.142668,-1.150896,1.980677,1.115755,0.511176,-0.526043,-0.492225,1.291322,-0.795223,1.292448,0.804562,0.82248,-1.205006,-0.280887,-1.364098,0.312,-1.925461,0.498012,0.371394,0.176175,0.54743,1.058247,0.503351,1.018997,0.221213,-0.419,-0.858737,-0.53436,1.488142,-0.686337,2.08497,-0.68514,-2.049451,2.015426,1.158477,-0.309441,-1.549833,4.0
1,-0.236696,-2.202342,0.024023,1.4977,-0.069758,-2.467088,1.126529,-0.570557,2.079251,-1.882632,-0.827576,1.005103,-0.137394,1.189628,-0.851586,-1.288871,-0.963559,1.227582,0.715197,0.520097,0.588903,-0.590111,-2.210356,1.022461,-1.039452,-0.241972,0.282824,0.001147,-1.621286,-1.81576,0.663234,-0.20891,0.113045,2.046566,0.761385,1.412045,2.094611,-0.286475,0.718189,-0.421027,1.182153,0.379603,-0.835262,0.937721,0.114378,-0.65173,-0.04716,3.589095,-0.486826,2.847869,0.162564,-0.039426,0.462479,-1.531158,-1.860289,0.45575,2.220489,1.212844,-1.32969,-1.452428,0.053086,-0.574263,-2.51865,-1.73764,-0.194589,0.648973,-0.342163,-0.508209,0.947281,-0.430554,0.661217,-1.936414,-1.698198,-3.313671,-0.183713,-0.549041,1.28062,2.177973,0.706155,-1.002186,-0.760492,0.39023,1.652978,-0.281058,-2.274763,-1.451749,-0.594344,1.292452,1.06612,0.036062,0.498207,0.405567,0.509564,1.374071,-0.016943,-0.42928,-0.895016,1.259566,-0.354139,0.806797,5.0
2,-0.436683,1.563816,-0.895999,-0.580425,0.31106,-0.187369,0.805249,-2.399522,-0.578818,1.586981,-1.941955,-0.596377,-0.489321,-1.030148,-0.485569,0.902347,0.107147,-0.780838,0.402332,-1.45017,-0.583627,-0.706544,-0.025883,-1.450107,2.118729,1.015845,0.166787,-0.04401,-0.360155,0.101155,-0.799201,-1.102617,2.115397,-2.361777,0.525674,-1.911165,0.123961,-0.417771,0.548105,-0.217684,-0.431924,-0.442644,-1.489144,-1.000744,0.862522,-0.563455,0.588636,0.010576,-0.456408,-1.428348,0.216525,1.29035,-1.09207,0.522418,2.553921,0.087687,1.755408,-1.382265,0.032006,0.680842,0.911192,0.50537,-0.741637,0.980315,2.35912,-0.380329,0.234811,2.287361,-0.568738,-1.93231,-1.912456,-1.829811,-0.589138,0.473086,-0.23706,-0.106093,-0.69006,-0.64096,-1.088658,-0.998397,-1.579437,-0.697638,-0.620487,-0.320028,1.390414,0.449638,0.300941,-0.512526,0.656667,0.265771,-2.630024,0.933578,-1.285978,0.503162,0.204829,-0.753835,0.290033,1.721487,1.304518,0.478903,3.0
3,1.425908,0.400055,-0.305038,-0.930251,-2.214549,1.763379,-0.239868,-2.058891,-1.006533,-2.156839,-0.81731,3.135035,-1.046031,2.035231,0.307369,-0.831289,-0.263652,-1.47907,-0.675276,-0.222479,-0.4411,0.343649,0.210042,-2.030159,0.636847,-2.268783,1.066813,1.486655,0.665269,1.207031,3.549965,-0.026904,1.027441,1.979429,1.133188,1.70945,1.04651,1.397032,0.177327,-0.402179,-0.054244,-0.578126,-0.055127,2.794188,0.528181,-0.140851,-0.320488,-0.552952,-2.406692,0.054562,0.886823,-0.419061,-0.272393,-2.141239,-0.114749,0.230638,-0.250862,1.116209,1.452902,0.927677,-0.136729,-0.873607,0.430335,0.82897,0.313719,0.378332,-0.586515,-1.448876,-0.149765,-0.958114,-1.478115,-2.388252,-1.569214,-2.755844,-1.098166,1.450431,1.134263,2.586703,-0.22475,-0.036701,2.264622,-0.0352,0.217302,0.038805,-0.604043,-1.798876,-2.307973,1.441341,2.31182,-0.947016,-0.260665,-0.849927,1.402768,0.393653,-1.466818,0.152257,-4.00495,0.676342,-1.927319,1.959032,8.0
4,-0.186156,-0.975764,0.59466,-1.18198,-1.443414,-0.797651,-1.252608,-0.060452,0.130702,-2.343517,0.892393,-0.533092,-0.760388,-0.702277,0.259456,3.732211,1.185647,2.046445,-1.378246,-0.733557,4.716702,0.229157,1.955133,1.917857,-1.783127,-0.839499,-1.811106,-0.405222,0.074332,2.034061,0.17922,-0.458617,-3.470883,0.561481,0.492969,1.310855,0.50579,-1.135986,-0.696156,0.815568,-0.266634,0.245124,1.244601,0.930504,-2.423524,-0.217978,-0.250712,-0.180181,1.57962,-1.239677,-0.91766,1.345773,0.545109,2.444263,-1.24419,0.446668,0.178714,-0.714363,0.310813,-4.723429,1.02538,0.567891,-1.21582,0.061255,1.798139,-0.254473,0.091907,0.680257,1.232538,-0.482364,1.012526,-0.554645,0.451229,0.484063,2.46672,0.102488,-0.574971,-2.885352,0.91171,-0.846603,0.850602,2.22244,-1.981894,0.156248,-2.788302,-0.067919,1.352606,-1.878879,-0.943184,-0.185896,1.098563,-1.444435,-1.818126,0.446574,0.239328,0.802939,-2.035289,-1.433793,-0.218596,0.619317,9.0


In [7]:
print("Dimensão dos nossos dados:\n", 
     dataset.shape)
#print("Tipo de variáveis:\n",
#     dataset.dtypes)

Dimensão dos nossos dados:
 (1500, 101)


#### Pocentagem da variável resposta

In [8]:
resposta = dataset['target']
count = pd.DataFrame(resposta.value_counts())
percent = pd.DataFrame(resposta.value_counts(normalize = True)*100)
table = pd.concat([count, percent], axis = 1)
table.columns = ['# target', '% target']
table

Unnamed: 0,# target,% target
2.0,153,10.2
8.0,153,10.2
3.0,152,10.133333
0.0,150,10.0
4.0,150,10.0
1.0,149,9.933333
6.0,149,9.933333
9.0,149,9.933333
5.0,148,9.866667
7.0,147,9.8


In [9]:
#Descritiva de algumas variáveis
#dataset.describe()

#### Treino e Teste

In [10]:
feature_space = dataset.iloc[:, dataset.columns != 'target']
feature_class = dataset.iloc[:, dataset.columns == 'target']


X_treino, X_teste, y_treino, y_teste = train_test_split(feature_space,
                                                                    feature_class,
                                                                    test_size = 0.30, 
                                                                    random_state = 42)

In [11]:
# Limpar conjuntos de teste para evitar futuras mensagens de aviso
y_treino = y_treino.values.ravel() 
y_teste = y_teste.values.ravel() 

## Ajustando Random Forest

In [13]:
#criterio = "gini"
criterio = "entropy"

In [14]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(criterion = criterio,
                                       max_depth=3, #máximo de divisões para todas as árvores na floresta
                                       min_samples_leaf = 5, 
                                       min_samples_split = 10,   
                                       n_estimators=100, 
                                       max_features='auto', #máximo de variáveis que serão usados na divisão de nós
                                       oob_score=True, 
                                       random_state=42, 
                                       n_jobs=-1)
classifier.fit(X_treino, y_treino)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

#### Precisão do classificador

In [15]:
pred_test = classifier.predict(X_teste)
pred_train = classifier.predict(X_treino)

#### Tabela com cálculo de vária métricas conjunto treino

In [16]:
report_treino(pred_train,'treino')

Resultados para o classificador treino:
Acurácia para o treino é  0.6361904761904762
             precision    recall  f1-score   support

        0.0       0.65      0.83      0.73       103
        1.0       0.56      0.55      0.56        98
        2.0       0.76      0.35      0.48       111
        3.0       0.70      0.69      0.69       105
        4.0       0.58      0.54      0.56       104
        5.0       0.71      0.57      0.63        97
        6.0       0.60      0.64      0.62        98
        7.0       0.61      0.68      0.64       111
        8.0       0.64      0.68      0.66       106
        9.0       0.63      0.82      0.71       117

avg / total       0.65      0.64      0.63      1050
 None


#### Tabela com cálculo de vária métricas conjunto teste

In [17]:
report_teste(pred_test,'treino')

Resultados para o classificador treino:
Acurácia para o treino é  0.5755555555555556
             precision    recall  f1-score   support

        0.0       0.64      0.94      0.76        47
        1.0       0.46      0.51      0.49        51
        2.0       0.55      0.29      0.37        42
        3.0       0.65      0.70      0.67        47
        4.0       0.46      0.37      0.41        46
        5.0       0.76      0.57      0.65        51
        6.0       0.69      0.43      0.53        51
        7.0       0.53      0.58      0.55        36
        8.0       0.66      0.66      0.66        47
        9.0       0.41      0.75      0.53        32

avg / total       0.59      0.58      0.57       450
 None


## Ajustando o classificador com Grid Search

In [20]:
# random state para reproducibilidade
fit_rf = RandomForestClassifier(random_state=42)

In [21]:
np.random.seed(42)
cv_kfold = KFold(10, shuffle = False)

param_grid = {
                "n_estimators" : [9, 18, 27, 36, 45, 54, 63],
                "max_depth" : [1, 5, 10, 15, 20, 25, 30],
                "min_samples_leaf" : [1, 2, 4, 6, 8, 10]
             }

In [22]:
cv_rf = GridSearchCV(fit_rf,
                     cv = cv_kfold,
                     param_grid = param_grid,
                     scoring = 'accuracy',
                     return_train_score= True,
                     n_jobs = 3)

In [23]:
gridcv_rf = cv_rf.fit(X_treino, y_treino)

In [24]:
modelo_randomforest = gridcv_rf.best_estimator_

In [25]:
gridcv_rf.best_score_

0.7066666666666667

In [26]:
pd.DataFrame(gridcv_rf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.053431,0.004062,0.002467,0.002378,1,1,9,"{'max_depth': 1, 'min_samples_leaf': 1, 'n_est...",0.457143,0.419048,0.352381,0.428571,0.333333,0.352381,0.285714,0.295238,0.314286,0.247619,0.348571,0.064481,289,0.387302,0.440212,0.396825,0.407407,0.385185,0.374603,0.364021,0.376720,0.360847,0.368254,0.386138,0.022723
1,0.097073,0.005957,0.003687,0.001523,1,1,18,"{'max_depth': 1, 'min_samples_leaf': 1, 'n_est...",0.447619,0.428571,0.457143,0.352381,0.428571,0.314286,0.323810,0.333333,0.342857,0.295238,0.372381,0.058001,271,0.429630,0.441270,0.461376,0.431746,0.429630,0.369312,0.427513,0.421164,0.411640,0.396825,0.422011,0.023836
2,0.140210,0.002014,0.006287,0.001364,1,1,27,"{'max_depth': 1, 'min_samples_leaf': 1, 'n_est...",0.457143,0.447619,0.466667,0.333333,0.380952,0.295238,0.342857,0.323810,0.352381,0.333333,0.373333,0.058678,265,0.415873,0.480423,0.443386,0.430688,0.414815,0.354497,0.445503,0.400000,0.416931,0.409524,0.421164,0.031309
3,0.183043,0.003091,0.008233,0.001711,1,1,36,"{'max_depth': 1, 'min_samples_leaf': 1, 'n_est...",0.457143,0.380952,0.457143,0.323810,0.380952,0.304762,0.333333,0.333333,0.361905,0.419048,0.375238,0.051675,253,0.409524,0.427513,0.430688,0.431746,0.408466,0.365079,0.440212,0.417989,0.448677,0.427513,0.420741,0.022031
4,0.228920,0.002781,0.010893,0.002515,1,1,45,"{'max_depth': 1, 'min_samples_leaf': 1, 'n_est...",0.428571,0.380952,0.447619,0.333333,0.419048,0.342857,0.342857,0.314286,0.361905,0.352381,0.372381,0.042687,271,0.421164,0.442328,0.459259,0.449735,0.430688,0.360847,0.439153,0.429630,0.448677,0.423280,0.430476,0.025950
5,0.270407,0.002817,0.011671,0.001864,1,1,54,"{'max_depth': 1, 'min_samples_leaf': 1, 'n_est...",0.419048,0.409524,0.419048,0.333333,0.409524,0.342857,0.333333,0.371429,0.361905,0.342857,0.374286,0.034615,259,0.402116,0.415873,0.462434,0.445503,0.420106,0.367196,0.434921,0.438095,0.431746,0.427513,0.424550,0.024726
6,0.315651,0.005703,0.013005,0.001683,1,1,63,"{'max_depth': 1, 'min_samples_leaf': 1, 'n_est...",0.380952,0.409524,0.419048,0.323810,0.409524,0.361905,0.295238,0.361905,0.361905,0.304762,0.362857,0.041612,283,0.401058,0.392593,0.440212,0.428571,0.424339,0.361905,0.424339,0.423280,0.410582,0.422222,0.412910,0.021462
7,0.049641,0.004064,0.002533,0.001856,1,2,9,"{'max_depth': 1, 'min_samples_leaf': 2, 'n_est...",0.457143,0.419048,0.352381,0.428571,0.333333,0.352381,0.285714,0.295238,0.314286,0.247619,0.348571,0.064481,289,0.387302,0.440212,0.396825,0.407407,0.385185,0.374603,0.364021,0.376720,0.360847,0.368254,0.386138,0.022723
8,0.093674,0.002742,0.004410,0.002393,1,2,18,"{'max_depth': 1, 'min_samples_leaf': 2, 'n_est...",0.447619,0.428571,0.457143,0.352381,0.428571,0.314286,0.323810,0.333333,0.342857,0.295238,0.372381,0.058001,271,0.429630,0.441270,0.461376,0.431746,0.429630,0.369312,0.427513,0.421164,0.411640,0.396825,0.422011,0.023836
9,0.136141,0.004396,0.008513,0.005375,1,2,27,"{'max_depth': 1, 'min_samples_leaf': 2, 'n_est...",0.457143,0.447619,0.466667,0.333333,0.380952,0.295238,0.342857,0.323810,0.352381,0.333333,0.373333,0.058678,265,0.415873,0.480423,0.443386,0.430688,0.414815,0.354497,0.445503,0.400000,0.416931,0.409524,0.421164,0.031309


#### Variáveis Importantes

In [27]:
def variable_importance(fit):
    importances = fit.feature_importances_
    indices = np.argsort(importances)[::-1]
    return {'importance': importances,
            'index': indices}

In [28]:
var_imp_rf = variable_importance(modelo_randomforest)
importances_rf = var_imp_rf['importance']
indices_rf = var_imp_rf['index']

In [29]:
def print_var_importance(importance, indices, name_index):
    print("Ranking das variáveis mais importantes:")
    for f in range(0, 5):
        i = f
        print("{0}. A variável '{1}' tem uma diminuição média na impureza de {2:.5f}"
              .format(f + 1,
                      names_index[indices[i]],
                      importance[indices[f]]))

In [30]:
print_var_importance(importances_rf, indices_rf, names_index)

Ranking das variáveis mais importantes:
1. A variável 'x57' tem uma diminuição média na impureza de 0.03223
2. A variável 'x69' tem uma diminuição média na impureza de 0.03221
3. A variável 'x41' tem uma diminuição média na impureza de 0.03054
4. A variável 'x79' tem uma diminuição média na impureza de 0.03018
5. A variável 'x35' tem uma diminuição média na impureza de 0.02897


## Validação Cruzada

### K - fold

In [31]:
X = dataset.iloc[:, 0:100].values
y = dataset['target'].astype('category')

In [32]:
from sklearn import model_selection
kfold = model_selection.KFold(n_splits=10, random_state=42)
model = RandomForestClassifier()
scoring = 'accuracy'
results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
results.mean(), results.std()

(0.6639999999999999, 0.04753478258660237)

In [41]:
results

array([0.66666667, 0.69333333, 0.66      , 0.72666667, 0.63333333,
       0.68666667, 0.62      , 0.65333333, 0.73333333, 0.56666667])

### LOOCV

In [33]:
model = RandomForestClassifier()
accuracies = cross_val_score(model, X=X, y=y, cv=LeaveOneOut())
accuracies.mean()

0.672

### Repeat CV

In [34]:
from sklearn.model_selection import RepeatedKFold
cv_repeat = RepeatedKFold(n_splits=6, n_repeats=3, random_state=42)
model = RandomForestClassifier()
accuracies = cross_val_score(model, X=X, y=y, cv=cv_repeat)
accuracies.mean()

0.6542222222222221

### Separando as k primeiras observações para treino e o restante para teste

In [35]:
X_treino = dataset.iloc[0:499, 0:99].values
y_treino = dataset.iloc[0:499, 100].values


X_teste = dataset.iloc[500:1500, 0:99].values
y_teste = dataset.iloc[500:1500, 100].values

In [36]:
#criterio = 'gini'
criterio = 'entropy'

In [37]:
clf = RandomForestClassifier(criterion = criterio,
                                       max_depth=3,
                                       min_samples_leaf = 1, 
                                       min_samples_split = 10,   
                                       n_estimators=100, 
                                       max_features='auto', 
                                       oob_score=True, 
                                       random_state=42, 
                                       n_jobs=-1)
clf.fit(X_treino, y_treino)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

#### Precisão do classificador no Random Forest

In [38]:
pred_teste = clf.predict(X_teste)
pred_treino = clf.predict(X_treino)

#### Métricas Treino

In [39]:
 report_treino(pred_treino, 'Random Forest')

Resultados para o classificador Random Forest:
Acurácia para o treino é  0.687374749498998
             precision    recall  f1-score   support

        0.0       0.59      0.85      0.70        55
        1.0       0.65      0.77      0.70        48
        2.0       0.89      0.38      0.53        45
        3.0       0.68      0.81      0.74        58
        4.0       0.81      0.44      0.57        48
        5.0       0.83      0.54      0.66        46
        6.0       0.60      0.79      0.68        47
        7.0       0.74      0.67      0.70        51
        8.0       0.90      0.63      0.74        41
        9.0       0.64      0.87      0.74        60

avg / total       0.72      0.69      0.68       499
 None


#### Métricas Teste

In [40]:
report_teste(pred_teste, 'Random Forest')

Resultados para o classificador Random Forest:
Acurácia para o treino é  0.57
             precision    recall  f1-score   support

        0.0       0.55      0.80      0.65        95
        1.0       0.56      0.58      0.57       101
        2.0       0.65      0.29      0.40       108
        3.0       0.56      0.76      0.64        94
        4.0       0.63      0.24      0.34       102
        5.0       0.68      0.42      0.52       102
        6.0       0.55      0.75      0.64       102
        7.0       0.56      0.52      0.54        96
        8.0       0.73      0.55      0.63       111
        9.0       0.46      0.88      0.60        89

avg / total       0.60      0.57      0.55      1000
 None
