In [1]:
# Instruções basiconas de sempre.

# Lidar com dataframes.
import numpy as np
import pandas as pd

# Lidar com gráficos.
import matplotlib.pyplot as plt
import seaborn as sns
from plotting import (multiple_histograms_plot,
                      bar_plot_with_categorical,
                      plot_confusion_matrix,
                      plot_confusion_matrix_2,
                      plot_roc)

# Lidar com preparação de dados.
from data_prep import data_prep as dp # Eu que fiz esse modulinho ("uuuuuuuuuma bosts!").
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler
                                   , MinMaxScaler)

from sklearn.tree import DecisionTreeClassifier, export

from sklearn.model_selection import (train_test_split
                                     , cross_val_score
                                     , StratifiedKFold
                                     , cross_validate
                                     , GridSearchCV)

# Lidar com validação de modelos.
from sklearn.metrics import (confusion_matrix
                             , accuracy_score
                             , classification_report)

pd.set_option('display.max_columns', None)  
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)




In [2]:
nomeDaBase = "./data/classificacao_Q1.csv"
df = pd.read_csv(nomeDaBase, encoding="utf-8")

print(df.shape)
df.head()

(1800, 13)


Unnamed: 0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,target
0,-1.79796,-1.75901,-2.14257,-0.01878,-0.58601,-3.08224,-1.71369,1.41805,2.29783,1.11745,-3.43483,1.94926,0
1,-2.78612,-1.60622,-4.05247,3.33814,1.44255,-3.95343,-3.32328,0.71019,2.31973,0.57596,-4.22278,0.51171,0
2,-1.31957,2.61125,-0.99663,0.34752,0.85253,4.22652,0.84765,0.8853,-1.0146,1.03488,-0.84593,2.64307,1
3,-3.26219,-2.1151,0.68081,-0.4929,-2.45122,-2.37668,-0.00338,3.95559,1.42793,1.6871,-1.35533,2.82004,0
4,5.26181,1.85527,-2.29242,-0.86763,0.11625,-2.17675,-1.72692,-2.34726,1.98107,-4.58177,-2.26649,-3.33875,1


In [3]:
x = df.drop("target", axis=1)
y = df["target"]

In [5]:
classificador = DecisionTreeClassifier(criterion='entropy')

cross = cross_validate(classificador, x.values, y, cv=10, scoring="neg_log_loss", return_train_score=True, return_estimator=True, n_jobs=-1)

for key in ['test_score', 'train_score']:
    print(f"""{key}: {round(-cross[key].mean(), 3)}""")

print("\n")

dct_importance = {}

for i, model in enumerate(cross['estimator']):
    dct_importance[i] = model.feature_importances_

print("Feature importance:")
dct_importance = pd.DataFrame.from_dict(dct_importance, orient='index', columns=x.columns)
dct_importance.loc["mean", :] = dct_importance.apply(np.mean)
dct_importance

test_score: 3.089
train_score: 0.0


Feature importance:


Unnamed: 0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12
0,0.031645,0.19205,0.004696,0.333331,0.024175,0.026722,0.031399,0.216498,0.005062,0.053914,0.041664,0.038845
1,0.010907,0.187316,0.002044,0.212695,0.038938,0.020245,0.01368,0.353968,0.038942,0.053011,0.038196,0.030058
2,0.024819,0.174523,0.015047,0.342015,0.022118,0.02917,0.02176,0.230078,0.033389,0.045851,0.031557,0.029672
3,0.04068,0.1749,0.012132,0.223571,0.015086,0.02608,0.033301,0.322364,0.030971,0.041081,0.016757,0.063077
4,0.017755,0.184709,0.004935,0.316567,0.016004,0.026767,0.034696,0.229211,0.049029,0.068979,0.021014,0.030335
5,0.048292,0.183645,0.009051,0.210113,0.023728,0.052797,0.049465,0.311936,0.008514,0.050265,0.011295,0.040898
6,0.048025,0.230957,0.003876,0.163073,0.024262,0.022934,0.042065,0.330565,0.035803,0.055024,0.027883,0.015531
7,0.038584,0.176462,0.007314,0.240591,0.019218,0.024595,0.0533,0.330387,0.025945,0.03157,0.019063,0.03297
8,0.049584,0.18697,0.00844,0.225968,0.008591,0.038918,0.05138,0.31449,0.02242,0.038761,0.031605,0.022872
9,0.047384,0.170123,0.009732,0.210495,0.018846,0.046785,0.018351,0.357106,0.037487,0.063142,0.007732,0.012817


## Tunning

In [42]:
classificador = DecisionTreeClassifier()
scoring = "neg_log_loss"

params = {"criterion":["gini", "entropy"]
          , "splitter":["best", "random"]
          , "max_depth":[5, 10, 20]
          , "max_features": ["auto", "sqrt", "log2"]
          , "random_state":[0, 42]}

grid_search = GridSearchCV(estimator=classificador, param_grid=params,
                           scoring=scoring, cv=3, n_jobs=-1)

grid_search = grid_search.fit(x, y)
print(grid_search.best_params_)

grid_search.score(x, y)

{'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'random_state': 0, 'splitter': 'random'}


-0.4062044703641675

In [39]:
classificador = DecisionTreeClassifier(**grid_search.best_params_, random_state=0)

cross = cross_validate(classificador, x.values, y, cv=5, scoring="neg_log_loss", return_train_score=True, return_estimator=True, n_jobs=-1)

for key in ['test_score', 'train_score']:
    print(f"""{key}: {round(-cross[key].mean(), 3)}""")

print("\n")

dct_importance = {}

for i, model in enumerate(cross['estimator']):
    dct_importance[i] = model.feature_importances_

print("Feature importance:")
dct_importance = pd.DataFrame.from_dict(dct_importance, orient='index', columns=x.columns)
dct_importance.loc["mean", :] = dct_importance.apply(np.mean)
dct_importance

test_score: 3.025
train_score: 0.049


Feature importance:


Unnamed: 0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12
0,0.048356,0.052216,0.015553,0.131561,0.049305,0.099104,0.021967,0.419101,0.022274,0.066676,0.031863,0.042025
1,0.052859,0.095584,0.022314,0.091225,0.062351,0.065786,0.027563,0.391257,0.030703,0.08926,0.025037,0.04606
2,0.091,0.093301,0.016944,0.066788,0.069779,0.061933,0.03048,0.370791,0.039131,0.067199,0.045581,0.047073
3,0.034186,0.229545,0.037754,0.097089,0.044252,0.073386,0.032007,0.263278,0.031181,0.073906,0.021068,0.062349
4,0.09543,0.139457,0.008034,0.098442,0.037614,0.054661,0.035231,0.359446,0.025388,0.064209,0.026672,0.055415
mean,0.064366,0.122021,0.02012,0.097021,0.05266,0.070974,0.02945,0.360774,0.029736,0.07225,0.030044,0.050584
