In [27]:
# Instruções basiconas de sempre.

# Lidar com dataframes.
import numpy as np
import pandas as pd

# Lidar com gráficos.
import matplotlib.pyplot as plt
import seaborn as sns
from plotting import (multiple_histograms_plot,
                      bar_plot_with_categorical,
                      plot_confusion_matrix,
                      plot_confusion_matrix_2,
                      plot_roc)

# Lidar com preparação de dados.
from data_prep import data_prep as dp # Eu que fiz esse modulinho ("uuuuuuuuuma bosts!").
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler
                                   , MinMaxScaler)

from sklearn.tree import DecisionTreeClassifier, export

from sklearn.model_selection import (train_test_split
                                     , cross_val_score
                                     , StratifiedKFold
                                     , cross_validate
                                     , GridSearchCV)

# Lidar com validação de modelos.
from sklearn.metrics import (confusion_matrix
                             , accuracy_score
                             , classification_report)

pd.set_option('display.max_columns', None)  
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)


In [28]:
nomeDaBase = "./data/classificacao_Q1.csv"
df = pd.read_csv(nomeDaBase, encoding="utf-8")

print(df.shape)
df.head()

(1800, 13)


Unnamed: 0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,target
0,-1.79796,-1.75901,-2.14257,-0.01878,-0.58601,-3.08224,-1.71369,1.41805,2.29783,1.11745,-3.43483,1.94926,0
1,-2.78612,-1.60622,-4.05247,3.33814,1.44255,-3.95343,-3.32328,0.71019,2.31973,0.57596,-4.22278,0.51171,0
2,-1.31957,2.61125,-0.99663,0.34752,0.85253,4.22652,0.84765,0.8853,-1.0146,1.03488,-0.84593,2.64307,1
3,-3.26219,-2.1151,0.68081,-0.4929,-2.45122,-2.37668,-0.00338,3.95559,1.42793,1.6871,-1.35533,2.82004,0
4,5.26181,1.85527,-2.29242,-0.86763,0.11625,-2.17675,-1.72692,-2.34726,1.98107,-4.58177,-2.26649,-3.33875,1


In [29]:
x = df.drop("target", axis=1)
y = df["target"]

In [30]:
classificador = DecisionTreeClassifier()

cross = cross_validate(classificador, x.values, y, cv=5, scoring="neg_log_loss", return_train_score=True, return_estimator=True, n_jobs=-1)

for key in ['test_score', 'train_score']:
    print(f"""{key}: {round(-cross[key].mean(), 3)}""")

print("\n")

dct_importance = {}

for i, model in enumerate(cross['estimator']):
    dct_importance[i] = model.feature_importances_

print("Feature importance:")
dct_importance = pd.DataFrame.from_dict(dct_importance, orient='index', columns=x.columns)
dct_importance.loc["mean", :] = dct_importance.apply(np.mean)
dct_importance

test_score: 3.569
train_score: 0.0


Feature importance:


Unnamed: 0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12
0,0.028704,0.135804,0.016999,0.423504,0.005772,0.031341,0.035898,0.218782,0.03656,0.037711,0.009645,0.01928
1,0.02508,0.250697,0.003274,0.129793,0.021647,0.02963,0.032461,0.413143,0.014215,0.046101,0.01485,0.01911
2,0.045535,0.155614,0.011841,0.417719,0.010952,0.020499,0.037812,0.202967,0.018814,0.045071,0.013189,0.019987
3,0.020242,0.130343,0.009781,0.423898,0.028447,0.027299,0.03078,0.228897,0.012936,0.045291,0.018044,0.024042
4,0.025432,0.229674,0.010175,0.135187,0.031389,0.026996,0.027311,0.436139,0.022819,0.028648,0.0094,0.016831
mean,0.028999,0.180426,0.010414,0.30602,0.019641,0.027153,0.032852,0.299985,0.021069,0.040564,0.013026,0.01985


## Tunning

In [42]:
classificador = DecisionTreeClassifier()
scoring = "neg_log_loss"

params = {"criterion":["gini", "entropy"]
          , "splitter":["best", "random"]
          , "max_depth":[5, 10, 20]
          , "max_features": ["auto", "sqrt", "log2"]
          , "random_state":[0, 42]}

grid_search = GridSearchCV(estimator=classificador, param_grid=params,
                           scoring=scoring, cv=3, n_jobs=-1)

grid_search = grid_search.fit(x, y)
print(grid_search.best_params_)

grid_search.score(x, y)

{'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'random_state': 0, 'splitter': 'random'}


-0.4062044703641675

In [39]:
classificador = DecisionTreeClassifier(**grid_search.best_params_, random_state=0)

cross = cross_validate(classificador, x.values, y, cv=5, scoring="neg_log_loss", return_train_score=True, return_estimator=True, n_jobs=-1)

for key in ['test_score', 'train_score']:
    print(f"""{key}: {round(-cross[key].mean(), 3)}""")

print("\n")

dct_importance = {}

for i, model in enumerate(cross['estimator']):
    dct_importance[i] = model.feature_importances_

print("Feature importance:")
dct_importance = pd.DataFrame.from_dict(dct_importance, orient='index', columns=x.columns)
dct_importance.loc["mean", :] = dct_importance.apply(np.mean)
dct_importance

test_score: 3.025
train_score: 0.049


Feature importance:


Unnamed: 0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12
0,0.048356,0.052216,0.015553,0.131561,0.049305,0.099104,0.021967,0.419101,0.022274,0.066676,0.031863,0.042025
1,0.052859,0.095584,0.022314,0.091225,0.062351,0.065786,0.027563,0.391257,0.030703,0.08926,0.025037,0.04606
2,0.091,0.093301,0.016944,0.066788,0.069779,0.061933,0.03048,0.370791,0.039131,0.067199,0.045581,0.047073
3,0.034186,0.229545,0.037754,0.097089,0.044252,0.073386,0.032007,0.263278,0.031181,0.073906,0.021068,0.062349
4,0.09543,0.139457,0.008034,0.098442,0.037614,0.054661,0.035231,0.359446,0.025388,0.064209,0.026672,0.055415
mean,0.064366,0.122021,0.02012,0.097021,0.05266,0.070974,0.02945,0.360774,0.029736,0.07225,0.030044,0.050584
