## Árvores de Decisão - Bank Numeric dataset

### 1. Importando bibliotecas

In [3]:
# Manipulação de Dados
import pandas as pd

# Treino e teste
from sklearn.model_selection import train_test_split

# Árvore de Decisão
from sklearn import tree

import os

### 2. Carregando base de dados

In [4]:
file_dir = os.getcwd() + '\datasets\\bank-numeric.csv'
df = pd.read_csv(file_dir)
df.head()

Unnamed: 0,age,balance,duration,campaign,previous,default_cat,housing_cat,loan_cat,recent_pdays,deposit_cat,...,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,poutcome_failure,poutcome_success,poutcome_unknown
0,59,2343,1042,1,0,0,1,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
1,56,45,1467,1,0,0,0,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
2,41,1270,1389,1,0,0,1,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
3,55,2476,579,1,0,0,1,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
4,54,184,673,2,0,0,0,0,0.0001,1,...,0,1,0,0,0,1,0,0,0,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 27 columns):
age                    11162 non-null int64
balance                11162 non-null int64
duration               11162 non-null int64
campaign               11162 non-null int64
previous               11162 non-null int64
default_cat            11162 non-null int64
housing_cat            11162 non-null int64
loan_cat               11162 non-null int64
recent_pdays           11162 non-null float64
deposit_cat            11162 non-null int64
job_blue-collar        11162 non-null int64
job_entrepreneur       11162 non-null int64
job_other              11162 non-null int64
job_pink-collar        11162 non-null int64
job_self-employed      11162 non-null int64
job_technician         11162 non-null int64
job_white-collar       11162 non-null int64
marital_divorced       11162 non-null int64
marital_married        11162 non-null int64
marital_single         11162 non-null int64
education

deposit_cat = variável de saída (se foi feito o depósito ou não)

### 3. Treinamento e Validação do Modelo

Separando as features da classe de saída

In [6]:
data = df.drop('deposit_cat', 1)
target = df['deposit_cat']

Dividindo os dados em treino e teste

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)
print('='*45)
print('Dimensões dos dataframes de treino e teste')
print('='*45)
print('------------TREINO----------TESTE------------')
print('----X:    {}      {}'.format(X_train.shape, X_test.shape))
print('----y:    {}         {}'.format(y_train.shape, y_test.shape))
print('='*45)

Dimensões dos dataframes de treino e teste
------------TREINO----------TESTE------------
----X:    (7813, 26)      (3349, 26)
----y:    (7813,)         (3349,)


Cria função para treinar o modelo de árvore de decisão variando o parâmetro max_depth

In [8]:
def compara_modelos(maxdepth):
    
    if maxdepth == 0:
        dt = tree.DecisionTreeClassifier(random_state=1)
    else:
        dt = tree.DecisionTreeClassifier(random_state=1, max_depth=maxdepth)
    
    dt.fit(X_train, y_train)
    train_score = dt.score(X_train, y_train)
    test_score = dt.score(X_test, y_test)
    
    return train_score, test_score

In [9]:
compara_modelos(2)

(0.7495200307180341, 0.7473872797850104)

In [12]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos(2))))
print('{:1}         {} '.format(3,str(compara_modelos(3))))
print('{:1}         {} '.format(4,str(compara_modelos(4))))
print('{:1}         {} '.format(10,str(compara_modelos(10))))
print('{:1}         {} '.format(15,str(compara_modelos(15))))
print('{:1}         {} '.format('Full',str(compara_modelos(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (0.7495200307180341, 0.7473872797850104) 
3         (0.7651350313579931, 0.7536578083009854) 
4         (0.7817739664661462, 0.7769483427888922) 
10         (0.8672724945603482, 0.7885936100328457) 
15         (0.9516190963778318, 0.755747984472977) 
Full         (1.0, 0.7476858763810093) 


### 4. Feature importance

Verificando a importancia de cada feature no resultado final (target)

In [13]:
dt = tree.DecisionTreeClassifier(max_depth=4)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [18]:
fi = dt.feature_importances_
features = df.columns

l = len(df.columns)
for i in range(0,len(df.columns)-1):
    print('{:.<20} {:3}'.format(features[i],fi[i]))

age................. 0.0
balance............. 0.001243016676036646
duration............ 0.6520020870289662
campaign............ 0.0
previous............ 0.06684749897266237
default_cat......... 0.0
housing_cat......... 0.08960803749892239
loan_cat............ 0.0011301704742716341
recent_pdays........ 0.0
deposit_cat......... 0.0
job_blue-collar..... 0.0
job_entrepreneur.... 0.0
job_other........... 0.0
job_pink-collar..... 0.0
job_self-employed... 0.0
job_technician...... 0.0
job_white-collar.... 0.0
marital_divorced.... 0.0
marital_married..... 0.0
marital_single...... 0.0
education_primary... 0.0
education_secondary. 0.0
education_tertiary.. 0.0
education_unknown... 0.0
poutcome_failure.... 0.1891691893491407
poutcome_success.... 0.0
