## Medidas de impureza

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

$$ Gini = 1 - \sum_i p_i ^ 2  $$

In [None]:
def gini(p):
    """Return the two-class Gini impurity for a class-1 probability ``p``.

    The impurity is the sum of ``p_i * (1 - p_i)`` over the two classes,
    whose probabilities are ``p`` and ``1 - p``. Only arithmetic operators
    are used, so ``p`` may be a scalar or a NumPy array (elementwise).
    """
    complement = 1 - p
    first_class = p * (1 - p)
    second_class = complement * (1 - complement)
    return first_class + second_class

$$ \text{Entropía} = - \sum_i p_i \log_2 (p_i) $$

In [None]:
def entropy(p):
    """Return the two-class Shannon entropy (in bits) for probability ``p``.

    Computes ``-p*log2(p) - (1-p)*log2(1-p)`` using the convention
    ``0 * log2(0) = 0``, so the end points ``p = 0`` and ``p = 1`` yield
    ``0.0`` instead of NaN.

    Parameters
    ----------
    p : float or array-like of float
        Probability of class 1. Values outside ``[0, 1]`` produce NaN.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Entropy of each probability; scalar for scalar input.
    """
    p = np.asarray(p, dtype=float)
    q = 1.0 - p
    # log2(0) is -inf and 0 * -inf is NaN; silence those warnings and replace
    # exactly the zero-probability terms with their limit value 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        p_term = np.where(p == 0, 0.0, p * np.log2(p))
        q_term = np.where(q == 0, 0.0, q * np.log2(q))
    return -(p_term + q_term)

$$ \text{ErrorClasif} = 1 - \max_i (p_i) $$

In [None]:
def classification_error(p):
    """Return the two-class misclassification error ``1 - max(p, 1 - p)``.

    The maximum is taken elementwise, so an array of probabilities yields
    an array of errors rather than a single value derived from the global
    maximum.

    Parameters
    ----------
    p : float or array-like of float
        Probability of class 1.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Misclassification error for each probability; scalar for scalar
        input.
    """
    p = np.asarray(p, dtype=float)
    return 1 - np.maximum(p, 1 - p)

In [None]:
# Grid of class-1 probabilities: 0.00, 0.01, ..., 0.99 (1.0 is excluded).
x = np.arange(0.0, 1.0, 0.01)
# p == 0 is skipped explicitly; NaN leaves a gap at that point of the curve.
ent = [entropy(p) if p != 0 else np.nan for p in x]
# Entropy halved for the "scaled" curve; NaN entries stay NaN.
scaled_ent = [0.5 * e for e in ent]
c_err = [classification_error(p) for p in x]

fig = plt.figure()
ax = plt.subplot(111)

# One curve per impurity measure, each with its own label, line style and colour.
for j, lab, ls, c in zip(
        [ent, scaled_ent, gini(x), c_err],
        ['Entropy', 'Entropy (scaled)', 'Gini Impurity', 'Misclassification Error'],
        ['-', '-', '--', '-.'],
        ['lightgray', 'red', 'green', 'blue']):
    ax.plot(x, j, label=lab, linestyle=ls, lw=1, color=c)

ax.legend(loc='upper left', bbox_to_anchor=(0.01, 0.85),
          ncol=1, fancybox=True, shadow=False)

# Dashed horizontal reference lines at 0.5 and 1.0.
ax.axhline(y=0.5, linewidth=1, color='k', linestyle='--')
ax.axhline(y=1.0, linewidth=1, color='k', linestyle='--')

plt.ylim([0, 1.1])
plt.xlabel('p(j=1)')
plt.ylabel('Impurity Index')
plt.show()

## Árbol de decisión

Datos: [Marketing Bancario](https://archive.ics.uci.edu/ml/datasets/bank+marketing)

In [None]:
# Read the semicolon-separated bank marketing file into a DataFrame.
bank = pd.read_csv('datos/bank-full.csv', sep=';')
# Last expression of the cell: shows the first rows as a quick preview.
bank.head()

In [None]:
# Encode yes/no answers as 1/0 wherever they appear in the frame.
bank = bank.replace('yes', 1).replace('no', 0)

# Month label -> month number. The numbers are listed in the alphabetical
# order of the labels returned by sort_values().unique()
# (presumably apr, aug, dec, ... -- verify against the data file).
months = pd.DataFrame({
    'month': bank.month.sort_values().unique(),
    'month_no': [4, 8, 12, 2, 1, 7, 6, 3, 5, 11, 10, 9]
})
bank = bank.merge(months, on='month').drop('month', axis=1)

# Education label -> ordinal code. The labels are sorted first, like the
# other lookups in this cell, so each code is tied to a fixed label instead
# of to whichever label happens to appear first in the rows.
# NOTE(review): assumes the sorted labels are primary, secondary, tertiary,
# unknown (unknown -> NaN) -- confirm against the data file.
bank = bank.merge(pd.DataFrame({
    'education': bank.education.sort_values().unique(),
    'edu': [1, 2, 3, np.nan]
}), on='education').drop('education', axis=1)

# Previous-campaign outcome -> numeric code, paired with the sorted labels
# (presumably failure, other, success, unknown -- verify).
bank = bank.merge(pd.DataFrame({
    'poutcome': bank.poutcome.sort_values().unique(),
    'prev_out': [-1, np.nan, 1, 0]
}), on='poutcome').drop('poutcome', axis=1)

bank = bank.drop('job', axis=1)

# One-hot encode the remaining non-numeric columns.
bank_dummies = pd.get_dummies(bank)
# Not the last expression of the cell, so this sample is not displayed.
bank_dummies.sample(10)

# Discard rows whose label was mapped to NaN above.
bank_dummies = bank_dummies.dropna()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz 
from sklearn.metrics import confusion_matrix

In [None]:
# Decision tree limited to a depth of 2.
arbol = DecisionTreeClassifier(max_depth=2)

# Features are every column except 'y'; 'y' is the target. 20 % of the rows
# are held out for testing, and the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    bank_dummies.drop(columns='y'),
    bank_dummies['y'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the tree on the training split and keep the value returned by fit().
arbol_ent = arbol.fit(X_train, y_train)

In [None]:
# Confusion matrix of the true test labels against the tree's predictions
# for the test features.
confusion_matrix(y_test, arbol_ent.predict(X_test))

In [None]:
# Export the fitted tree as Graphviz source text (out_file=None, so nothing
# is written to disk).
graf = export_graphviz(
    arbol_ent,
    out_file=None,
    # Passed as a plain list of names rather than a pandas Index.
    # NOTE(review): some scikit-learn releases appear to validate this
    # argument strictly as a list -- confirm against the installed version.
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)

# Last expression of the cell: renders the graph inline.
graph = graphviz.Source(graf)
graph

In [None]:
# Report the fitted tree's score on the training split and on the test split.
print("Score entrenamiento:", arbol_ent.score(X_train, y_train))
print("Score prueba:", arbol_ent.score(X_test, y_test))

In [None]:
# Training and test scores collected for every tree depth tried below.
score_ent = []
score_pru = []

for i in range(3, 30):
    arbol = DecisionTreeClassifier(max_depth=i)
    arbol_ent = arbol.fit(X_train, y_train)
    # Score each split once and reuse the value for both printing and
    # storing, instead of predicting over the same data twice.
    train_score = arbol_ent.score(X_train, y_train)
    test_score = arbol_ent.score(X_test, y_test)
    print("Score entrenamiento:", train_score)
    print("Score prueba:", test_score)
    score_ent.append(train_score)
    score_pru.append(test_score)

In [None]:
# Plot training and test scores as two lines; the index repeats the
# range(3, 30) depths used to fill score_ent and score_pru.
pd.DataFrame({
    'score_ent': score_ent,
    'score_pru': score_pru
}, index=range(3, 30)).plot()
plt.show()

## Bagging

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier


# Training and test scores collected for every base-tree depth tried below.
score_ent = []
score_pru = []

for i in range(3, 30):
    tree = DecisionTreeClassifier(max_depth=i)
    bag = BaggingClassifier(tree, n_estimators=30, n_jobs=-1)
    bag_ent = bag.fit(X_train, y_train)
    # Score each split once and reuse the value for both printing and
    # storing, instead of running the 30-estimator ensemble twice per split.
    train_score = bag_ent.score(X_train, y_train)
    test_score = bag_ent.score(X_test, y_test)
    print("Score entrenamiento:", train_score)
    print("Score prueba:", test_score, '\n')
    score_ent.append(train_score)
    score_pru.append(test_score)


In [None]:
# Plot training and test scores as two lines; the index repeats the
# range(3, 30) depths used to fill score_ent and score_pru.
pd.DataFrame({
    'score_ent': score_ent,
    'score_pru': score_pru
}, index=range(3, 30)).plot()
plt.show()

In [None]:
# Bagging ensemble of 200 decision trees built with default tree settings.
tree = DecisionTreeClassifier()
bag = BaggingClassifier(tree, n_estimators=200, n_jobs=-1)

# Fit exactly once; a second fit() call on the same data would presumably
# just retrain the whole ensemble and repeat the work.
bag_ent = bag.fit(X_train, y_train)
print("Score entrenamiento:", bag_ent.score(X_train, y_train))
print("Score prueba:", bag_ent.score(X_test, y_test))