In [103]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.combine import SMOTETomek
import graphviz 
from sklearn.tree import export_text

In [83]:
# Load the poker-hand sample, then report its dimensions and class balance.
df = pd.read_csv('poker-hand-tiny.csv')
dimensiones = df.shape
print("Contenido de data set: ", dimensiones)
# 'Clase' is the label column; value_counts() tallies the rows per label.
conteo_por_clase = df['Clase'].value_counts()
print("Cantidad por clase:\n", conteo_por_clase)

Contenido de data set:  (300000, 11)
Cantidad por clase:
 0    150289
1    126833
2     14395
3      6238
4      1156
5       598
6       423
7        62
8         4
9         2
Name: Clase, dtype: int64


In [84]:
# Clean the data: remove every row labelled with class 8 or 9.
# NOTE(review): presumably these classes are dropped because they are too rare
# (4 and 2 rows in the recorded value counts) for the resampling step — confirm.
clases_descartadas = [8, 9]
# Select the index labels of the affected rows and drop them in one call.
filas_descartadas = df.index[df['Clase'].isin(clases_descartadas)]
df = df.drop(index=filas_descartadas)


In [93]:
# Separate predictors from the target.
# x: the first ten columns of the frame (positions 0-9).
x = df.iloc[:, :10]
# y: the last column of the frame.
y = df.iloc[:, -1]

In [96]:
# SYNTHETIC DATA: rebalance the classes with SMOTETomek.
# - `fit_resample` replaces the deprecated `fit_sample` alias, which newer
#   imbalanced-learn releases no longer provide.
# - A fixed `random_state` makes the resampled data (and everything trained on
#   it) repeatable across kernel restarts.
# NOTE(review): resampling is applied to the full dataset before the
# train/test split, so the held-out part also contains synthetic rows built
# from the same originals as the training part. Consider splitting first and
# resampling only the training portion.
smote = SMOTETomek(sampling_strategy='auto', random_state=324)
X_SMOTE, Y_SMOTE = smote.fit_resample(x, y)
# Class counts before and after resampling.
print("Antes de SMOTE:", Counter(y),"\nDespués de SMOTE: ", Counter(Y_SMOTE))

Antes de SMOTE: Counter({0: 150289, 1: 126833, 2: 14395, 3: 6238, 4: 1156, 5: 598, 6: 423, 7: 62}) 
Después de SMOTE:  Counter({7: 150289, 5: 150286, 6: 150284, 4: 150282, 3: 150188, 2: 149737, 0: 134387, 1: 134003})


In [100]:
# Hold out 33% of the resampled data for testing; the fixed seed keeps the
# partition repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_SMOTE,
    Y_SMOTE,
    test_size=0.33,
    random_state=324,
)

In [98]:
# DECISION TREE: base estimator plus the hyper-parameter grid to explore.
arboldecision = DecisionTreeClassifier()
# Candidate values per hyper-parameter; each value must be a list of options.
parametros = dict(
    min_samples_split=[2, 20, 30, 40],
    max_depth=[1, 10, 20, 30, 37, 40],
)

In [44]:
# CROSS-VALIDATION - grid search over `parametros` with 3 folds.
clf = GridSearchCV(arboldecision, parametros, cv = 3)
clf.fit(X_train, Y_train)
# Keep the complete results table available for later inspection.
resultados = pd.DataFrame(clf.cv_results_)
means = clf.cv_results_['mean_test_score']
# The loop variable is deliberately NOT named `parametros`: reusing that name
# would overwrite the hyper-parameter grid with the last tested combination
# and break this cell when it is run a second time.
for mean, combinacion in zip(means, clf.cv_results_['params']):
    print("%0.3f for %r" %(mean, combinacion))

print()


0.152 for {'max_depth': 1, 'min_samples_split': 2}
0.152 for {'max_depth': 1, 'min_samples_split': 20}
0.152 for {'max_depth': 1, 'min_samples_split': 30}
0.152 for {'max_depth': 1, 'min_samples_split': 40}
0.447 for {'max_depth': 10, 'min_samples_split': 2}
0.447 for {'max_depth': 10, 'min_samples_split': 20}
0.447 for {'max_depth': 10, 'min_samples_split': 30}
0.447 for {'max_depth': 10, 'min_samples_split': 40}
0.793 for {'max_depth': 20, 'min_samples_split': 2}
0.765 for {'max_depth': 20, 'min_samples_split': 20}
0.753 for {'max_depth': 20, 'min_samples_split': 30}
0.743 for {'max_depth': 20, 'min_samples_split': 40}
0.846 for {'max_depth': 30, 'min_samples_split': 2}
0.797 for {'max_depth': 30, 'min_samples_split': 20}
0.779 for {'max_depth': 30, 'min_samples_split': 30}
0.765 for {'max_depth': 30, 'min_samples_split': 40}
0.846 for {'max_depth': 37, 'min_samples_split': 2}
0.796 for {'max_depth': 37, 'min_samples_split': 20}
0.779 for {'max_depth': 37, 'min_samples_split': 30}
0.

In [106]:
# Take the winning estimator and its hyper-parameters from the grid search.
model = clf.best_estimator_
params = clf.best_params_
# Accuracy is measured on the training split here (as a percentage), not on
# held-out data.
exactitud_entrenamiento = model.score(X_train, Y_train) * 100
print('Resultado Arbol de Decisión: %0.3f con parametros %r' % (exactitud_entrenamiento, params))

Resultado Arbol de Decisión: 93.679 con parametros {'max_depth': 37, 'min_samples_split': 2}


In [107]:
# Fit a tree with fixed hyper-parameters and show its rules as text and as a
# figure.
arboldecision = DecisionTreeClassifier(max_depth=37, min_samples_split=2)
arboldecision = arboldecision.fit(X_train, Y_train)
# Feature names are read from `x`, the predictor frame built from the CSV.
# The previous `x_train` (lowercase) is not defined anywhere in the notebook
# and raised NameError on a fresh kernel.
columnas = list(x.columns)
r = export_text(arboldecision, feature_names=columnas)
print(r)
# Plot the same tree that was just exported (not the grid-search `model`).
# The drawing is capped at the top levels because the tree is fitted with
# max_depth=37; the trailing `;` hides the list-of-annotations repr.
tree.plot_tree(arboldecision, max_depth=3, feature_names=columnas);

|--- S1 <= 3.50
|   |--- S3 <= 3.50
|   |   |--- S2 <= 3.50
|   |   |   |--- S5 <= 3.50
|   |   |   |   |--- S4 <= 3.50
|   |   |   |   |   |--- S3 <= 1.50
|   |   |   |   |   |   |--- S5 <= 1.50
|   |   |   |   |   |   |   |--- S2 <= 1.50
|   |   |   |   |   |   |   |   |--- S1 <= 1.50
|   |   |   |   |   |   |   |   |   |--- S4 <= 1.50
|   |   |   |   |   |   |   |   |   |   |--- C3 <= 6.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 11
|   |   |   |   |   |   |   |   |   |   |--- C3 >  6.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 10
|   |   |   |   |   |   |   |   |   |--- S4 >  1.50
|   |   |   |   |   |   |   |   |   |   |--- C4 <= 6.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 12
|   |   |   |   |   |   |   |   |   |   |--- C4 >  6.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 18
|   |   |   |   |   |   |   |   |--- S1 >  1.50
|   |   |   |   |   |   

In [81]:
# Inspect the low-level structure of the fitted tree stored in `model.tree_`.
# The parallel arrays below are indexed by node id.
n_nodes = model.tree_.node_count
children_left = model.tree_.children_left
children_right = model.tree_.children_right
feature = model.tree_.feature
threshold = model.tree_.threshold


# Walk the tree iteratively from the root, recording each node's depth and
# whether it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node (its two child ids differ), visit both children;
    # otherwise the node is recorded as a leaf.
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
# Only the first 20 nodes are printed. The bound is clamped to n_nodes so a
# tree with fewer than 20 nodes no longer raises IndexError.
for i in range(min(20, n_nodes)):
    if is_leaves[i]:
        print("Nodo=%s Nodo hoja. " % ( i))
    else:
        print("\nnode=%s\n test node: go to node %s if X[:, %s] <= %s \nelse to "
              "node %s."
              % (
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i]
                 ))



[     0      1      2 113246 123482 138848 138849 138850 138851 138852
 138853 138854 138960 139076 139077 139089 139090 139091 139092 139093
 139094 139095 139109 139110 139111 139112 139113]
0
Rules used to predict sample 0: 
139113
0


KeyError: (0, 0)

In [51]:
# NAIVE BAYES: fit a multinomial model and estimate its accuracy.
# Every call uses the `X_train` / `Y_train` split defined in this notebook.
# The previous lowercase `x_train` / `y_train` are not defined anywhere here
# and were inconsistent with the data used for scoring.
bayes = MultinomialNB()
bayes.fit(X_train, Y_train)
# 3-fold cross-validation on the training split.
val_cruzada = cross_validate(bayes, X_train, Y_train, cv = 3)
# First line: accuracy on the training split itself (percentage).
print('Resultado Naive Bayes: ', bayes.score(X_train, Y_train)*100)
# Second line: per-fold cross-validated scores (percentage).
print('Resultado Naive Bayes: ', val_cruzada['test_score']*100)

Resultado Naive Bayes:  17.142429580475792
Resultado Naive Bayes:  [17.01605462 17.09920741 17.12907302]


In [79]:
# Evaluate both classifiers on the held-out split.
# dt = decision tree (`model`), nb = Naive Bayes (`bayes`).
y_true_dt = Y_test
y_pred_dt = model.predict(X_test)
y_true_nb = Y_test
y_pred_nb = bayes.predict(X_test)

pares_evaluacion = (
    (y_true_dt, y_pred_dt),
    (y_true_nb, y_pred_nb),
)

# Accuracy as a percentage: decision tree first, then Naive Bayes.
for reales, predichos in pares_evaluacion:
    print(accuracy_score(reales, predichos)*100)

# Confusion matrices, in the same order.
for reales, predichos in pares_evaluacion:
    print(confusion_matrix(reales, predichos))

86.41985545815206
17.12890862871817
[[23464 14677  3952  1759   275   144   131    21]
 [13258 20554  5968  3228   610    76   296    45]
 [ 1630  2853 43791   880   234    10   176    28]
 [  429   861   457 47482    62     2    56     7]
 [   36    61    32    13 49322     0     1     1]
 [   22     9     4     0     0 49744     0     0]
 [   11    30    29    16     1     0 49646     4]
 [    1     2     3     5     1     0     0 49499]]
[[ 1990   160   467  2485  5346 12819  7168 13988]
 [ 2293   184   409  2175  6721 11994  6651 13608]
 [ 2480   126   487  2232 10773 11374  7627 14503]
 [ 2871   169   457  1861 12056 11218  6471 14253]
 [ 3933    38   531  1248 18179  7878  4689 12970]
 [ 3547    76   160  1761 11179 13744  6671 12641]
 [ 2991     9  1047  2131 14574  8555  5576 14854]
 [ 1457     0     0   216 15074  6807  1876 24081]]
