In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import MultinomialNB, GaussianNB, CategoricalNB, BernoulliNB, ComplementNB
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.combine import SMOTETomek
import graphviz 
from sklearn.tree import export_text, _tree

In [2]:
# Load the CSV and report its shape and the number of rows per class
df = pd.read_csv('poker-hand-tiny.csv')
print("Contenido de data set: ", df.shape)
conteo_por_clase = df['Clase'].value_counts()
print("Cantidad por clase:\n", conteo_por_clase)

Contenido de data set:  (300000, 11)
Cantidad por clase:
 0    150289
1    126833
2     14395
3      6238
4      1156
5       598
6       423
7        62
8         4
9         2
Name: Clase, dtype: int64


In [3]:
# Clean the data: discard every row whose 'Clase' label is 8 or 9
# NOTE(review): presumably these classes are removed because they have too few
# rows to be resampled later -- confirm the intent.
clases_descartadas = [8, 9]
df = df[~df['Clase'].isin(clases_descartadas)]


In [4]:
# Split the frame: the first ten columns are the features, the last column the label
x = df.iloc[:, :10]
y = df.iloc[:, -1]

In [5]:
# SYNTHETIC DATA GENERATION
# Combined over-/under-sampling with SMOTETomek. The seed is pinned (same value as
# the train/test splits in this notebook) so the resampled set is reproducible.
smote = SMOTETomek(sampling_strategy='auto', random_state = 324)
# `fit_resample` is the supported resampling entry point; `fit_sample` was a legacy
# alias that newer imbalanced-learn releases no longer provide.
X_SMOTE, Y_SMOTE = smote.fit_resample(x, y)
# Class counts before and after resampling
print("Antes de SMOTE:", Counter(y),"\nDespués de SMOTE: ", Counter(Y_SMOTE))

Antes de SMOTE: Counter({0: 150289, 1: 126833, 2: 14395, 3: 6238, 4: 1156, 5: 598, 6: 423, 7: 62}) 
Después de SMOTE:  Counter({7: 150289, 5: 150286, 6: 150286, 4: 150282, 3: 150186, 2: 149759, 0: 134447, 1: 134049})


In [6]:
# Hold out 33% of the resampled data for testing (fixed seed for repeatability).
# NOTE(review): the split is taken from the already-resampled set, so the held-out
# part also contains resampled rows -- confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_SMOTE,
    Y_SMOTE,
    test_size = 0.33,
    random_state = 324,
)


In [7]:
# DECISION TREE
# Seeded (same value as the train/test splits) so repeated runs build the same tree
arboldecision = DecisionTreeClassifier(random_state = 324)
# Hyper-parameter grid explored by the grid search
parametros = {'min_samples_split': [2, 4, 10], 'max_depth': [10, 20, 37, 40]}

In [8]:
# CROSS-VALIDATION - Grid Search
# 3-fold search over the `parametros` grid using the training split
clf = GridSearchCV(arboldecision, parametros, cv = 3)
clf.fit(X_train, Y_train)
resultados = pd.DataFrame(clf.cv_results_)
means = clf.cv_results_['mean_test_score']
# The loop variable must not be called `parametros`: that would overwrite the
# grid with the last evaluated combination and break a re-run of this cell.
for mean, combinacion in zip(means, clf.cv_results_['params']):
    print("%0.3f for %r" %(mean, combinacion))

print()


0.448 for {'max_depth': 10, 'min_samples_split': 2}
0.448 for {'max_depth': 10, 'min_samples_split': 4}
0.448 for {'max_depth': 10, 'min_samples_split': 10}
0.790 for {'max_depth': 20, 'min_samples_split': 2}
0.787 for {'max_depth': 20, 'min_samples_split': 4}
0.778 for {'max_depth': 20, 'min_samples_split': 10}
0.846 for {'max_depth': 37, 'min_samples_split': 2}
0.838 for {'max_depth': 37, 'min_samples_split': 4}
0.820 for {'max_depth': 37, 'min_samples_split': 10}
0.846 for {'max_depth': 40, 'min_samples_split': 2}
0.838 for {'max_depth': 40, 'min_samples_split': 4}
0.820 for {'max_depth': 40, 'min_samples_split': 10}



In [9]:
# Keep the best estimator found by the grid search together with its parameters
model = clf.best_estimator_
params = clf.best_params_
# Score is computed on the training split and shown as a percentage
puntaje_entrenamiento = model.score(X_train, Y_train) * 100
print('Resultado Arbol de Decisión: %0.3f con parametros %r' % (puntaje_entrenamiento, params))

Resultado Arbol de Decisión: 99.987 con parametros {'max_depth': 37, 'min_samples_split': 2}


In [15]:
def tree_to_code(tree, feature_names):
    """Print a fitted tree as nested Python-style if/else pseudocode.

    Args:
        tree: object exposing a ``tree_`` attribute with per-node ``feature``,
            ``threshold``, ``children_left``, ``children_right`` and ``value``.
        feature_names: list of column names, indexed by the feature ids
            stored in ``tree_.feature``.

    Returns:
        None. The pseudocode is written to standard output.
    """
    fitted = tree.tree_

    # Column name used by each node's split; nodes without a split feature
    # get a placeholder.
    split_names = []
    for feat_idx in fitted.feature:
        if feat_idx == _tree.TREE_UNDEFINED:
            split_names.append("undefined!")
        else:
            split_names.append(feature_names[feat_idx])

    print("def tree({}):".format(", ".join(feature_names)))

    def emit(node_id, level):
        """Print the subtree rooted at ``node_id`` at the given nesting level."""
        pad = "  " * level

        # Node without a split feature: print its stored value and stop.
        if fitted.feature[node_id] == _tree.TREE_UNDEFINED:
            print( "{}return {}".format(pad, fitted.value[node_id]))
            return

        column = split_names[node_id]
        cut = fitted.threshold[node_id]
        print( "{}if {} <= {}:".format(pad, column, cut))
        emit(fitted.children_left[node_id], level + 1)
        print( "{}else:  # if {} > {}".format(pad, column, cut))
        emit(fitted.children_right[node_id], level + 1)

    emit(0, 1)

# Dump the selected tree as pseudocode, using the training columns as feature names
nombres_columnas = list(X_train.columns.values)
tree_to_code(tree=model, feature_names=nombres_columnas)

SyntaxError: invalid syntax (<ipython-input-15-ba7ab8ff4c1b>, line 15)

In [42]:
# Disabled snippet kept as a bare string literal, so this cell only evaluates to
# the text itself. If enabled, it would print the tree via export_text and draw
# it with tree.plot_tree.
'''
columnas = list(X_train.columns.values)
r = export_text(model, feature_names=columnas)
print(r)
tree.plot_tree(model) 
'''

'\ncolumnas = list(X_train.columns.values)\nr = export_text(model, feature_names=columnas)\nprint(r)\ntree.plot_tree(model) \n'

In [101]:
# Per-node arrays of the fitted tree, all indexed by node id
n_nodes = model.tree_.node_count
children_left = model.tree_.children_left
children_right = model.tree_.children_right
feature = model.tree_.feature
threshold = model.tree_.threshold

node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)

# Walk the tree from the root; every stack entry is (node id, depth of that node)
stack = [(0, 0)]
while stack:
    node_id, depth = stack.pop()
    node_depth[node_id] = depth

    # A node whose two child ids are equal is recorded as a leaf
    if children_left[node_id] == children_right[node_id]:
        is_leaves[node_id] = True
        continue

    # Test node: schedule both children one level deeper
    stack.append((children_left[node_id], depth + 1))
    stack.append((children_right[node_id], depth + 1))

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
# Preview only the first 20 nodes; the bound is capped at n_nodes so a tree with
# fewer nodes does not index past the end of the per-node arrays.
for i in range(min(20, n_nodes)):
    if is_leaves[i]:
        print("Nodo=%s Nodo hoja. " % ( i))
    else:
        # Test node: show the split rule and the ids of both children
        print("\nnode=%s\n test node: go to node %s if X[:, %s] <= %s \nelse to "
              "node %s."
              % (
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i]
                 ))



The binary tree structure has 241805 nodes and has the following tree structure:

node=0
 test node: go to node 1 if X[:, 0] <= 3.5 
else to node 191604.

node=1
 test node: go to node 2 if X[:, 4] <= 3.5 
else to node 152321.

node=2
 test node: go to node 3 if X[:, 2] <= 3.5 
else to node 119948.

node=3
 test node: go to node 4 if X[:, 6] <= 3.5 
else to node 96223.

node=4
 test node: go to node 5 if X[:, 8] <= 3.5 
else to node 75602.

node=5
 test node: go to node 6 if X[:, 4] <= 1.5 
else to node 25457.

node=6
 test node: go to node 7 if X[:, 8] <= 1.5 
else to node 8308.

node=7
 test node: go to node 8 if X[:, 2] <= 1.5 
else to node 2357.

node=8
 test node: go to node 9 if X[:, 0] <= 1.5 
else to node 642.

node=9
 test node: go to node 10 if X[:, 6] <= 1.5 
else to node 195.

node=10
 test node: go to node 11 if X[:, 5] <= 6.5 
else to node 86.

node=11
 test node: go to node 12 if X[:, 5] <= 3.5 
else to node 53.

node=12
 test node: go to node 13 if X[:, 7] <= 1.5 
else 

In [10]:
# NAIVE BAYES
# GaussianNB is the active variant; MultinomialNB, CategoricalNB, BernoulliNB and
# ComplementNB were the alternatives tried here.
bayes = GaussianNB()
bayes.fit(X_train, Y_train)

# 3-fold cross-validation on the training split; show the per-fold test scores
val_cruzada = cross_validate(bayes, X_train, Y_train, cv = 3)
puntajes_nb = val_cruzada['test_score']
print('Resultado Naive Bayes: ', puntajes_nb)


Resultado Naive Bayes:  [0.20480692 0.20429008 0.20416758]


In [11]:

# Predictions of both models on the held-out split, scored against the same labels
y_true_dt = y_true_nb = Y_test
y_pred_dt = model.predict(X_test)
y_pred_nb = bayes.predict(X_test)

# Decision tree first, Naive Bayes second, for every metric below
comparaciones = ((y_true_dt, y_pred_dt), (y_true_nb, y_pred_nb))

# Accuracy as a percentage
for reales, predichos in comparaciones:
    print(accuracy_score(reales, predichos)*100, "\n")

# Confusion matrices
for reales, predichos in comparaciones:
    print(confusion_matrix(reales, predichos), "\n")

# Per-class precision / recall / f1
for reales, predichos in comparaciones:
    print(classification_report(reales, predichos))


86.49196943748494 

20.442374009943958 

[[23655 14689  3986  1686   258    98   135    15]
 [13052 20667  6022  3228   610    76   342    45]
 [ 1579  2798 43872   859   233    16   189    24]
 [  376   904   493 47525    92     2    52    19]
 [   33    65    40     8 49509     0     5     0]
 [   12     4     1     0     0 49333     0     0]
 [    7    34    15    13     2     0 49564     9]
 [    0     2     3     5     0     0     0 49702]] 

[[18447  1454  1834  2533  5342  4146  3627  7139]
 [16050  1411  1510  2324  7916  3952  2964  7915]
 [ 8161  1168  1952  3146 12756  4861  4554 12972]
 [ 6428  1136  1530  3621 15023  4253  3657 13815]
 [ 5438   538   455  2626 24051  2102  2195 12255]
 [14944     0     0  1076  4997  4977 10120 13236]
 [ 4546   831   866  2755 17337  3890  3228 16191]
 [ 1047  1234     0  3035 20110  1888  1185 21213]] 

              precision    recall  f1-score   support

           0       0.61      0.53      0.57     44522
           1       0.53     

In [72]:
# WITHOUT SMOTE: split the original x / y with the same test size and seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.33,
    random_state = 324,
)
