In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, classification_report, precision_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.combine import SMOTETomek
import graphviz 
from sklearn.tree import export_text, _tree

In [23]:
#Leemos el CSV
df = pd.read_csv('poker-hand-tiny.csv')
print("Forma de data set: ", df.shape)
print("Cantidad por clase:\n", df.Clase.value_counts())

Forma de data set:  (300000, 11)
Cantidad por clase:
 0    150289
1    126833
2     14395
3      6238
4      1156
5       598
6       423
7        62
8         4
9         2
Name: Clase, dtype: int64


In [25]:
#Limpiamos el CSV
for i in range(8,10):
    #Creamos lista con los indices a eliminar
    idmemor = df[df['Clase']== i].index
    #Eliminamos los indices del csv
    df = df.drop(idmemor)


In [26]:
#Define cada dataset
y = df.iloc[:,-1]
x = df.iloc[:,0:10]

Forma de data set:  (299994,)
Forma de data set:  (299994, 10)


In [6]:
# CREACIÓN DATOS SINTÉTICOS
smote = SMOTETomek(sampling_strategy='auto')
X_SMOTE, Y_SMOTE = smote.fit_sample(x, y)
print("Antes de SMOTE:", Counter(y),"\nDespués de SMOTE: ", Counter(Y_SMOTE))

Antes de SMOTE: Counter({0: 150289, 1: 126833, 2: 14395, 3: 6238, 4: 1156, 5: 598, 6: 423, 7: 62}) 
Después de SMOTE:  Counter({7: 150289, 5: 150285, 6: 150285, 4: 150281, 3: 150198, 2: 149765, 0: 134387, 1: 134036})


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X_SMOTE, Y_SMOTE, test_size = 0.33, random_state = 324)


In [13]:
#ÁRBOL DE DECISIÓN 
arboldecision = DecisionTreeClassifier()
parametros = {'min_samples_split': [2, 4, 10], 'max_depth': [10, 20, 37, 40]}

In [14]:
# VALIDACIÓN CRUZADA - Grid Search
clf = GridSearchCV(arboldecision, parametros, cv = 3)
clf.fit(X_train, Y_train)
clf.cv_results_
resultados = pd.DataFrame(clf.cv_results_)
means = clf.cv_results_['mean_test_score']
for mean, parametros in zip(means,  clf.cv_results_['params']):
    print("%0.3f for %r" %(mean, parametros))

print()


0.445 for {'max_depth': 10, 'min_samples_split': 2}
0.445 for {'max_depth': 10, 'min_samples_split': 4}
0.445 for {'max_depth': 10, 'min_samples_split': 10}
0.791 for {'max_depth': 20, 'min_samples_split': 2}
0.788 for {'max_depth': 20, 'min_samples_split': 4}
0.778 for {'max_depth': 20, 'min_samples_split': 10}
0.846 for {'max_depth': 37, 'min_samples_split': 2}
0.838 for {'max_depth': 37, 'min_samples_split': 4}
0.819 for {'max_depth': 37, 'min_samples_split': 10}
0.846 for {'max_depth': 40, 'min_samples_split': 2}
0.838 for {'max_depth': 40, 'min_samples_split': 4}
0.819 for {'max_depth': 40, 'min_samples_split': 10}



In [28]:
model = clf.best_estimator_
params = clf.best_params_
val_cruzada = cross_validate(model, X_train, Y_train, cv = 3)
print('Test Score Arbol de Decisión: %r con parametros %r'%(val_cruzada['test_score'], params))

Test Score Arbol de Decisión: array([0.84553244, 0.84605695, 0.84613353]) con parametros {'max_depth': 40, 'min_samples_split': 2}


In [29]:
#NAIVE BAYES 
bayes = GaussianNB()
bayes.fit(X_train, Y_train)
val_cruzada = cross_validate(bayes, X_train, Y_train, cv = 3)
print('Test Score Naive Bayes: ', val_cruzada['test_score'])


Test Score Naive Bayes:  [0.20418157 0.20522677 0.20395951]


In [22]:
#Creo los datos a testear
y_true_dt, y_pred_dt = Y_test, model.predict(X_test)
y_true_nb, y_pred_nb = Y_test, bayes.predict(X_test)
#Accuracy
print(accuracy_score(y_true_dt, y_pred_dt)*100, "\n")
print(accuracy_score(y_true_nb, y_pred_nb)*100, "\n")

#Matriz de confusión
print(confusion_matrix(y_true_dt, y_pred_dt), "\n")
print(confusion_matrix(y_true_nb, y_pred_nb), "\n")

#Recall y Precision
print(classification_report(y_true_dt, y_pred_dt))
print(classification_report(y_true_nb, y_pred_nb))


86.47964471529548 

20.64988703024273 

[[23516 14759  3921  1760   262   145   129    17]
 [12882 20752  6137  3316   590    68   319    47]
 [ 1634  2761 43759   829   232    18   200    24]
 [  407   849   479 47474    65     4    53    17]
 [   32    78    25    10 49341     0     1     0]
 [    8    12     0     0     1 49682     1     0]
 [    5    25    23    13     9     0 49685     2]
 [    0     2     5     5     0     0     0 49554]] 

[[18759  1079  1924  2672  5573  4009  3383  7110]
 [16374   998  1772  2476  8035  3690  2788  7978]
 [ 8514   787  2268  3105 12994  4443  4243 13103]
 [ 6911   766  1621  3837 14937  4053  3354 13869]
 [ 5934   200   529  2360 24496  1990  1860 12118]
 [15076     0     0  1139  5327  4857  9870 13435]
 [ 5047   126   793  2683 17701  3596  3245 16571]
 [ 2233   266    15  2553 20187  1877  1198 21237]] 

              precision    recall  f1-score   support

           0       0.61      0.53      0.57     44509
           1       0.53      

In [31]:
#Reglas de árbol
def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print( "{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print( "{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print( "{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)

tree_to_code(model, list(X_train.columns.values))

def tree(S1, C1, S2, C2, S3, C3, S4, C4, S5, C5):
  if S1 <= 3.5:
    if S3 <= 3.5:
      if S2 <= 3.5:
        if S4 <= 3.5:
          if S5 <= 3.5:
            if S3 <= 1.5:
              if S5 <= 1.5:
                if S2 <= 1.5:
                  if S4 <= 1.5:
                    if S1 <= 1.5:
                      if C3 <= 6.5:
                        if C3 <= 3.5:
                          if C4 <= 1.5:
                            if C2 <= 5.5:
                              if C1 <= 3.5:
                                return [[0. 0. 0. 0. 6. 0. 0. 0.]]
                              else:  # if C1 > 3.5
                                return [[0. 0. 2. 0. 0. 0. 0. 0.]]
                            else:  # if C2 > 5.5
                              return [[ 0.  0.  0.  0.  0. 99.  0.  0.]]
                          else:  # if C4 > 1.5
                            if C4 <= 3.5:
                              if C2 <= 9.0:
                                return [[  0.   0.   0.   0.

KeyboardInterrupt: 

In [32]:
#Dibujar árbol estructura del árbol
n_nodes = model.tree_.node_count
children_left = model.tree_.children_left
children_right = model.tree_.children_right
feature = model.tree_.feature
threshold = model.tree_.threshold


node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)] 
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

   
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
for i in range(n_nodes):
    if is_leaves[i]:
        print("Nodo=%s Nodo hoja. " % ( i))
    else:
        print("\nNodo=%s\n Voy al nodo %s Si X[:, %s] <= %s \nSino "
              "Nodo %s."
              % (
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i]
                 ))



The binary tree structure has 221707 nodes and has the following tree structure:

Nodo=0
 Voy al nodo 1 Si X[:, 0] <= 3.5 
Sino Nodo 177198.

Nodo=1
 Voy al nodo 2 Si X[:, 4] <= 3.5 
Sino Nodo 142099.

Nodo=2
 Voy al nodo 3 Si X[:, 2] <= 3.5 
Sino Nodo 112884.

Nodo=3
 Voy al nodo 4 Si X[:, 6] <= 3.5 
Sino Nodo 91069.

Nodo=4
 Voy al nodo 5 Si X[:, 8] <= 3.5 
Sino Nodo 72036.

Nodo=5
 Voy al nodo 6 Si X[:, 4] <= 1.5 
Sino Nodo 24179.

Nodo=6
 Voy al nodo 7 Si X[:, 8] <= 1.5 
Sino Nodo 7828.

Nodo=7
 Voy al nodo 8 Si X[:, 2] <= 1.5 
Sino Nodo 2323.

Nodo=8
 Voy al nodo 9 Si X[:, 6] <= 1.5 
Sino Nodo 638.

Nodo=9
 Voy al nodo 10 Si X[:, 0] <= 1.5 
Sino Nodo 167.

Nodo=10
 Voy al nodo 11 Si X[:, 5] <= 6.5 
Sino Nodo 84.

Nodo=11
 Voy al nodo 12 Si X[:, 5] <= 3.5 
Sino Nodo 55.

Nodo=12
 Voy al nodo 13 Si X[:, 7] <= 1.5 
Sino Nodo 18.

Nodo=13
 Voy al nodo 14 Si X[:, 3] <= 5.5 
Sino Nodo 17.

Nodo=14
 Voy al nodo 15 Si X[:, 1] <= 3.5 
Sino Nodo 16.
Nodo=15 Nodo hoja. 
Nodo=16 Nodo hoja. 
N

KeyboardInterrupt: 