In [60]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import MultinomialNB, GaussianNB, CategoricalNB, BernoulliNB, ComplementNB
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.combine import SMOTETomek
import graphviz 
from sklearn.tree import export_text

In [2]:
#Leemos el CSV
df = pd.read_csv('poker-hand-tiny.csv')
print("Contenido de data set: ", df.shape)
print("Cantidad por clase:\n", df.Clase.value_counts())

Contenido de data set:  (300000, 11)
Cantidad por clase:
 0    150289
1    126833
2     14395
3      6238
4      1156
5       598
6       423
7        62
8         4
9         2
Name: Clase, dtype: int64


In [3]:
#Limpiamos el CSV
for i in range(8,10):
    #Creamos lista con los indices a eliminar
    idmemor = df[df['Clase']== i].index
    #Eliminamos los indices del csv
    df = df.drop(idmemor)


In [4]:
#Define cada dataset
y = df.iloc[:,-1]
x = df.iloc[:,0:10]

In [86]:
# CREACIÓN DATOS SINTÉTICOS
smote = SMOTETomek(sampling_strategy='auto')
X_SMOTE, Y_SMOTE = smote.fit_sample(x, y)
print("Antes de SMOTE:", Counter(y),"\nDespués de SMOTE: ", Counter(Y_SMOTE))

         S1  C1  S2  C2  S3  C3  S4  C4  S5  C5
851142    4   6   4   1   4   6   4  10   4   4
41325     3   2   3  12   1   3   1   1   4   6
207671    2  12   4   2   2   7   4  11   4  13
902530    2   2   4   2   2   2   1   1   3   1
390528    1   7   1   5   2   7   3   4   2   4
...      ..  ..  ..  ..  ..  ..  ..  ..  ..  ..
740387    3  10   3   3   3   5   3   8   3   9
998007    4   6   2   8   1   5   3   5   1   8
590424    1   8   3   6   2   8   1   7   3  10
878075    4  10   3  11   2  11   3   8   2   8
1108633   1   2   1   1   3   1   3   1   1   1

[783609 rows x 10 columns] 851142     5
41325      0
207671     0
902530     6
390528     2
          ..
740387     5
998007     6
590424     4
878075     6
1108633    7
Name: Clase, Length: 783609, dtype: int64
Antes de SMOTE: Counter({0: 150289, 1: 126833, 2: 14395, 3: 6238, 4: 1156, 5: 598, 6: 423, 7: 62}) 
Después de SMOTE:  Counter({0: 150289, 1: 150289, 2: 150289, 3: 150289, 5: 150289, 4: 150289, 6: 150289, 7: 150

In [93]:
X_train, X_test, Y_train, Y_test = train_test_split(X_SMOTE, Y_SMOTE, test_size = 0.33, random_state = 324)


In [94]:
#ÁRBOL DE DECISIÓN 
arboldecision = DecisionTreeClassifier()
parametros = {'min_samples_split': [2, 4, 10], 'max_depth': [10, 20, 37, 40]}

In [95]:
# VALIDACIÓN CRUZADA - Grid Search
clf = GridSearchCV(arboldecision, parametros, cv = 3)
clf.fit(X_train, Y_train)
clf.cv_results_
resultados = pd.DataFrame(clf.cv_results_)
means = clf.cv_results_['mean_test_score']
for mean, parametros in zip(means,  clf.cv_results_['params']):
    print("%0.3f for %r" %(mean, parametros))

print()


0.437 for {'max_depth': 10, 'min_samples_split': 2}
0.437 for {'max_depth': 10, 'min_samples_split': 4}
0.437 for {'max_depth': 10, 'min_samples_split': 10}
0.778 for {'max_depth': 20, 'min_samples_split': 2}
0.775 for {'max_depth': 20, 'min_samples_split': 4}
0.766 for {'max_depth': 20, 'min_samples_split': 10}
0.834 for {'max_depth': 37, 'min_samples_split': 2}
0.826 for {'max_depth': 37, 'min_samples_split': 4}
0.808 for {'max_depth': 37, 'min_samples_split': 10}
0.834 for {'max_depth': 40, 'min_samples_split': 2}
0.826 for {'max_depth': 40, 'min_samples_split': 4}
0.808 for {'max_depth': 40, 'min_samples_split': 10}



In [96]:
model = clf.best_estimator_
params = clf.best_params_
print('Resultado Arbol de Decisión: %0.3f con parametros %r'%( model.score(X_train,Y_train)*100, params))

Resultado Arbol de Decisión: 99.986 con parametros {'max_depth': 37, 'min_samples_split': 2}


In [42]:
'''
columnas = list(X_train.columns.values)
r = export_text(model, feature_names=columnas)
print(r)
tree.plot_tree(model) 
'''

'\ncolumnas = list(X_train.columns.values)\nr = export_text(model, feature_names=columnas)\nprint(r)\ntree.plot_tree(model) \n'

In [101]:
n_nodes = model.tree_.node_count
children_left = model.tree_.children_left
children_right = model.tree_.children_right
feature = model.tree_.feature
threshold = model.tree_.threshold


node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
for i in range(0,20):
    if is_leaves[i]:
        print("Nodo=%s Nodo hoja. " % ( i))
    else:
        print("\nnode=%s\n test node: go to node %s if X[:, %s] <= %s \nelse to "
              "node %s."
              % (
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i]
                 ))



The binary tree structure has 241805 nodes and has the following tree structure:

node=0
 test node: go to node 1 if X[:, 0] <= 3.5 
else to node 191604.

node=1
 test node: go to node 2 if X[:, 4] <= 3.5 
else to node 152321.

node=2
 test node: go to node 3 if X[:, 2] <= 3.5 
else to node 119948.

node=3
 test node: go to node 4 if X[:, 6] <= 3.5 
else to node 96223.

node=4
 test node: go to node 5 if X[:, 8] <= 3.5 
else to node 75602.

node=5
 test node: go to node 6 if X[:, 4] <= 1.5 
else to node 25457.

node=6
 test node: go to node 7 if X[:, 8] <= 1.5 
else to node 8308.

node=7
 test node: go to node 8 if X[:, 2] <= 1.5 
else to node 2357.

node=8
 test node: go to node 9 if X[:, 0] <= 1.5 
else to node 642.

node=9
 test node: go to node 10 if X[:, 6] <= 1.5 
else to node 195.

node=10
 test node: go to node 11 if X[:, 5] <= 6.5 
else to node 86.

node=11
 test node: go to node 12 if X[:, 5] <= 3.5 
else to node 53.

node=12
 test node: go to node 13 if X[:, 7] <= 1.5 
else 

In [99]:
#NAIVE BAYES 
#bayes = MultinomialNB()
bayes = GaussianNB()
#bayes = CategoricalNB()
#bayes = BernoulliNB()
#bayes = ComplementNB()

bayes.fit(X_train, Y_train)
val_cruzada = cross_validate(bayes, X_train, Y_train, cv = 3)
print('Resultado Naive Bayes: ', val_cruzada['test_score'])


Resultado Naive Bayes:  [0.20263521 0.20272163 0.20106809]


In [100]:

y_true_dt, y_pred_dt = Y_test, model.predict(X_test)
y_true_nb, y_pred_nb = Y_test, bayes.predict(X_test)

print(accuracy_score(y_true_dt, y_pred_dt)*100, "\n")
print(accuracy_score(y_true_nb, y_pred_nb)*100, "\n")

print(confusion_matrix(y_true_dt, y_pred_dt), "\n")
print(confusion_matrix(y_true_nb, y_pred_nb), "\n")

print(classification_report(y_true_dt, y_pred_dt))
print(classification_report(y_true_nb, y_pred_nb))


85.2161113813536 

20.179553032918896 

[[25838 17121  4160  1883   274   143   152    26]
 [15681 22813  6386  3451   616    81   319    43]
 [ 1755  2917 43608   870   227    17   185    35]
 [  444   983   471 47496    61     0    59    17]
 [   26    64    39    16 49782     0     3     1]
 [    9     7     0     1     0 49649     1     0]
 [   14    30    27    16     3     0 49478     5]
 [    0     5     5     6     0     0     2 49442]] 

[[22476  3034  1297  2485  5217  3633  3867  7588]
 [20397  2833  1126  2278  7797  3367  3126  8466]
 [10022  2183  1450  2613 12106  4007  4440 12793]
 [ 8043  2367  1008  3128 14201  3658  3513 13613]
 [ 7030  1107   418  2034 23345  1789  1937 12271]
 [17613     0     0  1190  4897  2668 10317 12982]
 [ 5658  1105   610  2580 16581  3462  3191 16386]
 [ 2384  1169     0  1837 19586  2007  1503 20974]] 

              precision    recall  f1-score   support

           0       0.59      0.52      0.55     49597
           1       0.52      

In [72]:
#SIN SMOTE
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 324)
