In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.combine import SMOTETomek
import graphviz 

In [2]:
# Load the poker-hand sample from disk, then report the frame's shape and
# how many rows belong to each value of the 'Clase' column.
df = pd.read_csv('poker-hand-tiny.csv')
print("Contenido de data set: ", df.shape)
print("Cantidad por clase:\n", df['Clase'].value_counts())

Contenido de data set:  (300000, 11)
Cantidad por clase:
 0    150289
1    126833
2     14395
3      6238
4      1156
5       598
6       423
7        62
8         4
9         2
Name: Clase, dtype: int64


In [3]:
# Clean the data: drop every row whose 'Clase' is 8 or 9.
# The rows are selected with one boolean mask and removed by index label,
# exactly as the per-class loop did.
df = df.drop(df.index[df['Clase'].isin([8, 9])])


In [4]:
# Split the frame by column position: the first ten columns are the
# features, the eleventh column (position 10) is the label.
x_train = df.iloc[:, :10]
y_train = df.iloc[:, 10]

In [5]:
# SYNTHETIC DATA CREATION
# Rebalance the classes with SMOTETomek and print the class counts before
# and after resampling.
# - `fit_resample` is the supported API; the old `fit_sample` alias was
#   removed from imbalanced-learn, so calling it fails on current releases.
# - A fixed `random_state` makes the resampled data (and every score
#   computed from it further down) reproducible across runs.
smote = SMOTETomek(sampling_strategy='auto', random_state=42)
X_train, Y_train = smote.fit_resample(x_train, y_train)
print("Antes de SMOTE:", Counter(y_train),"\nDespués de SMOTE: ", Counter(Y_train))

Antes de SMOTE: Counter({0: 150289, 1: 126833, 2: 14395, 3: 6238, 4: 1156, 5: 598, 6: 423, 7: 62}) 
Después de SMOTE:  Counter({7: 150289, 5: 150285, 6: 150285, 4: 150280, 3: 150185, 2: 149738, 0: 134240, 1: 133876})


In [13]:
# DECISION TREE
# Hyper-parameter grid to explore (2 x 2 = 4 combinations) and the
# default-configured estimator the search will tune.
parametros = dict(max_depth=[10, 20], min_samples_split=[2, 20])
arboldecision = DecisionTreeClassifier()

In [14]:
# CROSS-VALIDATION - Grid Search
# Fit one tree per grid combination with 3-fold cross-validation and print
# the mean test score of each combination.
# NOTE(review): X_train/Y_train are the already-resampled data, so the
# validation folds contain synthetic samples; the scores are probably
# optimistic compared with untouched data. Confirm before reporting them.
clf = GridSearchCV(arboldecision, parametros, cv = 3)
clf.fit(X_train, Y_train)
resultados = pd.DataFrame(clf.cv_results_)
means = clf.cv_results_['mean_test_score']
# The loop variable must NOT be called `parametros`: that would overwrite the
# grid defined above and break any later re-run of this search.
for mean, combo in zip(means, clf.cv_results_['params']):
    print("%0.3f for %r" %(mean, combo))

print()

0.447 for {'max_depth': 10, 'min_samples_split': 2}
0.447 for {'max_depth': 10, 'min_samples_split': 20}
0.802 for {'max_depth': 20, 'min_samples_split': 2}
0.780 for {'max_depth': 20, 'min_samples_split': 20}



In [15]:
# Keep the best estimator selected by the grid search and print its accuracy
# (as a percentage) on X_train/Y_train together with the estimator itself.
model = clf.best_estimator_
train_accuracy_pct = model.score(X_train, Y_train) * 100
print('Resultado Arbol de Decisión:', train_accuracy_pct, model)

Resultado Arbol de Decisión: 88.88047842159193 DecisionTreeClassifier(max_depth=20)


In [31]:
# Inspect the fitted decision tree through the parallel arrays exposed by
# `model.tree_` (one entry per node id) and print a summary of its first nodes.
n_nodes = model.tree_.node_count
children_left = model.tree_.children_left
children_right = model.tree_.children_right
feature = model.tree_.feature
threshold = model.tree_.threshold

# Depth-first walk from the root to record each node's depth and whether it
# is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while stack:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # A test (split) node has two different children; when both child ids
    # are equal the node is treated as a leaf.
    if children_left[node_id] != children_right[node_id]:
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
# Only the first 20 nodes are printed to keep the output short. The bound is
# capped at n_nodes so a smaller tree does not index past the arrays.
for i in range(min(20, n_nodes)):
    if is_leaves[i]:
        print("Nodo=%s Nodo hoja. " % ( i))
    else:
        print("\nnode=%s\n test node: go to node %s if X[:, %s] <= %s \nelse to "
              "node %s."
              % (
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i]
                 ))





The binary tree structure has 165767 nodes and has the following tree structure:

node=0
 test node: go to node 1 if X[:, 0] <= 3.5 
else to node 120776.

node=1
 test node: go to node 2 if X[:, 4] <= 3.5 
else to node 85997.

node=2
 test node: go to node 3 if X[:, 2] <= 3.5 
else to node 59256.

node=3
 test node: go to node 4 if X[:, 8] <= 3.5 
else to node 39659.

node=4
 test node: go to node 5 if X[:, 6] <= 3.5 
else to node 29244.

node=5
 test node: go to node 6 if X[:, 4] <= 1.5 
else to node 13429.

node=6
 test node: go to node 7 if X[:, 8] <= 1.5 
else to node 5330.

node=7
 test node: go to node 8 if X[:, 0] <= 1.5 
else to node 2017.

node=8
 test node: go to node 9 if X[:, 2] <= 1.5 
else to node 628.

node=9
 test node: go to node 10 if X[:, 6] <= 1.5 
else to node 179.

node=10
 test node: go to node 11 if X[:, 5] <= 6.5 
else to node 80.

node=11
 test node: go to node 12 if X[:, 5] <= 3.5 
else to node 41.

node=12
 test node: go to node 13 if X[:, 7] <= 1.5 
else to

In [17]:
# NAIVE BAYES
# Multinomial Naive Bayes on the same resampled data: one model fitted on all
# of it, plus a 3-fold cross-validation of the same estimator.
# The first print is the accuracy (%) on the fitting data, the second is the
# per-fold test score (%).
bayes = MultinomialNB().fit(X_train, Y_train)
val_cruzada = cross_validate(bayes, X_train, Y_train, cv = 3)
train_accuracy_nb = bayes.score(X_train, Y_train) * 100
fold_scores_nb = val_cruzada['test_score'] * 100
print('Resultado Naive Bayes: ', train_accuracy_nb)
print('Resultado Naive Bayes: ', fold_scores_nb)

Resultado Naive Bayes:  16.797100184916243
Resultado Naive Bayes:  [16.69942472 16.83721384 16.84953018]


In [18]:
# Compare both classifiers on X_train/Y_train: accuracy (%) for the decision
# tree and then Naive Bayes, followed by their confusion matrices in the
# same order.
y_true_dt = Y_train
y_pred_dt = model.predict(X_train)
y_true_nb = Y_train
y_pred_nb = bayes.predict(X_train)

evaluated = ((y_true_dt, y_pred_dt), (y_true_nb, y_pred_nb))
for truth, predicted in evaluated:
    print(accuracy_score(truth, predicted)*100)
for truth, predicted in evaluated:
    print(confusion_matrix(truth, predicted))

88.88047842159193
16.797100184916243
[[108348   8551   7354   5211   1473   1720   1265    318]
 [ 13135  94980  10259   8511   3343   1134   1932    582]
 [  3288   3543 121325   7472   7282   1168   4322   1338]
 [  1340   1797   4781 128679   7194    834   3705   1855]
 [    45     65    524   1539 142497    342   3122   2146]
 [    66      5     36    178    207 149742     51      0]
 [    98     41    542    542   3415    230 144334   1083]
 [     2      4      3     96    502    178    238 149266]]
[[ 6245     3  1150  8148 15950 38973 22048 41723]
 [ 7413    11  1001  7175 20450 36495 20552 40779]
 [ 7654     2  1033  7347 32323 34839 23293 43247]
 [ 8849     0  1237  5857 36925 35524 19665 42128]
 [12917     0  1267  4207 53870 24386 15139 38494]
 [10742     0   259  6576 32303 41810 20503 38092]
 [ 9657     0  1863  8231 44269 24003 18150 44112]
 [ 7436     0     0   410 45612 21190  6229 69412]]


In [None]:
# Report the fitted tree's depth and its number of leaves, one per line.
print(model.get_depth(), model.get_n_leaves(), sep="\n")