In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [149]:
# Load the training and testing splits. Neither file has a header row, so the
# column names are supplied here: suit (S) and rank (C) for each of the five
# cards, followed by the hand label.
poker, poker_test = (
    pd.read_csv(
        path,
        header=None,
        names=[f'{kind}{i}' for i in range(1, 6) for kind in 'SC'] + ['CLASS'],
    )
    for path in ('poker-hand-training-true.data', 'poker-hand-testing.data')
)

# Separate the ten card columns from the label
x_train, y_train = poker.drop(columns='CLASS'), poker['CLASS']
x_test, y_test = poker_test.drop(columns='CLASS'), poker_test['CLASS']

# Standardise using statistics from the training split only, then apply the
# same transform to the test split
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [27]:
# Per-sample weights from the 'balanced' class-weight heuristic, passed to the
# tree so that fitting is weighted by them
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X=x_train, y=y_train, sample_weight=sample_weights)

In [33]:
# Predict the test data with the fitted decision tree
y_pred = dt.predict(x_test)

# Share of correct predictions on the test split and on the training split
accuracy = (y_pred == y_test).mean()
print(f'Accuracy: {accuracy:.2f}')
train_accuracy = (dt.predict(x_train) == y_train).mean()
print(f'Train accuracy: {train_accuracy:.2f}')

# Cross-tabulate actual labels (rows) against predicted labels (columns)
confusion_matrix = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(confusion_matrix)

Accuracy: 0.49
Train accuracy: 1.00
Predicted       0       1      2      3     4     5     6    7    8    9
Actual                                                                  
0          287647  184159  17990   7908  1291  1448   609   74   24   59
1          181877  196724  28371  11029  2044   883  1156  169   37  208
2           14958   24810   5212   1807   449    55   274   26    5   26
3            6311   10304   1772   2241   240    17   166   43    2   25
4            1125    1912    385    249   144     9    26    2    8   25
5             977     514     51     28     7   310     0    0  108    1
6             311     730    194    142    26     2    16    3    0    0
7              29     118     24     48     6     0     3    1    0    1
8               2       5      2      0     1     2     0    0    0    0
9               1       1      1      0     0     0     0    0    0    0


In [34]:
# Fit a random forest using the previously computed sample weights
rf = RandomForestClassifier(random_state=0).fit(
    x_train, y_train, sample_weight=sample_weights
)

# Predict the test data
y_pred = rf.predict(x_test)

# Share of correct predictions on the test split and on the training split
accuracy = (y_pred == y_test).mean()
print(f'Accuracy: {accuracy:.2f}')
train_accuracy = (rf.predict(x_train) == y_train).mean()
print(f'Train accuracy: {train_accuracy:.2f}')

# Cross-tabulate actual labels (rows) against predicted labels (columns)
confusion_matrix = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(confusion_matrix)

Accuracy: 0.60
Train accuracy: 1.00
Predicted       0       1    2   3   4   5  6  7  9
Actual                                             
0          403625   97568   14   0   1   1  0  0  0
1          228922  193353  178  31  11   0  1  0  2
2           15141   32367  108   6   0   0  0  0  0
3            4738   16293   12  77   0   0  0  1  0
4             647    3227    5   0   3   0  0  0  3
5            1766     134    0   0   0  96  0  0  0
6             144    1269    6   5   0   0  0  0  0
7               7     215    2   6   0   0  0  0  0
8               5       5    0   0   0   2  0  0  0
9               1       2    0   0   0   0  0  0  0


---

In [None]:
def create_feats(df):
    ''' Create features for the poker hand dataset

    Replaces the ten raw card columns (``S1``/``C1`` ... ``S5``/``C5``) with
    order-independent summary features. All other columns (e.g. ``CLASS``)
    are kept unchanged and stay in front of the new ones.

    New columns, in order:
      * ``card 1`` .. ``card 13`` - how many of the five cards have that rank
      * ``suit 1`` .. ``suit 4``  - how many of the five cards have that suit
      * ``is_sequental``          - True when the sorted ranks increase by
        exactly 1 each step, or are exactly [1, 10, 11, 12, 13]
        (column name spelling kept as-is; it is part of the output schema)
      * ``is_flush``              - True when one suit accounts for all five cards
      * ``max_card_count``        - largest of the per-rank counts
      * ``second_max_card_count`` - second largest of the per-rank counts

    The computation is vectorised with NumPy instead of row-wise
    ``DataFrame.apply`` so it stays fast on large inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``S1..S5`` and ``C1..C5``. Not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as ``df``.

    Raises
    ------
    KeyError
        If any of the ten card columns is missing.
    '''
    rank_cols = [f'C{i}' for i in range(1, 6)]
    suit_cols = [f'S{i}' for i in range(1, 6)]
    ranks = df[rank_cols].to_numpy()
    suits = df[suit_cols].to_numpy()

    # Compare every card against every possible value, then sum over the five
    # card positions: result[row, v - 1] == number of cards in `row` equal to v.
    rank_counts = (ranks[:, :, None] == np.arange(1, 14)).sum(axis=1, dtype=np.int64)
    suit_counts = (suits[:, :, None] == np.arange(1, 5)).sum(axis=1, dtype=np.int64)

    # check sequential: consecutive sorted ranks, plus the 10-J-Q-K-A case
    # where rank 1 sits after rank 13 and so is not caught by the difference test
    sorted_ranks = np.sort(ranks, axis=1)
    is_sequential = (np.diff(sorted_ranks, axis=1) == 1).all(axis=1)
    is_sequential |= (sorted_ranks == [1, 10, 11, 12, 13]).all(axis=1)

    # Sorting the per-rank counts gives the highest and second highest directly
    sorted_counts = np.sort(rank_counts, axis=1)

    # dict insertion order defines the output column order
    feats = {f'card {rank}': rank_counts[:, rank - 1] for rank in range(1, 14)}
    feats.update({f'suit {suit}': suit_counts[:, suit - 1] for suit in range(1, 5)})
    feats['is_sequental'] = is_sequential
    feats['is_flush'] = (suit_counts == 5).any(axis=1)
    feats['max_card_count'] = sorted_counts[:, -1]
    feats['second_max_card_count'] = sorted_counts[:, -2]

    # drop() returns a new frame, so the caller's DataFrame is left untouched
    return df.drop(columns=suit_cols + rank_cols).assign(**feats)

---

In [None]:
def create_feats_with_card_counts(df):
    ''' Create features for the poker hand dataset and with highest and 2nd highest card counts

    This function used to be a line-for-line copy of ``create_feats``, which
    already emits ``max_card_count`` and ``second_max_card_count``. It now
    delegates to it so the two cannot drift apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the raw card columns ``S1..S5`` and ``C1..C5``.

    Returns
    -------
    pandas.DataFrame
        Exactly what ``create_feats(df)`` returns.
    '''
    return create_feats(df)

In [175]:
# Replace the raw card columns with the engineered features on both splits
poker_after, poker_test_after = create_feats(poker), create_feats(poker_test)

# Separate the engineered features from the label
x_train, y_train = poker_after.drop(columns='CLASS'), poker_after['CLASS']
x_test, y_test = poker_test_after.drop(columns='CLASS'), poker_test_after['CLASS']

In [176]:
# Recompute the 'balanced' per-sample weights for the current labels and fit
# a decision tree with them
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
dt = DecisionTreeClassifier(random_state=0).fit(
    x_train, y_train, sample_weight=sample_weights
)

# Predict the test data
y_pred = dt.predict(x_test)

# Share of correct predictions on the test split and on the training split
accuracy = (y_pred == y_test).mean()
print(f'Accuracy: {accuracy:.2f}')
train_accuracy = (dt.predict(x_train) == y_train).mean()
print(f'Train accuracy: {train_accuracy:.2f}')

# Cross-tabulate actual labels (rows) against predicted labels (columns)
confusion_matrix = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(confusion_matrix)

Accuracy: 1.00
Train accuracy: 1.00
Predicted       0       1      2      3     4     5     6    7   8  9
Actual                                                               
0          501209       0      0      0     0     0     0    0   0  0
1               0  422498      0      0     0     0     0    0   0  0
2               0       0  47622      0     0     0     0    0   0  0
3               0       0      0  21121     0     0     0    0   0  0
4               0       0      0      0  3885     0     0    0   0  0
5               0       0      0      0     0  1996     0    0   0  0
6               0       0      0      0     0     0  1424    0   0  0
7               0       0      0      0     0     0     0  230   0  0
8               0       0      0      0     0     0     0    0  12  0
9               0       0      0      0     0     0     0    0   0  3


In [177]:
# Fit a random forest using the previously computed sample weights
rf = RandomForestClassifier(random_state=0).fit(
    x_train, y_train, sample_weight=sample_weights
)

# Predict the test data
y_pred = rf.predict(x_test)

# Share of correct predictions on the test split and on the training split
accuracy = (y_pred == y_test).mean()
print(f'Accuracy: {accuracy:.2f}')
train_accuracy = (rf.predict(x_train) == y_train).mean()
print(f'Train accuracy: {train_accuracy:.2f}')

# Cross-tabulate actual labels (rows) against predicted labels (columns)
confusion_matrix = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(confusion_matrix)

Accuracy: 1.00
Train accuracy: 1.00
Predicted       0       1      2      3     4     5     6    7  8  9
Actual                                                              
0          501209       0      0      0     0     0     0    0  0  0
1               0  422498      0      0     0     0     0    0  0  0
2               0       0  47622      0     0     0     0    0  0  0
3               0       0      0  21121     0     0     0    0  0  0
4               0       0      0      0  3885     0     0    0  0  0
5               0       0      0      0     0  1996     0    0  0  0
6               0       0      0      0     0     0  1424    0  0  0
7               0       0      0     82     0     0     0  148  0  0
8               0       0      0      0     0     3     0    0  8  1
9               0       0      0      0     0     0     0    0  0  3


---

In [178]:
# First level estimators.
# Each one is seeded (random_state=0, matching the other model cells in this
# notebook) so that re-running the cell reproduces the same fitted models.
estimators = [
    ('rf1', RandomForestClassifier(
        n_estimators=100,
        max_depth=15,
        min_samples_split=5,
        class_weight='balanced',
        random_state=0
    )),
    ('rf2', RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        min_samples_split=2,
        class_weight='balanced',
        random_state=0
    )),
    ('dt', DecisionTreeClassifier(
        max_depth=15,
        min_samples_split=5,
        class_weight='balanced',
        random_state=0
    ))
]

# Stack them with a final estimator: a logistic regression combines the
# first-level estimators' outputs
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5
)

stack.fit(x_train, y_train)

# Predict the test data
y_pred = stack.predict(x_test)

# Calculate the accuracy
accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy:.2f}')
train_accuracy = np.mean(stack.predict(x_train) == y_train)
print(f'Train accuracy: {train_accuracy:.2f}')

# Calculate the confusion matrix
confusion_matrix = pd.crosstab(
    y_test, y_pred, rownames=['Actual'], colnames=['Predicted']
)
print(confusion_matrix)

Accuracy: 1.00
Train accuracy: 1.00
Predicted       0       1      2      3     4     5     6    7   8  9
Actual                                                               
0          501209       0      0      0     0     0     0    0   0  0
1               0  422498      0      0     0     0     0    0   0  0
2               0       0  47622      0     0     0     0    0   0  0
3               0       0      0  21121     0     0     0    0   0  0
4               0       0      0      0  3885     0     0    0   0  0
5               0       0      0      0     0  1996     0    0   0  0
6               0       0      0      0     0     0  1424    0   0  0
7               0       0      0      0     0     0     0  230   0  0
8               0       0      0      0     0     0     0    0  12  0
9               0       0      0      0     0     0     0    0   0  3


---

In [None]:
# # find the difference between test and predicted rows
# diff = x_test.loc[y_test != y_pred]
# diff['Actual'] = y_test[y_test != y_pred]
# diff['Predicted'] = y_pred[y_test != y_pred]
# diff

In [None]:
# poker_test.iloc[64]

S1        2
C1       12
S2        3
C2       12
S3        3
C3        7
S4        3
C4       11
S5        2
C5        7
CLASS     2
Name: 64, dtype: int64