# Poker Data Set
    I have a data set of hand information for the 5 drawn cards, with labels denoting the strength of the hand.
        Train Data Set Size: 25K
        Test Data Set Size: 1M
    Even numbered Columns(0,2,4,6,8) represent the Suit of the drawn card.
    Odd numbered Columns(1,3,5,7,9) represent the Rank of the drawn card.
    
    The goal is to build a model which correctly classifies the strength of the hand.
    
    For this, I have used a simple Decision Tree Model with Feature Engineering using Basic Algebra and Stats.

### Target Label Classification:
    0: High card
    1: One Pair
    2: Two Pair
    3: Three of a Kind
    4: Straight
    5: Flush
    6: Full House
    7: Four of a Kind
    8: Straight Flush
    9: Royal Flush

### Importing Libraries and Reading Datasets:

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import scipy.stats as sm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the training and testing CSV files; neither file has a header row,
# so pandas labels the columns 0..10.
df = pd.read_csv('poker-hand-training.csv', header=None)
test_df = pd.read_csv('poker-hand-testing.csv', header=None)
# Report the (rows, columns) shape of each frame, one per line.
print(df.shape, test_df.shape, sep='\n')

(25010, 11)
(1000000, 11)


### These row indexes were picked manually to show how the data changes for every target class

In [3]:
# Display a fixed set of rows (selected by position) from the raw training data.
df.iloc[[11,10,14,59,75,30,97,16,402,73,425,513,8466,19743,5,2,4]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
11,2,6,2,1,4,13,2,4,4,9,0
10,1,1,2,1,3,9,1,5,2,3,1
14,3,8,4,12,3,9,4,2,3,2,1
59,1,5,3,13,2,13,2,7,4,5,2
75,1,4,2,13,3,1,4,4,4,1,2
30,3,13,2,7,4,11,3,11,2,11,3
97,1,9,4,6,4,9,2,9,1,1,3
16,1,4,1,1,1,3,3,5,3,2,4
402,1,11,3,12,3,10,4,9,2,13,4
73,4,8,4,4,4,2,4,1,4,7,5


### Removed the suit data and created a new feature column: 'Is_suited'
    Is_suited:
        1 = All five cards are of the same suit
        0 = The cards are not all of the same suit

In [4]:
def is_suit_conv(df):
    """Replace the five suit columns with a single 'Is_suited' flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Hand data whose columns 0, 2, 4, 6 and 8 hold the suit of each card.

    Returns
    -------
    pandas.DataFrame
        A new frame without the suit columns and with an extra integer
        column 'Is_suited' appended: 1 when all five suit columns hold the
        same value, otherwise 0. The input frame is not modified.
    """
    suit_columns = [0, 2, 4, 6, 8]
    suits = df[suit_columns]
    # A hand is suited when every suit column equals the first card's suit.
    # A row-wise comparison needs no positional lookups, so it works for any
    # index and does not rely on chained assignment.
    all_same_suit = suits.eq(suits[0], axis=0).all(axis=1)
    result = df.drop(columns=suit_columns)
    result['Is_suited'] = all_same_suit.astype('int64')
    return result
# Apply the suit conversion to the training frame, then show the same sample rows.
df = is_suit_conv(df)
df.iloc[[11,10,14,59,75,30,97,16,402,73,425,513,8466,19743,5,2,4]]

Unnamed: 0,1,3,5,7,9,10,Is_suited
11,6,1,13,4,9,0,0
10,1,1,9,5,3,1,0
14,8,12,9,2,2,1,0
59,5,13,13,7,5,2,0
75,4,13,1,4,1,2,0
30,13,7,11,11,11,3,0
97,9,6,9,9,1,3,0
16,4,1,3,5,2,4,0
402,11,12,10,9,13,4,0
73,8,4,2,1,7,5,1


## Sorting the Ranks of cards in order to perform further operations:

In [5]:
def sort_for_distance(df):
    """Return a frame whose rank columns are sorted ascending within each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with rank columns 1, 3, 5, 7, 9, the label column 10 and the
        'Is_suited' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns [1, 3, 5, 7, 9, 10, 'Is_suited'] where, for
        every row, the five rank values are in ascending order from column 1
        to column 9. Column 10 and 'Is_suited' are carried over unchanged.
    """
    rank_columns = [1, 3, 5, 7, 9]
    # np.sort sorts along the last axis, i.e. within each row (hand).
    sorted_ranks = np.sort(df[rank_columns].values)
    # Keep the caller's index so the label-aligned column assignments below
    # pair every row with its own label/flag, even for a non-default index.
    df_new = pd.DataFrame(sorted_ranks, columns=rank_columns, index=df.index)
    df_new[10] = df[10]
    df_new['Is_suited'] = df['Is_suited']
    return df_new
# Sort the ranks of every hand in the training frame, then show the sample rows.
df = sort_for_distance(df)
df.iloc[[11,10,14,59,75,30,97,16,402,73,425,513,8466,19743,5,2,4]]

Unnamed: 0,1,3,5,7,9,10,Is_suited
11,1,4,6,9,13,0,0
10,1,1,3,5,9,1,0
14,2,2,8,9,12,1,0
59,5,5,7,13,13,2,0
75,1,1,4,4,13,2,0
30,7,11,11,11,13,3,0
97,1,6,9,9,9,3,0
16,1,2,3,4,5,4,0
402,9,10,11,12,13,4,0
73,1,2,4,7,8,5,1


### Subtracting the minimum rank in the hand from the rank of every other card
    This will lead to extracting new features out of the data in order to differentiate the classes

In [6]:
def extract_distances(df):
    """Express columns 3, 5, 7 and 9 as offsets from column 1.

    The frame is modified in place: column 1 is subtracted from each of the
    other four rank columns, after which column 1 itself is set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the rank columns 1, 3, 5, 7 and 9.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    other_columns = [3, 5, 7, 9]
    # Row-wise subtraction of column 1 from all four columns at once.
    df[other_columns] = df[other_columns].sub(df[1], axis=0)
    # Column 1 is the reference value, so its own offset is zero.
    df[1] = 0
    return df
# Convert the sorted ranks of the training frame to offsets, then show the sample rows.
df = extract_distances(df)
df.iloc[[11,10,14,59,75,30,97,16,402,73,425,513,8466,19743,5,2,4]]

Unnamed: 0,1,3,5,7,9,10,Is_suited
11,0,3,5,8,12,0,0
10,0,0,2,4,8,1,0
14,0,0,6,7,10,1,0
59,0,0,2,8,8,2,0
75,0,0,3,3,12,2,0
30,0,4,4,4,6,3,0
97,0,5,8,8,8,3,0
16,0,1,2,3,4,4,0
402,0,1,2,3,4,4,0
73,0,1,3,6,7,5,1


### Extracting whether the combination of cards is sequential into the columns: 'Is_Seq1' & 'Is_Seq2'
    Is_Seq1:
        1: The hand is in sequence (five consecutive ranks)
        0: The hand is not in sequence
    Is_Seq2:
        1: The hand is in sequence with the Highest Sequence(A,K,Q,J,10)
        0: The hand is not the Highest Sequence

In [7]:
def is_sequencial_conv(df):
    """Add the sequence flags 'Is_Seq1' and 'Is_Seq2' to the frame.

    Columns 3, 5, 7 and 9 are expected to hold the offsets produced by
    ``extract_distances`` (each sorted rank minus the lowest rank).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the offset columns 3, 5, 7 and 9. It is modified in
        place by adding the two flag columns.

    Returns
    -------
    pandas.DataFrame
        The same frame with two integer columns added:
        'Is_Seq1' is 1 when the offsets are exactly 1, 2, 3, 4;
        'Is_Seq2' is 1 when the offsets are exactly 9, 10, 11, 12 (the
        A, 10, J, Q, K hand, where the Ace is stored as rank 1).
        Both are 0 otherwise.
    """
    # Boolean masks replace the original chained filtering plus chained
    # `.iloc` assignment, which depended on a default RangeIndex and on
    # pandas writing through an intermediate Series.
    consecutive = (df[3] == 1) & (df[5] == 2) & (df[7] == 3) & (df[9] == 4)
    ace_high = (df[3] == 9) & (df[5] == 10) & (df[7] == 11) & (df[9] == 12)
    df['Is_Seq1'] = consecutive.astype('int64')
    df['Is_Seq2'] = ace_high.astype('int64')
    return df
# Add the sequence flags to the training frame, then show the sample rows.
df = is_sequencial_conv(df)
df.iloc[[11,10,14,59,75,30,97,16,402,73,425,513,8466,19743,5,2,4]]

Unnamed: 0,1,3,5,7,9,10,Is_suited,Is_Seq1,Is_Seq2
11,0,3,5,8,12,0,0,0,0
10,0,0,2,4,8,1,0,0,0
14,0,0,6,7,10,1,0,0,0
59,0,0,2,8,8,2,0,0,0
75,0,0,3,3,12,2,0,0,0
30,0,4,4,4,6,3,0,0,0
97,0,5,8,8,8,3,0,0,0
16,0,1,2,3,4,4,0,1,0
402,0,1,2,3,4,4,0,1,0
73,0,1,3,6,7,5,1,0,0


### Extracting the mode of the hand for further feature extraction, creating new columns: 'First_mode' & 'First_count'
    First_mode:
        The mode of the Hand.
    First_count:
        The frequency of the mode.

In [8]:
def first_mode_extract(df):
    """Add the row-wise mode of the rank columns and its frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing columns 1, 3, 5, 7 and 9. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with two columns added: 'First_mode' (the mode of the
        five columns in each row, as reported by ``scipy.stats.mode``) and
        'First_count' (how many times that mode occurs in the row).
    """
    n_rows = df.shape[0]
    mode_result = sm.mode(df[[1,3,5,7,9]],axis=1)
    # Flatten both result arrays to one value per row before storing them.
    df['First_mode'] = mode_result[0].reshape(n_rows)
    df['First_count'] = mode_result[1].reshape(n_rows)
    return df
# Add the mode and its frequency to the training frame, then show the sample rows.
df = first_mode_extract(df)
df.iloc[[11,10,14,59,75,30,97,16,402,73,425,513,8466,19743,5,2,4]]

Unnamed: 0,1,3,5,7,9,10,Is_suited,Is_Seq1,Is_Seq2,First_mode,First_count
11,0,3,5,8,12,0,0,0,0,0,1
10,0,0,2,4,8,1,0,0,0,0,2
14,0,0,6,7,10,1,0,0,0,0,2
59,0,0,2,8,8,2,0,0,0,0,2
75,0,0,3,3,12,2,0,0,0,0,2
30,0,4,4,4,6,3,0,0,0,4,3
97,0,5,8,8,8,3,0,0,0,8,3
16,0,1,2,3,4,4,0,1,0,0,1
402,0,1,2,3,4,4,0,1,0,0,1
73,0,1,3,6,7,5,1,0,0,0,1


### Extracting the second mode of hand creating new Column: 'Second_count'
    Second_count:
        The frequency of the second mode of the Hand.

In [9]:
def second_mode_extract(df):
    """Add 'Second_count', the frequency of the second most common rank value.

    Every cell in columns 1, 3, 5, 7, 9 that equals the row's 'First_mode'
    is overwritten with a sentinel that is unique to its column (101..105).
    The sentinels can never tie with each other, so the mode count of the
    masked row is the frequency of the most common remaining value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing columns 1, 3, 5, 7, 9 and 'First_mode'. It is
        modified in place: the five columns are masked as described above
        and 'Second_count' is added.

    Returns
    -------
    pandas.DataFrame
        The same frame with the 'Second_count' column added.
    """
    rank_columns = [1, 3, 5, 7, 9]
    sentinels = [101, 102, 103, 104, 105]
    for col, sentinel in zip(rank_columns, sentinels):
        # A single `.loc` write with a boolean mask replaces the original
        # chained `df[col].iloc[labels] = ...`, which required a default
        # RangeIndex and is not guaranteed to write back into `df`.
        df.loc[df[col] == df['First_mode'], col] = sentinel
    _, count = sm.mode(df[rank_columns].values, axis=1)
    df['Second_count'] = count.reshape(df.shape[0],)
    return df
# Add the second-mode frequency to the training frame, then show the sample rows.
df = second_mode_extract(df)
df.iloc[[11,10,14,59,75,30,97,16,402,73,425,513,8466,19743,5,2,4]]

Unnamed: 0,1,3,5,7,9,10,Is_suited,Is_Seq1,Is_Seq2,First_mode,First_count,Second_count
11,101,3,5,8,12,0,0,0,0,0,1,1
10,101,102,2,4,8,1,0,0,0,0,2,1
14,101,102,6,7,10,1,0,0,0,0,2,1
59,101,102,2,8,8,2,0,0,0,0,2,2
75,101,102,3,3,12,2,0,0,0,0,2,2
30,0,102,103,104,6,3,0,0,0,4,3,1
97,0,5,103,104,105,3,0,0,0,8,3,1
16,101,1,2,3,4,4,0,1,0,0,1,1
402,101,1,2,3,4,4,0,1,0,0,1,1
73,101,1,3,6,7,5,1,0,0,0,1,1


### Extracting only the necessary features for the model:
    The original hand information is converted into the features needed for the Decision Tree algorithm.
    'First_mode' was used to extract the 'Second_count' and hence not needed as a feature.

In [10]:
def final_clean(df):
    """Drop the columns that are not used as model features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing columns 1, 3, 5, 7, 9 and 'First_mode' alongside
        the remaining columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without columns 1, 3, 5, 7, 9 and 'First_mode'; the
        input frame is left unchanged.
    """
    # `columns=` is used instead of a positional axis argument, which
    # pandas 2.0 and later no longer accept.
    return df.drop(columns=[1, 3, 5, 7, 9, 'First_mode'])
# Keep only the label and engineered columns of the training frame, then show the sample rows.
df = final_clean(df)
df.iloc[[11,10,14,59,75,30,97,16,402,73,425,513,8466,19743,5,2,4]]

Unnamed: 0,10,Is_suited,Is_Seq1,Is_Seq2,First_count,Second_count
11,0,0,0,0,1,1
10,1,0,0,0,2,1
14,1,0,0,0,2,1
59,2,0,0,0,2,2
75,2,0,0,0,2,2
30,3,0,0,0,3,1
97,3,0,0,0,3,1
16,4,0,1,0,1,1
402,4,0,1,0,1,1
73,5,1,0,0,1,1


### Final Function to transform a DataSet:

In [11]:
def transform_df(df):
    """Run the complete feature-engineering pipeline on a raw hand frame.

    The steps are applied in a fixed order, each one receiving the frame
    returned by the previous step: suit flag, rank sorting, rank offsets,
    sequence flags, first mode, second mode, and the final column clean-up.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw hand data in the layout accepted by ``is_suit_conv``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``final_clean``.
    """
    pipeline = (
        is_suit_conv,
        sort_for_distance,
        extract_distances,
        is_sequencial_conv,
        first_mode_extract,
        second_mode_extract,
        final_clean,
    )
    for step in pipeline:
        df = step(df)
    return df

### Building Model:

In [12]:
# Separate the engineered features from the target label stored in column 10.
# `columns=` replaces the positional axis argument, which pandas 2.0 and
# later no longer accept.
X = df.drop(columns=[10])
y = df[10]
# Hold out half of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both halves, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42,stratify=y)

In [13]:
# Fit a decision tree with default settings on the training half.
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order; passing them the
# other way round swaps precision with recall and transposes the matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6246
           1       1.00      1.00      1.00      5299
           2       1.00      1.00      1.00       603
           3       1.00      1.00      1.00       257
           4       1.00      1.00      1.00        47
           5       1.00      1.00      1.00        27
           6       1.00      1.00      1.00        18
           7       1.00      1.00      1.00         3
           8       1.00      1.00      1.00         3
           9       1.00      1.00      1.00         2

    accuracy                           1.00     12505
   macro avg       1.00      1.00      1.00     12505
weighted avg       1.00      1.00      1.00     12505

[[6246    0    0    0    0    0    0    0    0    0]
 [   0 5299    0    0    0    0    0    0    0    0]
 [   0    0  603    0    0    0    0    0    0    0]
 [   0    0    0  257    0    0    0    0    0    0]
 [   0    0    0    0   47   

##### You can see that the model classifies the target class perfectly.
### Now to Test the same on the Test Data Set:

In [14]:
# Preview the first five rows of the raw test frame.
test_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1,1,1,13,2,4,2,3,1,12,0
1,3,12,3,2,3,11,4,5,2,5,1
2,1,9,4,6,1,4,3,2,3,9,1
3,1,4,3,13,2,13,2,1,3,6,1
4,3,10,2,7,1,2,2,11,4,9,0


In [15]:
# Run the full feature pipeline on the test frame, then preview the first five rows.
test_df = transform_df(test_df)
test_df.head(5)

Unnamed: 0,10,Is_suited,Is_Seq1,Is_Seq2,First_count,Second_count
0,0,0,0,0,1,1
1,1,0,0,0,2,1
2,1,0,0,0,2,1
3,1,0,0,0,2,1
4,0,0,0,0,1,1


In [16]:
# Separate features from the target label (column 10) of the transformed
# test frame. `columns=` replaces the positional axis argument, which
# pandas 2.0 and later no longer accept.
X = test_df.drop(columns=[10])
y = test_df[10]
# Score the already-fitted tree on the test rows.
y_pred = dtc.predict(X)
# scikit-learn metrics take (y_true, y_pred) in that order; passing them the
# other way round swaps precision with recall and transposes the matrix.
print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    501209
           1       1.00      1.00      1.00    422498
           2       1.00      1.00      1.00     47622
           3       1.00      1.00      1.00     21121
           4       1.00      1.00      1.00      3885
           5       1.00      1.00      1.00      1996
           6       1.00      1.00      1.00      1424
           7       1.00      1.00      1.00       230
           8       1.00      1.00      1.00        12
           9       1.00      1.00      1.00         3

    accuracy                           1.00   1000000
   macro avg       1.00      1.00      1.00   1000000
weighted avg       1.00      1.00      1.00   1000000

[[501209      0      0      0      0      0      0      0      0      0]
 [     0 422498      0      0      0      0      0      0      0      0]
 [     0      0  47622      0      0      0      0      0      0      0]
 [     0      0      0

### As expected, The model scores perfectly with 100% accuracy.