In [1]:
import os, json
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Create the output directory up front so the later save cells can write into
# it; exist_ok=True keeps a re-run from failing when it already exists.
os.makedirs('../artifacts', exist_ok=True)

## Load csv and basic checks
Read only the needed columns, drop duplicate rows, and assert that there are no missing values.

In [2]:
# Columns consumed by the rest of the notebook: a suit and a rank for each of
# the five cards, plus the hand label.
usecols = ['suit1','suit2','suit3','suit4','suit5','rank1','rank2','rank3','rank4','rank5','ranking']

# Load only those columns and drop exact duplicate rows. Method chaining is
# used instead of inplace=True so the cell builds `df` in a single expression.
# NOTE(review): rows are compared as written, so the same five cards listed in
# a different order are NOT treated as duplicates here, even though a later
# cell sorts the cards inside each hand — confirm that is acceptable.
df = pd.read_csv('../dataset/poker_hands.csv', usecols=usecols).drop_duplicates()

# Later cells convert these columns straight to NumPy arrays, so fail loudly
# (and say how many cells are affected) if anything is missing.
n_missing = int(df.isnull().sum().sum())
assert n_missing == 0, f'poker_hands.csv contains {n_missing} missing values'

## Encode target labels
turn ranking names into integer class ids

In [3]:
# Fit the encoder on the 'ranking' column and obtain the integer class id of
# every row in the same call. The fitted encoder is kept so it can be saved
# and used to map ids back to the original ranking values.
label_encoder = LabelEncoder()
ranking_values = df['ranking'].to_numpy()
y = label_encoder.fit_transform(ranking_values)

## Convert suit strings to integer codes
factorize all suits once to keep mapping stable

In [4]:
# Column groups: one rank column and one suit column per card position.
rcols = ['rank1','rank2','rank3','rank4','rank5']
scols = ['suit1','suit2','suit3','suit4','suit5']

# Number of rank classes used for the one-hot encoding further down.
num_ranks = 13

# Ranks as a compact integer array with one row per hand and one column per card.
ranks = df[rcols].to_numpy(dtype=np.int16, copy=False)
# Raw suit values in the same row/column layout.
suits_str = df[scols].to_numpy(copy=False)

# Factorize every suit value in a single call so all five card positions share
# one value -> code mapping; sort=True assigns codes in sorted order of the
# unique values.
s_flat = suits_str.ravel()
s_codes, s_uniques = pd.factorize(s_flat, sort=True)

# Restore the per-hand layout and shrink the codes to int16.
s_codes = s_codes.reshape(suits_str.shape)
s_codes = s_codes.astype(np.int16, copy=False)

# Keep the code -> suit lookup (as plain strings) and its size.
suit_classes = s_uniques.astype(str).tolist()
num_suits = len(suit_classes)


## Sort the five cards inside each hand
sort by rank then suit so card order does not matter

In [5]:
# Order the five cards of every hand by rank, breaking ties by suit code, so
# that two hands holding the same cards produce the same row regardless of the
# order in which the cards were listed.
#
# np.lexsort treats the LAST key in the tuple as the primary key, hence
# (s_codes, ranks) sorts by rank first and suit second. This replaces a
# hand-built `ranks * 10 + s_codes` key, which only orders correctly while
# there are at most 10 distinct suit codes; lexsort needs no such stride and
# is a stable sort.
order = np.lexsort((s_codes, ranks), axis=-1)

# Apply the per-hand permutation to both arrays so rank and suit stay paired.
ranks_sorted = np.take_along_axis(ranks,   order, axis=1)
suits_sorted = np.take_along_axis(s_codes, order, axis=1)

## One hot encode suits and ranks
Use `np.eye` row indexing for a fast one-hot encoding.

each card in a poker hand is converted into a one-hot vector so the neural network can process categorical data numerically.

the ranks (1–13) are encoded into vectors of length 13, giving an array of shape (n, 5, 13) for n hands and 5 cards per hand.

the suits (4 types) are encoded into vectors of length 4, giving an array of shape (n, 5, 4).

this converts each hand into a structured numeric format suitable for input to the model.

In [6]:
# Guard the table lookups below. NumPy wraps negative indices, so without
# these checks a rank of 0 (index -1) or a suit code of -1 would silently be
# encoded as the LAST class instead of raising an error.
rank_in_range = (ranks_sorted >= 1) & (ranks_sorted <= num_ranks)
assert rank_in_range.all(), f'rank values must lie in 1..{num_ranks}'
suit_in_range = (suits_sorted >= 0) & (suits_sorted < num_suits)
assert suit_in_range.all(), f'suit codes must lie in 0..{num_suits - 1}'

# Row i of an identity matrix is the one-hot vector of class i, so indexing
# the identity with an integer array one-hot encodes every entry at once.
# Ranks start at 1, hence the -1 shift to a zero-based row index.
r_oh = np.eye(num_ranks, dtype=np.uint8)[ranks_sorted - 1]   # (n, 5, num_ranks)
s_oh = np.eye(num_suits, dtype=np.uint8)[suits_sorted]       # (n, 5, num_suits)

## Build final feature matrix
Flatten the five cards of each hand and concatenate the suit one-hot block with the rank one-hot block.

we need this because the model takes each hand as a single flat vector of numbers, not as text or categories

each poker hand has 5 cards, and every card has a suit and a rank
but those are categorical values like "hearts" or 7

so we:

turn each suit and rank into one-hot numbers (so the model can understand categories)

flatten all those one-hot values for the 5 cards into one long row of numbers

combine them all into a big matrix where every row = one hand and every column = one feature

this final matrix X is what we train the model on; it is the numeric representation of all poker hands

In [7]:
# Collapse the per-card axis so each hand becomes a single row: first all suit
# one-hot values, then all rank one-hot values.
suit_features = s_oh.reshape(s_oh.shape[0], -1)
rank_features = r_oh.reshape(r_oh.shape[0], -1)

# Join the two blocks column-wise; the result is kept as uint8.
X = np.concatenate((suit_features, rank_features), axis=1)
X = X.astype(np.uint8, copy=False)

## Split into train val test with stratify
The data is divided into three parts so the model can be trained and tested fairly:

train: used to teach the model (70% of the data)

val: used to tune model settings during training (15%)

test: used only at the end to check real performance (15%)

stratify=y keeps the same class balance (same ratio of poker hand types) in each split, so no class is missing or over-represented in any subset.

In [8]:
# First cut: 70% of the hands go to training, 30% are held out. Stratifying
# on y keeps the class proportions the same on both sides.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Second cut: the held-out 30% is halved into validation and test sets,
# again stratified on the labels.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

## Balance the training set
Some poker hand types appear far more often than others, which can make the model biased toward common hands.
This code balances the training set so each class has the same number of examples:

TARGET_PER_CLASS = 2000 sets how many examples each class should have.

If a class has more than 2000 examples, it randomly removes extras (undersampling).

If a class has fewer, it randomly duplicates existing ones (oversampling).

All selected examples are then shuffled to mix the classes evenly.

In [9]:
# Number of training rows every class ends up with.
TARGET_PER_CLASS = 2000
# Seeded generator so the resampling is reproducible.
rng = np.random.default_rng(42)

# Row positions of the training set, grouped by class id.
classes = np.unique(y_train)
idx_by_class = {c: np.where(y_train == c)[0] for c in classes}

# Draw exactly TARGET_PER_CLASS positions per class, visiting the classes in
# sorted order. Sampling is without replacement when the class has enough rows
# (undersampling) and with replacement when it does not (oversampling).
new_idx = np.concatenate([
    rng.choice(
        idx_by_class[c],
        size=TARGET_PER_CLASS,
        replace=len(idx_by_class[c]) < TARGET_PER_CLASS,
    )
    for c in classes
])

# Shuffle in place so the classes are interleaved rather than stored in blocks.
rng.shuffle(new_idx)

# Materialise the balanced features and labels with the same row selection.
X_train_bal = X_train[new_idx].astype(np.uint8, copy=False)
y_train_bal = y_train[new_idx]

## Save arrays we will use later
Save the balanced training set and the unbalanced validation and test sets.

In [10]:
# Destination file -> array to store there. The training pair is the balanced
# version; validation and test are written as they came out of the split, with
# the feature matrices stored as uint8.
arrays_to_save = [
    ('../artifacts/X_train_balanced.npy', X_train_bal),
    ('../artifacts/y_train_balanced.npy', y_train_bal),
    ('../artifacts/X_val.npy', X_val.astype(np.uint8, copy=False)),
    ('../artifacts/y_val.npy', y_val),
    ('../artifacts/X_test.npy', X_test.astype(np.uint8, copy=False)),
    ('../artifacts/y_test.npy', y_test),
]
for out_path, out_array in arrays_to_save:
    np.save(out_path, out_array)

## Save encoder and metadata
Write the label classes, suit classes, and sizes to disk for later use.

In [11]:
# Persist the fitted label encoder so class ids can be mapped back later.
joblib.dump(label_encoder, '../artifacts/label_encoder.joblib')

# Sizes of the two evaluation splits and a description of the balanced
# training set, cast to plain ints so they serialise as JSON numbers.
split_sizes = {'val': int(len(X_val)), 'test': int(len(X_test))}
balanced_train = {
    'target_per_class': int(TARGET_PER_CLASS),
    'size': int(len(y_train_bal)),
}

# Everything a later step needs to interpret the saved arrays.
meta = {
    'label_classes': label_encoder.classes_.tolist(),
    'suit_classes': suit_classes,
    'num_suits': int(num_suits),
    'num_ranks': int(num_ranks),
    'x_dim': int(X.shape[1]),
    'split_sizes': split_sizes,
    'balanced_train': balanced_train,
    'random_state': 42
}

# Write the metadata as indented JSON next to the arrays.
with open('../artifacts/metadata.json','w') as meta_file:
    meta_file.write(json.dumps(meta, indent=2))

## Quick summary prints
confirm feature size and class balance

In [12]:
# Count how many rows each class has in the balanced training set and report
# it together with the feature width, as a quick sanity check.
u, c = np.unique(y_train_bal, return_counts=True)
balanced_counts = {int(label): int(count) for label, count in zip(u, c)}

print('done fast prep')
print('x dim', X.shape[1])
print('balanced counts', balanced_counts)

done fast prep
x dim 85
balanced counts {0: 2000, 1: 2000, 2: 2000, 3: 2000, 4: 2000, 5: 2000, 6: 2000, 7: 2000, 8: 2000, 9: 2000}
