In [3]:
import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))  # makes ../util visible

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib
import json
import matplotlib.pyplot as plt
from util.folder_structure_builder import build_part_1

In [4]:
# Read the raw poker-hands table and discard exact duplicate rows in one step.
# The surviving rows keep their original index labels (the index is not reset).
df = pd.read_csv('../../dataset/poker_hands.csv').drop_duplicates()

# Sanity check: everything below assumes the table has no missing values at all.
assert not df.isnull().values.any()

# A single encoder is fitted on the values of all five suit columns together,
# so a given suit gets the same integer code whichever card slot it sits in.
suit_columns = [f'suit{i}' for i in range(1, 6)]
suit_encoder = LabelEncoder().fit(df[suit_columns].to_numpy().ravel())

# Swap each suit column for its integer codes (arrays are assigned by position).
df = df.assign(**{col: suit_encoder.transform(df[col]) for col in suit_columns})

# Target: ranking names -> integer class ids.
# 'ranking' itself stays untouched in df so the names remain available later.
label_encoder = LabelEncoder().fit(df['ranking'])
y = label_encoder.transform(df['ranking'])

# Bar chart of how many hands fall into each ranking, labelled with the
# original ranking names; it makes the class imbalance visible.
# Bars are ordered like label_encoder.classes_; a class with no rows would show as 0.
counts = df['ranking'].value_counts().reindex(label_encoder.classes_, fill_value=0)

# savefig does not create missing folders: without this a run where
# ../artifacts does not exist yet stops here with FileNotFoundError.
os.makedirs('../artifacts', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(counts.index, counts.values)
ax.set_ylabel('count')
ax.set_title('ranking counts in dataset')
# Slant the ranking names so neighbouring labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig('../artifacts/ranking_counts.png', dpi=150)
# The figure is only written to disk; close it so it is not kept open.
plt.close(fig)

# Feature matrix: every column except the target.
X = df.drop(columns='ranking')

# 70 / 15 / 15 split into train / validation / test.
# Both steps stratify on the encoded label so each subset keeps the class
# proportions, and both use the same seed so the split is reproducible.
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=split_seed, stratify=y
)
# Halve the 30% hold-out into validation and test parts.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed, stratify=y_temp
)

# Persist what later steps need: the six split arrays, both fitted encoders,
# and a JSON description of this preprocessing run.
artifacts_dir = '../artifacts'
# np.save, joblib.dump and open() do not create missing folders themselves.
os.makedirs(artifacts_dir, exist_ok=True)

# Features are written as plain ndarrays, so column names are lost in the
# .npy files; they are recorded in metadata.json below instead.
split_arrays = {
    'X_train': X_train.values,
    'X_val': X_val.values,
    'X_test': X_test.values,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}
for array_name, array in split_arrays.items():
    np.save(f'{artifacts_dir}/{array_name}.npy', array)

# Fitted encoders, so the integer codes can be mapped back to names later.
joblib.dump(suit_encoder, f'{artifacts_dir}/suit_encoder.joblib')
joblib.dump(label_encoder, f'{artifacts_dir}/label_encoder.joblib')

# .tolist() converts NumPy scalars to native Python values; list(...) would
# keep e.g. np.int64 entries, which json.dump rejects with a TypeError if the
# suits or rankings are stored as numbers.
metadata = {
    "columns": X.columns.tolist(),
    "label_classes": label_encoder.classes_.tolist(),
    "suit_classes": suit_encoder.classes_.tolist(),
    "split_sizes": {
        "train": len(X_train),
        "val": len(X_val),
        "test": len(X_test)
    },
    # must match the random_state passed to train_test_split above
    "random_state": 42
}
with open(f'{artifacts_dir}/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

# Report the (rows, columns) shape of each split, then the ranking names in
# the order the label encoder numbers them.
for split_name, split_frame in (("train", X_train), ("val", X_val), ("test", X_test)):
    print(f"{split_name} shape", split_frame.shape)
print("class names", label_encoder.classes_)


train shape (715939, 10)
val shape (153416, 10)
test shape (153416, 10)
class names ['flush' 'four_of_a_kind' 'full_house' 'nothing' 'one_pair' 'royal_flush'
 'straight' 'straight_flush' 'three_of_a_kind' 'two_pairs']
