In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

data_dir = 'dataset_cleaned'
output_dir = 'split_data'
os.makedirs(output_dir, exist_ok=True)

# 1. Combine all CSVs into one DataFrame.
# os.listdir() returns entries in arbitrary, platform-dependent order, so the
# names are sorted first: with a fixed row order going in, the seeded shuffle
# and splits below select the same rows on every machine.
csv_paths = [
    os.path.join(data_dir, filename)
    for filename in sorted(os.listdir(data_dir))
    if filename.lower().endswith('.csv')
]
if not csv_paths:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate".
    raise ValueError(f"No CSV files found in '{data_dir}'.")

combined_df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)
total_rows = len(combined_df)

# 2. Shuffle the entire combined dataset
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# 3. Split into train, val, test (80/10/10 example)
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# First carve off the training rows, then divide what is left between
# validation and test in proportion to their ratios.
train_df, temp_df = train_test_split(combined_df, test_size=(1 - train_ratio), random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=(test_ratio/(test_ratio+val_ratio)), random_state=42)

del combined_df, temp_df  # free memory

# Every input row must land in exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == total_rows

# 4. Save the splits
train_path = os.path.join(output_dir, 'train.csv')
val_path = os.path.join(output_dir, 'val.csv')
test_path = os.path.join(output_dir, 'test.csv')

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"Data has been successfully split into train/val/test and saved in '{output_dir}' directory.")
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

Data has been successfully split into train/val/test and saved in 'split_data' directory.
Train size: 313034, Val size: 39129, Test size: 39130


In [4]:
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
import os
import pandas as pd
import numpy as np
from tensorflow.keras.optimizers import Adam

##############################################
# Parameters & Setup
##############################################
# Model / pipeline sizes: length of the player-ID vector, embedding width,
# and mini-batch size.
seq_len, embedding_dim, batch_size = 10, 8, 64
data_dir = 'split_data'  # Directory containing train.csv, val.csv, test.csv

# Five offensive slots followed by five defensive slots, in that order.
player_columns = [
    f"{side}_PLAYER{slot}_ID"
    for side in ("OFF", "DEF")
    for slot in range(1, 6)
]

# Target columns
main_out_column = "OUTCOME"
second_chance_column = "SECOND_CHANCE"

# Per-event ID columns that are dropped before building model inputs
unwanted_cols = [
    f"{role}_ID"
    for role in ("SHOOTER", "ASSISTER", "BLOCKER", "STEALER", "REBOUNDER", "TURNOVER")
]

# One CSV per split inside data_dir
train_path, val_path, test_path = (
    os.path.join(data_dir, f"{split}.csv") for split in ("train", "val", "test")
)

##############################################
# Build Player ID Mapping
##############################################
def gather_unique_player_ids_from_csvs(file_paths, player_cols):
    """Collect the distinct player IDs found in the given CSV files.

    Only the ``player_cols`` columns are read from each file. A row
    contributes its IDs only if none of those columns is missing in that row.

    Args:
        file_paths: Iterable of CSV paths to scan.
        player_cols: List of column names holding player IDs.

    Returns:
        A ``set`` with every ID (cast to integer) seen in any file.
    """
    collected = set()
    for csv_path in file_paths:
        # Discard rows where any player slot is empty; after this no column
        # contains NaN, so each one can be cast to int directly.
        complete_rows = pd.read_csv(csv_path, usecols=player_cols).dropna(subset=player_cols)
        collected |= {
            player_id
            for column in player_cols
            for player_id in complete_rows[column].astype(int).unique()
        }
    return collected

# Scan all three splits so that every player appearing in any of them
# receives an index.
all_files = [train_path, val_path, test_path]
all_unique_ids = gather_unique_player_ids_from_csvs(all_files, player_columns)

# Sorted raw IDs -> contiguous indices 0 .. v-1; v is the vocabulary size
# handed to the embedding layer.
unique_players = np.array(sorted(all_unique_ids))
player_to_index = dict(zip(unique_players, range(len(unique_players))))
v = len(unique_players)
print(f"Number of unique players: {v}")
print("Example mapping:", list(player_to_index.items())[:10])

##############################################
# Model Definition Using Embeddings
##############################################
# Input: one row of seq_len player indices (offense slots first, then defense,
# following the order of player_columns).
input_players = Input(shape=(seq_len,), dtype='int32', name='players_input')
player_embedding = layers.Embedding(input_dim=v, output_dim=embedding_dim, name='player_embedding')(input_players)

# Built-in layers are used rather than Lambda wrappers around Python lambdas:
# Lambda layers are serialized as Python bytecode, which makes a saved model
# non-portable. Cropping1D / GlobalAveragePooling1D compute the same slices
# and means and serialize cleanly.
n_offense = 5  # the first five positions belong to the offensive lineup

# Keep positions [0, n_offense) for offense and [n_offense, seq_len) for defense.
offense_emb = layers.Cropping1D(cropping=(0, seq_len - n_offense), name='offense_slice')(player_embedding)
defense_emb = layers.Cropping1D(cropping=(n_offense, 0), name='defense_slice')(player_embedding)

# Average the player embeddings of each lineup into one vector per side.
off_mean = layers.GlobalAveragePooling1D(name='off_mean')(offense_emb)
def_mean = layers.GlobalAveragePooling1D(name='def_mean')(defense_emb)

concat = layers.Concatenate(name='concat')([off_mean, def_mean])
hidden = layers.Dense(128, activation='relu', name='hidden')(concat)

# Two heads on the shared hidden layer: a 14-way softmax over outcomes and a
# sigmoid for the binary second-chance flag.
main_out = layers.Dense(14, activation='softmax', name='main_out')(hidden)
second_chance_out = layers.Dense(1, activation='sigmoid', name='second_chance_out')(hidden)

optimizer = Adam(learning_rate=0.0001)

model = Model(inputs=input_players, outputs=[main_out, second_chance_out])
model.compile(
    optimizer=optimizer,
    # Loss/metric keys must match the output layer names above.
    loss={
        'main_out': 'categorical_crossentropy',
        'second_chance_out': 'binary_crossentropy'
    },
    metrics={
        'main_out': 'accuracy',
        'second_chance_out': 'accuracy'
    }
)

model.summary()

##############################################
# Utility Functions for tf.data Pipeline
##############################################
def shard_generator(filepath, main_col, sc_col, mapping,
                    categories=None, num_classes=14, player_cols=None):
    """Yield ``(players, (outcome_one_hot, second_chance))`` samples from one CSV.

    The file is read and preprocessed in full each time the generator is
    started; samples are then yielded one row at a time.

    Args:
        filepath: Path of the CSV file to read.
        main_col: Name of the categorical outcome column.
        sc_col: Name of the second-chance column (cast to int, then float32).
        mapping: Dict from raw player ID to embedding index.
        categories: Optional ordered list of outcome categories. Position in
            this list is the class index. When omitted, the sorted categories
            found in *this file* are used, so two files that do not contain
            the same set of outcomes get different label encodings; pass one
            shared list for every split to keep them aligned.
        num_classes: Width of the one-hot outcome vector.
        player_cols: Player-ID column names. Defaults to the module-level
            ``player_columns``.

    Yields:
        ``(X, (y_main, y_sc))`` where ``X`` is an int32 vector of mapped
        player indices, ``y_main`` a float32 one-hot vector of length
        ``num_classes`` and ``y_sc`` a float32 vector of length 1.

    Raises:
        ValueError: If there are more categories than ``num_classes``, if an
            outcome value is not in ``categories``, or if a player ID has no
            entry in ``mapping``.
    """
    if player_cols is None:
        player_cols = player_columns
    player_cols = list(player_cols)

    # Only the player, outcome and second-chance columns feed the model, so
    # nothing else is loaded.
    df = pd.read_csv(filepath, usecols=player_cols + [main_col, sc_col])

    # Drop rows with NaNs in player columns
    df = df.dropna(subset=player_cols)
    num_samples = len(df)

    # One-hot the outcome column
    if categories is None:
        categories = sorted(df[main_col].unique())
    categories = list(categories)
    if len(categories) > num_classes:
        raise ValueError(
            f"{len(categories)} outcome categories found in '{filepath}' "
            f"but num_classes is {num_classes}."
        )
    cat_to_idx = {cat: i for i, cat in enumerate(categories)}
    class_idx = df[main_col].map(cat_to_idx)
    if class_idx.isna().any():
        raise ValueError(
            f"Column '{main_col}' in '{filepath}' contains values that are "
            f"not in the category list."
        )
    y_main = np.zeros((num_samples, num_classes), dtype='float32')
    y_main[np.arange(num_samples), class_idx.to_numpy(dtype=np.int64)] = 1.0

    # float32 matches the element type the tf.data pipeline declares.
    y_sc = df[sc_col].astype(int).to_numpy(dtype='float32').reshape(-1, 1)

    # Convert player IDs to int and map them to embedding indices
    mapped = pd.DataFrame({
        col: pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int).map(mapping)
        for col in player_cols
    })
    # An ID missing from the mapping comes back as NaN; casting that to int32
    # would silently produce an invalid embedding index.
    unmapped_rows = int(mapped.isna().any(axis=1).sum())
    if unmapped_rows:
        raise ValueError(
            f"{unmapped_rows} row(s) in '{filepath}' contain player IDs "
            f"that are missing from the mapping."
        )
    X = mapped.to_numpy(dtype=np.int32)

    for features, outcome, second_chance in zip(X, y_main, y_sc):
        yield features, (outcome, second_chance)

def create_dataset(filepath, main_col, sc_col, batch_size, mapping, shuffle_buffer=10000):
    """Build an endlessly repeating, shuffled, batched ``tf.data.Dataset`` from one CSV.

    Samples come from ``shard_generator``; each element is
    ``(players, (outcome_one_hot, second_chance))``.

    Args:
        filepath: CSV file to stream samples from.
        main_col: Name of the outcome column.
        sc_col: Name of the second-chance column.
        batch_size: Number of samples per batch.
        mapping: Dict from raw player ID to embedding index.
        shuffle_buffer: Size of the shuffle buffer.

    Returns:
        A ``tf.data.Dataset`` that never ends. Callers must bound it with
        ``steps_per_epoch`` / ``validation_steps`` / ``steps``.
    """
    # output_signature replaces the deprecated output_types / output_shapes
    # arguments of from_generator.
    element_spec = (
        tf.TensorSpec(shape=(seq_len,), dtype=tf.int32),
        (
            tf.TensorSpec(shape=(14,), dtype=tf.float32),
            tf.TensorSpec(shape=(1,), dtype=tf.float32),
        ),
    )
    ds = tf.data.Dataset.from_generator(
        lambda: shard_generator(filepath, main_col, sc_col, mapping),
        output_signature=element_spec,
    )

    ds = ds.shuffle(shuffle_buffer)
    ds = ds.batch(batch_size)

    # Apply .repeat() so the dataset doesn't run out of data and end training prematurely
    ds = ds.repeat()

    # prefetch goes last so that it overlaps the whole upstream pipeline
    # (including the restart at each repeat) with model execution.
    ds = ds.prefetch(tf.data.AUTOTUNE)

    return ds

##############################################
# Create Datasets using mapping
##############################################
def _count_csv_rows(path):
    """Return the number of lines in ``path`` minus one for the header row."""
    # The context manager closes the file handle once counting is done.
    with open(path) as handle:
        return sum(1 for _ in handle) - 1


train_ds = create_dataset(train_path, main_out_column, second_chance_column, batch_size, player_to_index)
val_ds = create_dataset(val_path, main_out_column, second_chance_column, batch_size, player_to_index)
test_ds = create_dataset(test_path, main_out_column, second_chance_column, batch_size, player_to_index)

##############################################
# Training with tf.data
##############################################
# The datasets repeat forever, so steps_per_epoch / validation_steps tell
# Keras where each epoch ends. The counts are line counts of the CSVs; rows
# that shard_generator drops for missing players are still counted, so the
# step numbers are an estimate.
# max(1, ...) keeps a split smaller than one batch from yielding zero steps.
N_train = _count_csv_rows(train_path)
steps_per_epoch = max(1, N_train // batch_size)

N_val = _count_csv_rows(val_path)
validation_steps = max(1, N_val // batch_size)

model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps
)

##############################################
# Evaluation
##############################################
N_test = _count_csv_rows(test_path)
test_steps = max(1, N_test // batch_size)
model.evaluate(test_ds, steps=test_steps)

Number of unique players: 748
Example mapping: [(2544, 0), (101108, 1), (200768, 2), (200782, 3), (201142, 4), (201143, 5), (201144, 6), (201145, 7), (201152, 8), (201565, 9)]


Epoch 1/10
4891/4891 ━━━━━━━━━━━━━━━━━━━━ 25s 5ms/step - loss: 2.6568 - main_out_accuracy: 0.3521 - main_out_loss: 2.2176 - second_chance_out_accuracy: 0.8846 - second_chance_out_loss: 0.4391 - val_loss: 2.4004 - val_main_out_accuracy: 0.3608 - val_main_out_loss: 2.0471 - val_second_chance_out_accuracy: 0.8874 - val_second_chance_out_loss: 0.3533
Epoch 2/10
4891/4891 ━━━━━━━━━━━━━━━━━━━━ 24s 5ms/step - loss: 2.4051 - main_out_accuracy: 0.3592 - main_out_loss: 2.0473 - second_chance_out_accuracy: 0.8850 - second_chance_out_loss: 0.3571 - val_loss: 2.3841 - val_main_out_accuracy: 0.3609 - val_main_out_loss: 2.0328 - val_second_chance_out_accuracy: 0.8874 - val_second_chance_out_loss: 0.3513
Epoch 3/10
4891/4891 ━━━━━━━━━━━━━━━━━━━━ 24s 5ms/step - loss: 2.3899 - main_out_accuracy: 0.3591 - main_out_loss: 2.0345 - second_chance_out_accuracy: 0.8853 - second_chance_out_loss: 0.3552 - val_lo

KeyboardInterrupt: 