In [3]:
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
import os
import pandas as pd
import numpy as np
from tensorflow.keras.optimizers import Adam, SGD

##############################################
# Parameters & Setup
##############################################
# --- Model / pipeline sizes ---
seq_len = 10          # one sample = the 10 entries of player_columns
embedding_dim = 8
batch_size = 64
data_dir = 'split_data_parquet'

# --- Column layout of the shard files ---
# Five offensive slots followed by five defensive slots, in that order.
player_columns = (
    [f"OFF_PLAYER{slot}_ID" for slot in range(1, 6)]
    + [f"DEF_PLAYER{slot}_ID" for slot in range(1, 6)]
)

main_out_column = "OUTCOME"
second_chance_column = "SECOND_CHANCE"
unwanted_cols = [
    "SHOOTER_ID",
    "ASSISTER_ID",
    "BLOCKER_ID",
    "STEALER_ID",
    "REBOUNDER_ID",
    "TURNOVER_ID",
]


def _find_shards(prefix):
    """Return the sorted paths of the `<prefix>*.parquet` files directly inside data_dir."""
    return sorted(
        os.path.join(data_dir, name)
        for name in os.listdir(data_dir)
        if name.startswith(prefix) and name.endswith('.parquet')
    )


# Identify shard files, one list per split
train_files = _find_shards('train_')
val_files = _find_shards('val_')
test_files = _find_shards('test_')

##############################################
# Build Player ID Mapping
##############################################
def gather_unique_player_ids(files, player_cols):
    """
    Collect every distinct player ID found in the given parquet files.

    Only the columns named in `player_cols` are read. A row is ignored
    entirely when any of those columns is missing a value, so IDs that occur
    solely in incomplete rows are not returned.

    Args:
        files: iterable of parquet file paths.
        player_cols: list of column names holding player IDs.

    Returns:
        A set with the integer-cast IDs seen across all files.
    """
    seen = set()
    for shard_path in files:
        print(f"Scanning file for unique IDs: {shard_path}")
        # Read just the player columns and keep complete rows only.
        complete_rows = pd.read_parquet(shard_path, columns=player_cols).dropna(subset=player_cols)
        for name in player_cols:
            seen |= set(complete_rows[name].dropna().astype(int).unique())
    return seen

# The vocabulary is gathered from every split, so players that only appear in
# validation or test shards still receive an index.
all_files = train_files + val_files + test_files
all_unique_ids = gather_unique_player_ids(all_files, player_columns)

# Sorted order makes the ID -> index assignment deterministic across runs.
unique_players = np.sort(list(all_unique_ids))
player_to_index = dict(zip(unique_players, range(len(unique_players))))
v = len(unique_players)  # vocabulary size, used as the embedding's input_dim
print(f"Number of unique players: {v}")
print("Example mapping:", list(player_to_index.items())[:10])

##############################################
# Model Definition Using Embeddings (with updated v)
##############################################
# One row of `seq_len` player indices goes in; each index is looked up in a
# single embedding table shared by offense and defense.
input_players = Input(name='players_input', shape=(seq_len,), dtype='int32')
player_embedding = layers.Embedding(
    input_dim=v,
    output_dim=embedding_dim,
    name='player_embedding',
)(input_players)

# Positions 0-4 follow the OFF_* columns and 5-9 the DEF_* columns.
offense_emb = layers.Lambda(lambda emb: emb[:, :5, :], name='offense_slice')(player_embedding)
defense_emb = layers.Lambda(lambda emb: emb[:, 5:, :], name='defense_slice')(player_embedding)

# Collapse each lineup into one vector by averaging over its five players.
off_mean = layers.Lambda(lambda emb: tf.reduce_mean(emb, axis=1), name='off_mean')(offense_emb)
def_mean = layers.Lambda(lambda emb: tf.reduce_mean(emb, axis=1), name='def_mean')(defense_emb)

concat = layers.Concatenate(name='concat')([off_mean, def_mean])
hidden = layers.Dense(units=128, activation='relu', name='hidden')(concat)

# Two heads on the shared hidden layer: 14-way outcome and binary second chance.
main_out = layers.Dense(units=14, activation='softmax', name='main_out')(hidden)
second_chance_out = layers.Dense(units=1, activation='sigmoid', name='second_chance_out')(hidden)

# Custom learning rate; SGD with momentum is used here
# (Adam(learning_rate=0.0001) was the alternative tried).
optimizer = SGD(learning_rate=0.01, momentum=0.9)

model = Model(inputs=input_players, outputs=[main_out, second_chance_out])
model.compile(
    optimizer=optimizer,
    loss=dict(
        main_out='categorical_crossentropy',
        second_chance_out='binary_crossentropy',
    ),
    metrics=dict(
        main_out='accuracy',
        second_chance_out='accuracy',
    ),
)

model.summary()

##############################################
# Utility Functions for tf.data Pipeline
##############################################
def _collect_outcome_categories(file_list, main_col):
    """Return the sorted distinct non-null values of `main_col` across every shard in `file_list`."""
    seen = set()
    for fpath in file_list:
        # Only the outcome column is needed here, so skip loading the rest of the shard.
        outcomes = pd.read_parquet(fpath, columns=[main_col])[main_col]
        seen.update(outcomes.dropna().unique())
    return sorted(seen)


def shard_generator(file_list, main_col, sc_col, mapping, categories=None, num_classes=14):
    """
    Yields individual samples (X, (y_main, y_sc)) from shard files.

    Each sample is:
      * X      - int32 array with one mapped index per entry of the
                 module-level `player_columns`;
      * y_main - float32 one-hot vector of length `num_classes`;
      * y_sc   - float32 array of shape (1,) holding the `sc_col` value.

    The outcome -> class-index table is built ONCE per call and shared by all
    shards. Building it per shard (from that shard's own distinct values)
    would assign different indices to the same outcome whenever a shard
    happens to lack one of the categories.

    Args:
        file_list: iterable of parquet shard paths.
        main_col: name of the categorical outcome column.
        sc_col: name of the binary second-chance column.
        mapping: dict from raw player ID to embedding index.
        categories: optional ordered list of outcome values; position i
            becomes class i. When None, the sorted union of `main_col` values
            over `file_list` is used, which costs one extra single-column read
            of each shard every time the generator is started. Pass the same
            list for every split to guarantee identical labels across splits.
        num_classes: width of the one-hot target (default 14).

    Raises:
        ValueError: if there are more categories than `num_classes`, if a row
            has a missing/unknown outcome, or if a row contains a player ID
            that is non-numeric or not present in `mapping`.
    """
    file_list = list(file_list)  # allow two passes even if an iterator was given

    if categories is None:
        categories = _collect_outcome_categories(file_list, main_col)
    else:
        categories = list(categories)
    if len(categories) > num_classes:
        raise ValueError(
            f"Found {len(categories)} distinct values in '{main_col}' "
            f"but only {num_classes} output classes are available"
        )
    cat_to_idx = {cat: i for i, cat in enumerate(categories)}

    # Only these columns contribute to the yielded samples.
    needed_cols = list(player_columns) + [main_col, sc_col]

    for fpath in file_list:
        print(f"Loading shard: {fpath}")
        df = pd.read_parquet(fpath, columns=needed_cols)

        # Drop rows with NaNs in player columns
        df = df.dropna(subset=player_columns)
        num_samples = len(df)
        if num_samples == 0:
            continue

        # One-hot the outcome column using the shared class table
        try:
            class_idx = np.fromiter(
                (cat_to_idx[val] for val in df[main_col]),
                dtype=np.int64,
                count=num_samples,
            )
        except KeyError as exc:
            raise ValueError(
                f"{fpath}: value {exc.args[0]!r} in '{main_col}' is missing or not a known category"
            ) from exc
        y_main = np.zeros((num_samples, num_classes), dtype='float32')
        y_main[np.arange(num_samples), class_idx] = 1.0

        # float32 matches the dtype the tf.data pipeline declares for this target
        y_sc = df[sc_col].astype(int).to_numpy(dtype='float32').reshape(-1, 1)

        # Convert players to int, then map raw IDs to embedding indices
        numeric_ids = df[player_columns].apply(pd.to_numeric, errors='coerce')
        mapped = numeric_ids.fillna(0).astype(np.int64).apply(lambda col: col.map(mapping))

        # A NaN here would otherwise be cast to an arbitrary int32 and used as
        # an embedding index, so fail loudly instead.
        invalid = numeric_ids.isna() | mapped.isna()
        if invalid.to_numpy().any():
            bad_rows = int(invalid.any(axis=1).sum())
            raise ValueError(
                f"{fpath}: {bad_rows} row(s) contain player IDs that are "
                f"non-numeric or absent from `mapping`"
            )

        X = mapped.to_numpy(dtype=np.int32)

        # Yield each sample
        for row in range(num_samples):
            yield X[row], (y_main[row], y_sc[row])

def create_dataset(file_list, main_col, sc_col, batch_size, mapping, shuffle_buffer=10000):
    """
    Build a batched, prefetched tf.data pipeline over the samples of `shard_generator`.

    Args:
        file_list: parquet shard paths to stream from.
        main_col: name of the categorical outcome column.
        sc_col: name of the binary second-chance column.
        batch_size: number of samples per batch.
        mapping: dict from raw player ID to embedding index.
        shuffle_buffer: size of the shuffle buffer. Pass 0 or None to skip
            shuffling (e.g. for validation/test data, where order is irrelevant).

    Returns:
        A tf.data.Dataset yielding (players, (main_target, second_chance_target))
        batches with element shapes (seq_len,), (14,) and (1,).
    """
    def sample_stream():
        # from_generator needs a zero-argument callable so that a fresh
        # generator can be created each time the dataset is iterated.
        return shard_generator(file_list, main_col, sc_col, mapping)

    # output_signature replaces the deprecated output_types/output_shapes pair.
    ds = tf.data.Dataset.from_generator(
        sample_stream,
        output_signature=(
            tf.TensorSpec(shape=(seq_len,), dtype=tf.int32),
            (
                tf.TensorSpec(shape=(14,), dtype=tf.float32),
                tf.TensorSpec(shape=(1,), dtype=tf.float32),
            ),
        ),
    )

    if shuffle_buffer:
        ds = ds.shuffle(shuffle_buffer)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

##############################################
# Create Datasets using mapping
##############################################
# All three splits go through the same pipeline settings.
train_ds, val_ds, test_ds = [
    create_dataset(split_files, main_out_column, second_chance_column, batch_size, player_to_index)
    for split_files in (train_files, val_files, test_files)
]

##############################################
# Training with tf.data
##############################################
model.fit(train_ds, epochs=2, validation_data=val_ds)

##############################################
# Evaluation
##############################################
# Left as the final expression so the notebook displays the returned metrics.
model.evaluate(test_ds)

Scanning file for unique IDs: split_data_parquet/train_000.parquet
Scanning file for unique IDs: split_data_parquet/train_001.parquet
Scanning file for unique IDs: split_data_parquet/train_002.parquet
Scanning file for unique IDs: split_data_parquet/train_003.parquet
Scanning file for unique IDs: split_data_parquet/train_004.parquet
Scanning file for unique IDs: split_data_parquet/train_005.parquet
Scanning file for unique IDs: split_data_parquet/train_006.parquet
Scanning file for unique IDs: split_data_parquet/train_007.parquet
Scanning file for unique IDs: split_data_parquet/train_008.parquet
Scanning file for unique IDs: split_data_parquet/train_009.parquet
Scanning file for unique IDs: split_data_parquet/val_000.parquet
Scanning file for unique IDs: split_data_parquet/val_001.parquet
Scanning file for unique IDs: split_data_parquet/test_000.parquet
Scanning file for unique IDs: split_data_parquet/test_001.parquet
Number of unique players: 1543
Example mapping: [(1713, 0), (2199, 1

Epoch 1/2
Loading shard: split_data_parquet/train_000.parquet
   2052/Unknown [1m10s[0m 4ms/step - loss: 2.6201 - main_out_accuracy: 0.3676 - main_out_loss: 2.1997 - second_chance_out_accuracy: 0.8579 - second_chance_out_loss: 0.4204Loading shard: split_data_parquet/train_001.parquet
   4261/Unknown [1m19s[0m 4ms/step - loss: 2.5991 - main_out_accuracy: 0.3676 - main_out_loss: 2.1856 - second_chance_out_accuracy: 0.8589 - second_chance_out_loss: 0.4135Loading shard: split_data_parquet/train_002.parquet
   6478/Unknown [1m28s[0m 4ms/step - loss: 2.5922 - main_out_accuracy: 0.3672 - main_out_loss: 2.1812 - second_chance_out_accuracy: 0.8592 - second_chance_out_loss: 0.4111Loading shard: split_data_parquet/train_003.parquet
   8689/Unknown [1m37s[0m 4ms/step - loss: 2.5886 - main_out_accuracy: 0.3669 - main_out_loss: 2.1789 - second_chance_out_accuracy: 0.8594 - second_chance_out_loss: 0.4097Loading shard: split_data_parquet/train_004.parquet
  10901/Unknown [1m47s[0m 4ms/step -

2024-12-09 14:19:33.495162: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


Loading shard: split_data_parquet/val_001.parquet
[1m22126/22126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 5ms/step - loss: 2.5792 - main_out_accuracy: 0.3666 - main_out_loss: 2.1722 - second_chance_out_accuracy: 0.8598 - second_chance_out_loss: 0.4070 - val_loss: 2.5569 - val_main_out_accuracy: 0.3679 - val_main_out_loss: 2.1478 - val_second_chance_out_accuracy: 0.8582 - val_second_chance_out_loss: 0.4082
Epoch 2/2


2024-12-09 14:19:44.763712: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


Loading shard: split_data_parquet/train_000.parquet
[1m 2055/22126[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:24[0m 4ms/step - loss: 2.5491 - main_out_accuracy: 0.3699 - main_out_loss: 2.1435 - second_chance_out_accuracy: 0.8597 - second_chance_out_loss: 0.4056Loading shard: split_data_parquet/train_001.parquet
[1m 4268/22126[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1:15[0m 4ms/step - loss: 2.5503 - main_out_accuracy: 0.3690 - main_out_loss: 2.1449 - second_chance_out_accuracy: 0.8598 - second_chance_out_loss: 0.4054Loading shard: split_data_parquet/train_002.parquet
[1m 6473/22126[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1:06[0m 4ms/step - loss: 2.5517 - main_out_accuracy: 0.3682 - main_out_loss: 2.1463 - second_chance_out_accuracy: 0.8598 - second_chance_out_loss: 0.4053Loading shard: split_data_parquet/train_003.parquet
[1m 8686/22126[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m57s[0m 4ms/step - loss: 2.5523 - main_out_accuracy: 0.3677 - main_out_loss: 2



Loading shard: split_data_parquet/val_001.parquet
[1m22126/22126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 5ms/step - loss: 2.5523 - main_out_accuracy: 0.3669 - main_out_loss: 2.1475 - second_chance_out_accuracy: 0.8600 - second_chance_out_loss: 0.4048 - val_loss: 2.5511 - val_main_out_accuracy: 0.3679 - val_main_out_loss: 2.1426 - val_second_chance_out_accuracy: 0.8582 - val_second_chance_out_loss: 0.4075


2024-12-09 14:21:32.381126: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


Loading shard: split_data_parquet/test_000.parquet
   1218/Unknown [1m6s[0m 5ms/step - loss: 2.5522 - main_out_accuracy: 0.3633 - main_out_loss: 2.1478 - second_chance_out_accuracy: 0.8600 - second_chance_out_loss: 0.4045Loading shard: split_data_parquet/test_001.parquet
[1m2766/2766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - loss: 2.5500 - main_out_accuracy: 0.3653 - main_out_loss: 2.1452 - second_chance_out_accuracy: 0.8598 - second_chance_out_loss: 0.4048




[2.5476553440093994,
 2.142456293106079,
 0.40430155396461487,
 0.3672352135181427,
 0.8599838614463806]

In [4]:
import pandas as pd

# Requires `model` (trained) and `player_to_index` from the training cell.
# Row i of the embedding layer's weight matrix is the vector for index i.
embedding_matrix = model.get_layer('player_embedding').get_weights()[0]

# Header: ['player_id', 'embed_0', 'embed_1', ..., 'embed_{embedding_dim-1}']
columns = ['player_id', *(f'embed_{d}' for d in range(embedding_matrix.shape[1]))]

# One record per player: the raw ID followed by its embedding components.
data = [[p, *embedding_matrix[i].tolist()] for p, i in player_to_index.items()]

df = pd.DataFrame(data, columns=columns)

# Save to a CSV file
output_csv = 'player_embeddings.csv'
df.to_csv(output_csv, index=False)

print(f"Saved player embeddings to {output_csv}")

Saved player embeddings to player_embeddings.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from nba_api.stats.static import players
from adjustText import adjust_text

# Read back the embeddings written by the export cell
df = pd.read_csv('player_embeddings.csv')  # Ensure the CSV path is correct

player_ids = df['player_id'].to_numpy()
# Every column whose name starts with 'embed_' is an embedding component
embedding_cols = df.columns[df.columns.str.startswith('embed_')].tolist()
vectors = df[embedding_cols].to_numpy()  # one row per player

# Project the embeddings to 2-D; the fixed random_state keeps the layout repeatable
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
tsne_coords = tsne.fit_transform(vectors)

x_coords, y_coords = tsne_coords[:, 0], tsne_coords[:, 1]

# Name lookup from nba_api; only useful if player_id matches nba_api ids
all_nba_players = players.get_players()
id_to_name = dict((entry['id'], entry['full_name']) for entry in all_nba_players)

fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(x_coords, y_coords, s=20)

# Label only the first few points (swap in any other selection logic here).
# The text objects are kept so adjust_text can reposition them afterwards.
label_count = 10
texts = [
    ax.text(x_coords[i], y_coords[i], id_to_name.get(pid, str(pid)), fontsize=9)
    for i, pid in enumerate(player_ids[:label_count])
]

ax.set_title("Player Embeddings (t-SNE 2D Projection)")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
ax.grid(True)

# Nudge the labels apart so they do not overlap
adjust_text(
    texts,
    x=x_coords[:label_count],
    y=y_coords[:label_count],
    arrowprops=dict(arrowstyle='->', color='red', lw=0.5),
)

plt.show()