<a href="https://colab.research.google.com/github/cjannun/NBA-Shot-Prediction-Model/blob/main/NN_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# imports
from google.colab import files
import tarfile
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, Input
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# get shot data (all_seasons_final.tgz)
# Uploads through google.colab's files.upload(); `uploaded` keeps its return
# value. The extraction cell further down opens the archive by its filename.
uploaded = files.upload()

Saving all_seasons_final.tgz to all_seasons_final.tgz


In [None]:
# extract data
# extracted_dir = './extracted'

# if not os.path.exists(extracted_dir):
#     os.makedirs(extracted_dir)

# for filename in uploaded.keys():
#     with tarfile.open(filename, 'r:gz') as tar:
#         tar.extractall(extracted_dir)

In [None]:
# # concat all files
# csv_files = [os.path.join(extracted_dir, f) for f in os.listdir(extracted_dir) if f.endswith('.csv')]
# combined_df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
# combined_df.to_csv('all_seasons.csv', index=False)

# # compress
# with tarfile.open('all_seasons.tgz', 'w:gz') as tar:
#     tar.add('all_seasons.csv', arcname=os.path.basename('all_seasons.csv'))

In [None]:
# extract all_seasons
# The archive is user-uploaded, so prefer the 'data' extraction filter when
# this Python version provides it: it refuses absolute paths, '..' traversal
# and special files. Older interpreters fall back to the plain extraction.
with tarfile.open('all_seasons_final.tgz', 'r:gz') as tar:
  if hasattr(tarfile, 'data_filter'):
    tar.extractall(filter='data')
  else:
    tar.extractall()

In [None]:
# Load the extracted CSV into a DataFrame and print its (rows, columns) shape
combined_df = pd.read_csv('all_seasons.csv')
print(combined_df.shape)

(2284599, 19)


In [None]:
# Year to start using data from (data available from 2013-2023 seasons)
start_year = 2017
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (avoids pandas' SettingWithCopyWarning).
combined_df = combined_df[combined_df['year'] >= start_year].copy()

# remove units, etc from data
# 'px' is a literal suffix, so regex matching is switched off explicitly.
for coord in ('x', 'y'):
    combined_df[coord] = combined_df[coord].str.replace('px', '', regex=False).astype(int)

# Keep only rows whose distance contains the 'ft' unit (missing values are
# dropped via na=False), then strip the unit and convert to int.
combined_df = combined_df[combined_df['distance'].str.contains('ft', na=False)].copy()
combined_df['distance'] = combined_df['distance'].str.replace('ft', '', regex=False).astype(int)

# Drop rows whose score columns hold one of these text values instead of a number
bad_values = ['leads', 'trails', 'tied']
combined_df = combined_df[
    ~combined_df['winner_score'].isin(bad_values) &
    ~combined_df['loser_score'].isin(bad_values)
].copy()

In [None]:
# Change types to help model and use less RAM
integer_columns = [
    'winner_score', 'loser_score', 'quarter',
    'x', 'y', 'distance',
    'year', 'month', 'day',
]
for column in integer_columns:
    # errors='raise' stops the cell on any value that still is not numeric;
    # downcast='integer' picks the smallest integer dtype that fits.
    combined_df[column] = pd.to_numeric(combined_df[column], errors='raise', downcast='integer')

# convert to seconds
# Parse the 'MM:SS.f' clock string once and use the vectorized .dt accessors
# instead of calling a Python lambda per row. The fractional part of the
# second is dropped, exactly as the previous float -> int cast did.
clock = pd.to_datetime(combined_df['time_remaining'], format='%M:%S.%f')
combined_df['time_remaining'] = (clock.dt.minute * 60 + clock.dt.second).astype(int)

# print(combined_df.memory_usage())

In [None]:
# create 2 new columns: shooter_score, opponent_score
# (vectorized version)
shooter_won = combined_df['team'] == combined_df['winner']

combined_df['shooter_score'] = combined_df['winner_score'].where(
    shooter_won,
    # otherwise set to loser's score
    combined_df['loser_score']
)

combined_df['opponent_score'] = combined_df['loser_score'].where(
    shooter_won,
    # otherwise set to winner's score
    combined_df['winner_score']
)

# Row filters, combined into a single mask:
#   - drop score values that are text ('leads' / 'trails' / 'tied')
#   - keep only 'made' / 'missed' outcomes   (~9500 rows removed per the original notes)
#   - keep only 2- and 3-point attempts       (~9000 rows removed per the original notes)
bad_values = ['leads', 'trails', 'tied']
good_outcomes = ['made', 'missed']
good_attempts = ['2-pointer', '3-pointer']
keep_rows = (
    ~combined_df['shooter_score'].isin(bad_values)
    & ~combined_df['opponent_score'].isin(bad_values)
    & combined_df['outcome'].isin(good_outcomes)
    & combined_df['attempt'].isin(good_attempts)
)
# .copy() so the assignments below modify an independent frame rather than a
# slice of combined_df (avoids SettingWithCopyWarning).
filtered_df = combined_df[keep_rows].copy()

# Convert outcome to a small integer flag: 1 = made, 0 = missed
filtered_df['outcome'] = pd.to_numeric(
    filtered_df['outcome'].map({'made': 1, 'missed': 0}),
    errors='raise', downcast='integer'
)

filtered_df['shooter_score'] = pd.to_numeric(filtered_df['shooter_score'], errors='raise', downcast='integer')
filtered_df['opponent_score'] = pd.to_numeric(filtered_df['opponent_score'], errors='raise', downcast='integer')

# We want the score BEFORE the shot is made.
# 'outcome' is already 0/1 at this point, so comparing it with the string
# 'made' (as this cell used to) never matched and no score was adjusted.
# Multiplying the shot's point value by the 0/1 outcome subtracts the points
# for made shots only.
shot_points = filtered_df['attempt'].map({
    '2-pointer': 2,
    '3-pointer': 3,
})
filtered_df['shooter_score'] = filtered_df['shooter_score'] - shot_points * filtered_df['outcome']

# Replace y with its absolute offset from 240.
# NOTE(review): 240 is presumably the court's centre line in px - confirm.
filtered_df['y'] = (filtered_df['y'] - 240).abs()



In [None]:
# encoding
# add a '3-pointer' flag derived from 'attempt': 1 for 3-pointers, 0 for 2-pointers
three_point_flag = {
    '3-pointer': 1,
    '2-pointer': 0,
}
filtered_df.loc[:, '3-pointer'] = filtered_df['attempt'].map(three_point_flag)

# shots per player, and the 99th-percentile shot count
# (only referenced by the disabled experiments below)
shot_counts = filtered_df['shots_by'].value_counts()
threshold = shot_counts.quantile(.99)

# if shot_counts for a player is less than threshold, change their name to other
# filtered_df['shots_by'] = filtered_df['shots_by'].where(
#     filtered_df['shots_by'].map(shot_counts) >= threshold,
#     'Other'
# )

# if shot_counts for a player is less than threshold, remove from dataset
# filtered_df = filtered_df[filtered_df['shots_by'].map(shot_counts) >= threshold]

# apply one-hot on the names of shooting players
# one_hotted_df = pd.get_dummies(filtered_df, columns=['shots_by'])
# shooter_columns = [
#     col for col in one_hotted_df.columns if col.startswith('shots_by_')
# ]

# apply one-hot on team name, then collect the generated 'team_*' column names
one_hotted_df = pd.get_dummies(filtered_df, columns=['team'])
team_columns = list(
    filter(lambda name: name.startswith('team_'), one_hotted_df.columns)
)

In [None]:
# select columns and split data into train/test
# x = one_hotted_df[['x', 'y', '3-pointer', 'distance', 'time_remaining'] + shooter_columns]
feature_columns = ['x', 'y'] + team_columns
x = one_hotted_df[feature_columns]
y = one_hotted_df['outcome']

# 80/20 split with a fixed random_state so the partition is repeatable
split_parts = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = split_parts

print(f"x_train shape: {x_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

x_train shape: (1206456, 32)
x_test shape: (301614, 32)
y_train shape: (1206456,)
y_test shape: (301614,)


In [None]:
# Scale data to help the logistic regression baseline converge.
# The scaler is fit on the training split only and then applied unchanged to
# the test split.
scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
# Unscaled alternative (disabled):
# x_train_scaled = x_train
# x_test_scaled = x_test

In [None]:
# create a logistic regression model (default hyperparameters) as a baseline
baseline_model = LogisticRegression()
baseline_model.fit(x_train_scaled, y_train)

In [None]:
# Evaluate accuracy of the baseline on the held-out test split
y_pred = baseline_model.predict(x_test_scaled)

# Overall accuracy first, then the per-class precision / recall / f1 report
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.6075
              precision    recall  f1-score   support

           0       0.64      0.65      0.64    163568
           1       0.57      0.56      0.57    138046

    accuracy                           0.61    301614
   macro avg       0.60      0.60      0.60    301614
weighted avg       0.61      0.61      0.61    301614



In [None]:
# x = one_hotted_df[['x', 'y', 'distance', 'time_remaining']]
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
# # no scale
# x_train_scaled = x_train
# x_test_scaled = x_test

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat from run to run (same seed value as the
# train/test split). GPU kernels may still add small nondeterminism.
tf.keras.utils.set_random_seed(1)

# Build the neural network
model = Sequential([
    # Input layer: one input per scaled feature column
    Input(shape=(x_train_scaled.shape[1],)),

    # Hidden block 1
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    # Hidden block 2
    layers.Dense(32, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    # Binary classification output layer
    layers.Dense(1, activation='sigmoid'),
])

# Default learning rate is 1e-3 (.001)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# validation_split=0.2 holds out part of the training data for the per-epoch
# val_* metrics; the test split is only used in the evaluation below.
model.fit(x_train_scaled, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate accuracy
test_loss, test_accuracy = model.evaluate(x_test_scaled, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Epoch 1/5
[1m30162/30162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 3ms/step - accuracy: 0.6068 - loss: 0.6673 - val_accuracy: 0.6293 - val_loss: 0.6449
Epoch 2/5
[1m30162/30162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 3ms/step - accuracy: 0.6294 - loss: 0.6495 - val_accuracy: 0.6305 - val_loss: 0.6437
Epoch 3/5
[1m30162/30162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 3ms/step - accuracy: 0.6297 - loss: 0.6486 - val_accuracy: 0.6302 - val_loss: 0.6435
Epoch 4/5
[1m30162/30162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 4ms/step - accuracy: 0.6296 - loss: 0.6485 - val_accuracy: 0.6302 - val_loss: 0.6433
Epoch 5/5
[1m30162/30162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 3ms/step - accuracy: 0.6295 - loss: 0.6484 - val_accuracy: 0.6308 - val_loss: 0.6430
[1m9426/9426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step - accuracy: 0.6340 - loss: 0.6410
Test Accuracy: 63.40%
