In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import sys
import os

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers
from sklearn.decomposition import PCA

Notes:
* Going to take the top 75 scoring players in the league
    - in the 1995-96 season, there were only 92 players in the league
    - the lowest-ranked scorer to win the MVP was Steve Nash in 2005, when he averaged 15.5 PPG (63rd in the league)
* csv files are in the directory of the 2nd year in the season. ex. the 95-96 season will be in the 1996 directory
* in the directory for the year, there is a players.csv file and a teams.csv file
    - the teams.csv file is so that I can associate a team record with the players
    - advanced data only goes back to the 1980 season
* the mvp.csv dataset is in the outer data dir and has the name of each mvp winner since the 65-66 season


**GET DATA SAMPLE**

In [2]:
# load the standard per-player stats for one season
def get_data(year):
    """Read ``data/<year>/players.csv`` and keep players that appear on exactly one row.

    Parameters
    ----------
    year : str
        Name of the season directory under ``data/``.

    Returns
    -------
    pandas.DataFrame
        The rows of the file whose ``Player`` value occurs once; every player
        that has more than one row is removed entirely. Original index labels
        are kept.
    """
    season_stats = pd.read_csv("/".join(("data", year, "players.csv")))
    return season_stats.groupby("Player").filter(lambda grp: grp.shape[0] == 1)

In [3]:
# load the advanced per-player stats for one season
def get_advanced_data(year):
    """Read ``data/<year>/advanced.csv`` and return the advanced metrics per player.

    Players that occupy more than one row in the file are removed entirely,
    then the frame is narrowed to ``Player`` plus the advanced-stat columns
    listed below.

    Parameters
    ----------
    year : str
        Name of the season directory under ``data/``.

    Returns
    -------
    pandas.DataFrame
        One row per remaining player, with exactly the columns in ``keep_cols``.
    """
    advanced_stats = pd.read_csv("/".join(("data", year, "advanced.csv")))
    single_row_players = advanced_stats.groupby("Player").filter(lambda grp: grp.shape[0] == 1)

    keep_cols = [
        "Player", "PER", "TS%", "3PAr", "FTr", "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%",
        "TOV%", "USG%", "OWS", "DWS", "WS", "WS/48", "OBPM", "DBPM", "BPM", "VORP",
    ]
    return single_row_players[keep_cols]

In [4]:
# get the team data and turn it into something usable

def get_team_data(year):
    """Build a per-team win/loss table from ``data/<year>/teams.csv``.

    After the ``Rk`` column is dropped, each row is expected to hold a ``Team``
    name followed by one "W-L" cell per opponent column. Every such cell on a
    row is summed into that team's total wins and losses.

    The team on row ``i`` is given the header of column ``i + 1`` as its
    abbreviation (assumes rows and opponent columns are listed in the same
    order -- TODO confirm whenever a new season file is added).

    Parameters
    ----------
    year : str
        Name of the season directory under ``data/``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Team``, ``Abbreviation``, ``Wins``, ``Losses``, ``Win%`` and
        ``Team_Rank``, sorted by ``Win%`` descending with ``Team_Rank`` running
        from 1 in that order.
    """
    teams_vs = pd.read_csv("data/" + year + "/teams.csv").drop(columns=['Rk'])

    # Collect one record per team and build the frame once at the end; this
    # avoids row-by-row DataFrame growth and gives the columns real numeric dtypes.
    records = []
    for position, (_, row) in enumerate(teams_vs.iterrows()):

        # extract wins and losses
        row_wins = 0
        row_losses = 0
        for record in row.drop(labels='Team'):
            # Only short strings are treated as "W-L" records; blank cells
            # (NaN) and any non-string value are skipped.
            if isinstance(record, str) and len(record) < 6:
                win_loss = record.split('-')
                row_wins += int(win_loss[0])
                row_losses += int(win_loss[1])

        # don't divide by 0 (a team with no losses counts as 100%)
        if row_losses == 0:
            win_pct = 1.0
        else:
            win_pct = row_wins / (row_losses + row_wins)

        records.append({
            'Team': row['Team'],
            # column 0 is 'Team', so the opponent column for this row sits at position + 1
            'Abbreviation': teams_vs.columns[position + 1],
            'Wins': row_wins,
            'Losses': row_losses,
            'Win%': win_pct,
        })

    teams = pd.DataFrame(records, columns=['Team', 'Abbreviation', 'Wins', 'Losses', 'Win%'])

    # get overall team rank in the nba
    teams = teams.sort_values(by='Win%', ascending=False).reset_index(drop=True)
    teams['Team_Rank'] = teams.index + 1

    return teams

# get_team_data("2025")

In [5]:
# reduce the raw player table to the 75 highest scorers and the columns we model on
def get_top_75_filtered(players):
    """Return the 75 highest-``PTS`` players restricted to the standard stat columns.

    Steps, in order:
      1. keep only the last row for each ``Player`` name;
      2. renumber the rows (so result index labels are positions in that
         de-duplicated frame, not in the original input);
      3. sort by ``PTS`` descending and take the first 75 rows;
      4. select the columns listed in ``stat_cols``.

    Parameters
    ----------
    players : pandas.DataFrame
        Must contain every column named in ``stat_cols``.

    Returns
    -------
    pandas.DataFrame
        At most 75 rows, ordered from highest to lowest ``PTS``.
    """
    stat_cols = [
        "Player", "Age", "Team", "Pos", "G", "GS", "MP", "FG", "FGA", "FG%", "3P", "3P%",
        "2P", "2PA", "2P%", "eFG%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL",
        "BLK", "TOV", "PF", "PTS",
    ]

    leading_scorers = (
        players
        .drop_duplicates(subset='Player', keep='last')
        .reset_index()
        .sort_values(by="PTS", ascending=False)
        .head(75)
        .copy()
    )
    return leading_scorers[stat_cols]


In [6]:
# join player, team and advanced stats into a single frame
def merge_data(top_75_filtered, teams, advanced):
    """Attach team records and advanced stats to the player rows.

    Two default (inner) merges are chained: players to ``teams`` on the
    player's ``Team`` value against the team ``Abbreviation``, then the result
    to ``advanced`` on ``Player``. Rows with no match on either key are
    therefore absent from the result. Because both inputs of the first merge
    carry a ``Team`` column, the output has ``Team_x`` (from the players) and
    ``Team_y`` (from the teams).
    """
    with_team_record = top_75_filtered.merge(teams, left_on='Team', right_on='Abbreviation')
    return with_team_record.merge(advanced, left_on='Player', right_on='Player')

In [7]:
# check that every player's team abbreviation exists in the team table
def verify_team_match(top_75_filtered, teams):
    """Report player rows whose ``Team_x`` is missing from ``teams['Abbreviation']``.

    Returns 0 when every row matches. Otherwise the unmatched rows are printed
    and 1 is returned.
    """
    has_team = top_75_filtered['Team_x'].isin(teams['Abbreviation'])
    if has_team.all():
        return 0

    print(top_75_filtered[~has_team])
    return 1
        

In [8]:
# check that every player row has a counterpart in the other player table
def verify_player_match(top_75_filtered, players):
    """Report rows whose ``Player`` is missing from ``players['Player']``.

    Returns 0 when every row matches. Otherwise the unmatched rows are printed
    and 1 is returned.
    """
    is_known = top_75_filtered['Player'].isin(players['Player'])
    if is_known.all():
        return 0

    print(top_75_filtered[~is_known])
    return 1

In [9]:
# drop every non-numeric column (player name, team name, position, ...)
def get_numerical_df(player_season_final_df):
    """Return a new frame containing only the numeric-dtype columns of the input."""
    return player_season_final_df.select_dtypes(include=['number'])

In [10]:
def get_mvp_by_year(year):
    """Look up the MVP for the season that ends in ``year``.

    ``data/mvps.csv`` is read on every call. A row matches when the first four
    characters of its ``Season`` value, read as an integer, equal
    ``int(year) - 1``.

    Parameters
    ----------
    year : str
        Ending year of the season, e.g. ``"1996"`` for 1995-96.

    Returns
    -------
    The ``Player`` value of the single matching row. When the number of
    matching rows is not exactly one, a message is printed and ``None`` is
    returned.
    """
    mvps = pd.read_csv("data/mvps.csv")
    season_start = mvps['Season'].str[:4].astype(int)
    winners = mvps.loc[season_start == int(year) - 1, 'Player'].values

    if len(winners) != 1:
        print("found ", len(winners), " MVPs in year " + year)
        return None
    return winners[0]

In [11]:
def get_final_df_by_year(year):
    """Assemble the numeric feature/label table for one season.

    Loads the standard, team and advanced data for ``year``, merges them,
    checks the merge, adds ``Year`` and ``MVP`` columns, and returns the
    numeric columns among ``metrics`` (``MVP`` is the last one).

    Parameters
    ----------
    year : str
        Name of the season directory under ``data/``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate, in the order produced by the merge.

    Raises
    ------
    SystemExit
        If a merge check fails (a message is printed first).
    """
    # get the data
    og_data = get_data(year)
    top_75 = get_top_75_filtered(og_data)
    team_data = get_team_data(year)
    advanced_data = get_advanced_data(year)

    # merge it
    player_team_advanced_combined = merge_data(top_75, team_data, advanced_data)

    # merge_data joins with inner merges, so a candidate without a matching team
    # or advanced-stats row is silently dropped and the two checks below (which
    # only look at the merged frame) cannot see it. Comparing row counts does
    # catch dropped, and duplicated, candidates.
    if len(player_team_advanced_combined) != len(top_75):
        print("Merge changed the number of candidates from ", len(top_75),
              " to ", len(player_team_advanced_combined), ". Aborting")
        sys.exit(1)

    # verify merge worked correctly
    if verify_team_match(player_team_advanced_combined, team_data) > 0:
        print("Team merge did not work correctly. Aborting")
        sys.exit(1)

    # verify merge worked correctly
    if verify_player_match(player_team_advanced_combined, advanced_data) > 0:
        print("Advanced stats merge did not work correctly. Aborting")
        sys.exit(1)

    # add year and MVP attribute
    # (get_mvp_by_year returns None when no single MVP is found; the comparison
    # is then False for every row, so MVP is all zeros)
    player_team_advanced_combined['Year'] = int(year)
    player_team_advanced_combined['MVP'] = (player_team_advanced_combined['Player'] == get_mvp_by_year(year)).astype(int)

    # using these metrics makes the model overfit like crazy because they correlate so much with the MVP
    # might want to minimize features through dimensionality reduction?
    # metrics = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3P%', '2P', '2PA',
    #            '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
    #            'BLK', 'TOV', 'PF', 'PTS', 'Wins', 'Losses', 'Win%', 'Team_Rank', 'PER',
    #            'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
    #            'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Year', 'MVP']

    # these are the metrics that correlate most with MVP wins according to the Data_Sci_Comparison analysis
    metrics = ["BPM", "WS/48", "WS", "OBPM", "OWS", "PTS", "DBPM", "FG", "FTA", "FT", "PER", "Win%", "USG%", "DWS", "Wins", "VORP", "Losses", "Team_Rank", "MVP"]

    # using these metrics and a batch size of 1 overfits the training data like crazy
    # metrics = ["PER", "WS", "WS/48", "PTS", "FG%", "AST", "Age", "MP", "TRB", "G", "MVP"]

    # get only numeric variables
    numerical_df = get_numerical_df(player_team_advanced_combined[metrics])

    # put in random order
    # shuffled_df = numerical_df.sample(frac=1, random_state=42).reset_index(drop=True)

    return numerical_df

In [12]:
def get_all_years_arr(skip_years=("2025",), first_year=1980):
    """Load every usable season under ``data/`` into one array.

    Parameters
    ----------
    skip_years : collection of str, optional
        Season directory names to leave out. Defaults to the current season,
        ``"2025"``.
    first_year : int, optional
        Seasons before this year are skipped (default 1980).

    Returns
    -------
    numpy.ndarray
        ``np.array`` over the list of per-season frames returned by
        ``get_final_df_by_year``, in ascending directory-name order.

    Raises
    ------
    ValueError
        If no season directory qualifies.
    """
    all_years_df = []

    # os.listdir returns entries in arbitrary order; sorting keeps the season
    # order (and so the seeded train/val/test split made from this array)
    # the same on every machine.
    for year_dir in sorted(os.listdir("data")):

        # only directories named by a year hold season data; mvps.csv and stray
        # entries such as .DS_Store or .ipynb_checkpoints are ignored
        if not year_dir.isdecimal() or not os.path.isdir(os.path.join("data", year_dir)):
            continue

        # skip excluded seasons and seasons before the first usable year
        if year_dir in skip_years or int(year_dir) < first_year:
            continue

        all_years_df.append(get_final_df_by_year(year_dir))

    if not all_years_df:
        raise ValueError("no season directories to load were found under data/")

    print(list(all_years_df[0].columns))

    return np.array(all_years_df)
       
    


**MAKE MODEL**

In [13]:
# load every training season into a single array
all_years_arr = get_all_years_arr()

# hold out 30% of the seasons, then cut that hold-out in half: validation / test
train_arr, temp_arr = train_test_split(all_years_arr, test_size=0.3, random_state=20)
val_arr, test_arr = train_test_split(temp_arr, test_size=0.5, random_state=20)

# the last column is the MVP label (y); every column before it is a feature (X)
X_train, y_train = train_arr[:, :, :-1], train_arr[:, :, -1]
X_val, y_val = val_arr[:, :, :-1], val_arr[:, :, -1]
X_test, y_test = test_arr[:, :, :-1], test_arr[:, :, -1]

['BPM', 'WS/48', 'WS', 'OBPM', 'OWS', 'PTS', 'DBPM', 'FG', 'FTA', 'FT', 'PER', 'Win%', 'USG%', 'DWS', 'Wins', 'VORP', 'Losses', 'Team_Rank', 'MVP']


In [14]:
tf.random.set_seed(42)

layers = tf.keras.layers
n_players, n_features = X_train.shape[1], X_train.shape[2]

model = tf.keras.Sequential()

# one sample is a whole season: n_players rows of n_features values
model.add(layers.InputLayer(shape=[n_players, n_features]))

# normalise over the last (feature) axis
model.add(layers.LayerNormalization(axis=-1))

# the same two Dense layers are applied to every player row independently
model.add(layers.TimeDistributed(layers.Dense(20, activation="relu")))
model.add(layers.TimeDistributed(layers.Dense(20, activation="relu")))

model.add(layers.Flatten())

# softmax with one output per player row
# NOTE(review): this Dense layer sees the flattened rows of all players, so each
# output unit is tied to a row position rather than to a player's own features.
model.add(layers.Dense(n_players, activation="softmax"))


In [15]:
# print the layer-by-layer summary of the model
model.summary()

In [16]:
# plain SGD with a fixed learning rate; categorical cross-entropy against the
# per-season label rows, tracking accuracy
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

**TRAIN MODEL**

In [17]:
# The batch size is read from the training array instead of the test split, so
# the training configuration no longer depends on held-out data. The value is
# unchanged: axis 1 has the same length in every split of all_years_arr.
history = model.fit(X_train, y_train, epochs=100, batch_size=X_train.shape[1], validation_data=(X_val, y_val))

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21s/step - accuracy: 0.0323 - loss: 4.5988 - val_accuracy: 0.1429 - val_loss: 4.2719
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step - accuracy: 0.0323 - loss: 4.3930 - val_accuracy: 0.1429 - val_loss: 4.1011
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step - accuracy: 0.0323 - loss: 4.2136 - val_accuracy: 0.4286 - val_loss: 3.9439
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step - accuracy: 0.2581 - loss: 4.0495 - val_accuracy: 0.2857 - val_loss: 3.7944
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step - accuracy: 0.2581 - loss: 3.8941 - val_accuracy: 0.2857 - val_loss: 3.6504
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step - accuracy: 0.2581 - loss: 3.7448 - val_accuracy: 0.2857 - val_loss: 3.5132
Epoch 7/100
[1m1/1[0m [32m━━━━━━

In [18]:
# loss and accuracy of the fitted model on the training seasons
train_loss, train_accuracy = model.evaluate(X_train, y_train)
print(f"Training data Loss: {train_loss}")
print(f"Training data Accuracy: {train_accuracy}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.9677 - loss: 0.6525
Training data Loss: 0.6525429487228394
Training data Accuracy: 0.9677419066429138


In [19]:
# loss and accuracy of the fitted model on the validation seasons
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step - accuracy: 0.4286 - loss: 2.3746
Validation Loss: 2.374570608139038
Validation Accuracy: 0.4285714328289032


In [20]:
# loss and accuracy of the fitted model on the held-out test seasons
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.1429 - loss: 2.8279
Test Loss: 2.8279426097869873
Test Accuracy: 0.1428571492433548


In [21]:
# baseline: how often the first row of a season (the top scorer) carries the MVP label
count = sum(1 for arr in y_train if arr[0] == 1)
print("The MVP is the highest scorer ", count/len(y_train), " of the time in the training set")

The MVP is the highest scorer  0.25806451612903225  of the time in the training set


In [22]:
# baseline: how often the first row of a season (the top scorer) carries the MVP label
count = sum(1 for arr in y_val if arr[0] == 1)
print("The MVP is the highest scorer ", count/len(y_val), " of the time in the validation set")

The MVP is the highest scorer  0.2857142857142857  of the time in the validation set


In [23]:
# baseline: how often the first row of a season (the top scorer) carries the MVP label
count = 0
for arr in y_test:
    if arr[0] == 1:
        count+=1
# divide by the size of the test set itself (this previously used len(y_val))
print("The MVP is the highest scorer ", count/len(y_test), " of the time in the test set")

The MVP is the highest scorer  0.2857142857142857  of the time in the test set


In [24]:
predict_yr = "2025"

# feature matrix for the season to predict (the trailing MVP label column is dropped)
data = get_final_df_by_year(predict_yr).to_numpy()[:, :-1]
shaped_data = data.reshape(1, X_train.shape[1], X_train.shape[2])
prediction_arr = model.predict(shaped_data)
mvp_idx = prediction_arr[0].argmax()
top_10_indices = prediction_arr[0].argsort()[-10:][::-1]

# The model's output index is a row position in the merged frame the features
# were taken from. Rebuild that same frame and look names up by position, so a
# name can never be paired with another row's probability (the un-merged
# top-75 frame carries different index labels).
candidate_names = merge_data(
    get_top_75_filtered(get_data(predict_yr)),
    get_team_data(predict_yr),
    get_advanced_data(predict_yr),
)['Player'].reset_index(drop=True)

mvp = candidate_names.iloc[mvp_idx]
top_10_mvps = candidate_names.iloc[top_10_indices]
print(mvp)
print(top_10_mvps)
print(prediction_arr[0])


found  0  MVPs in year 2025
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
Nikola Jokić
2              Nikola Jokić
1               LaMelo Ball
11             Kevin Durant
0     Giannis Antetokounmpo
13            Jalen Brunson
6              Jayson Tatum
19             Franz Wagner
7              De'Aaron Fox
10           Damian Lillard
5            Paolo Banchero
Name: Player, dtype: object
[6.2861770e-02 8.6063743e-02 4.5905432e-01 1.4415603e-02 1.0396108e-02
 2.6947385e-02 5.5630501e-02 3.0245533e-02 8.5296640e-05 1.3311747e-02
 2.9452777e-02 6.4344607e-02 1.3268767e-04 6.2344182e-02 7.4964257e-05
 2.0502963e-05 2.8915241e-05 5.4723464e-06 1.5917799e-04 3.1228583e-02
 1.6985739e-04 9.3657240e-05 2.1813168e-04 1.9226906e-04 7.7982084e-05
 4.6377583e-05 1.1498349e-04 1.7183798e-04 1.1604668e-04 5.4681601e-05
 2.2168227e-02 6.1406572e-05 6.2471932e-05 1.0358116e-03 4.5649551e-05
 1.8612534e-04 1.3775872e-05 1.0394790e-04 1.4028224e-04 6.6450244e-05
 1.3953759e-0