In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import sys
import os

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Notes:
* Going to take the top 75 scoring players in the leage
    - in 1995-96 season, there were only 92 players in the leage
    - lowest ranked scorer that won the mvp was Steve Nash in 2005 where he averaged 15.5 PPG (63rd in the leage)
* csv files are in the directory of the 2nd year in the season. ex. the 95-96 season will be in the 1996 directory
* in the directory for the year, there is a players.csv file and a teams.csv file
    - the teams.csv file is so that I can associate a record with the players
* the mvp.csv dataset is in the outer data dir and has the name of each mvp winner since the 65-66 season


**GET DATA SAMPLE**

In [2]:
# get the data
def get_data(year):
    players = pd.read_csv("data/" + year + "/players.csv")
    return players

In [19]:
# transform the player data into something uniform that we can use
def get_top_75_filtered(players):
    
    # remove entries for players that played on multiple teams, just look at the last team they played for
    players_filtered = players.drop_duplicates(subset='Player', keep='last').reset_index()
    
    # sort by points per game and take top 75 candidates
    players_sorted = players_filtered.sort_values(by="PTS", ascending=False)
    top_75 = players_sorted.head(75).copy()
    
    # metrics we are using
    metrics = ["Player", "Rk", "Age", "Team", "Pos", "G", "GS", "MP", "FG", "FGA", "FG%", "3P", "3P%", "2P", "2PA", "2P%", "eFG%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]
    
    # these attributes didn't exist in 1956, so we have to create them
    # ['GS', '3P', '3P%', '2P', '2PA', '2P%', 'eFG%', 'ORB', 'DRB', 'STL', 'BLK', 'TOV']
    
    # assume they started every game they played
    if 'GS' not in top_75.columns:
        top_75['GS'] = top_75['G']
    
    # if data came before 3 pointers were invented, they only shot 2 pointers
    if '3P' not in top_75.columns:
        top_75['3P'] = 0
        top_75['3P%'] = 0
        top_75['2P'] = top_75['FG']
        top_75['2PA'] = top_75['FGA']
        top_75['2P%'] = top_75['FG%']
        top_75['eFG%'] = top_75['FG%']
    
    # if they didn't track offensive/deffensive rebounds, assume 3/4 of the rebounds were on defense
    if 'ORB' not in top_75.columns or 'DRB' not in top_75.columns:
        top_75['ORB'] = top_75['TRB']/4
        top_75['DRB'] = top_75['TRB'] - top_75['ORB']
    
    # assume steals, blocks, and turnovers are all 0 unless specified
    if 'STL' not in top_75.columns:
        top_75['STL'] = 0
    
    if 'BLK' not in top_75.columns:
        top_75['BLK'] = 0
    
    if 'TOV' not in top_75.columns:
        top_75['TOV'] = 0
    
    top_75_filtered = top_75[metrics]
    top_75_filtered.head()
    return top_75_filtered


In [20]:
# get the team data and turn it into something usable

def get_team_data(year):
    teams_vs = pd.read_csv("data/" + year + "/teams.csv")
    teams_vs = teams_vs.drop(columns=['Rk'])
    
    teams = pd.DataFrame(columns=['Team', 'Abbreviation', 'Wins', 'Losses', 'Win%'])
    
    # Get abbreviation and record
    for index, row in teams_vs.iterrows():
    
        # extract wins and losses
        row_wins = 0
        row_losses = 0
        for record in row:
            if pd.notna(record) and len(record) < 6:
                win_loss = record.split('-')
                row_wins += int(win_loss[0])
                row_losses += int(win_loss[1])
    
        # print("The " + row['Team'] + " have won ", row_wins, " and lost ", row_losses, " games")
    
        # don't divide by 0
        if row_losses == 0:
            win_pct = 1
        else:
            win_pct = row_wins/(row_losses+row_wins)
    
        # add to df    
        teams.loc[len(teams)] = ({'Team': row['Team'], 'Abbreviation': row.index[row.name + 1], 'Wins': row_wins, 'Losses': row_losses, 'Win%': win_pct})
    
    # get overall team rank in the nba
    teams = teams.sort_values(by='Win%', ascending=False)
    teams = teams.reset_index(drop=True)
    teams['Rank'] = teams.index+1
    
    return teams

# get_team_data("2025")

In [21]:
# combine into final dataframe
def merge_data(top_75_filtered, teams):
    player_season_final_df = pd.merge(top_75_filtered, teams, left_on='Team', right_on='Abbreviation')
    # print(player_season_final_df.head())
    return player_season_final_df

In [22]:
# verify that all teams matched, this should print an empty dataframe
def verify_team_match(top_75_filtered, teams):
    unmatched_teams = top_75_filtered[~top_75_filtered['Team'].isin(teams['Abbreviation'])]
    if len(unmatched_teams['Team']) > 0:
        print(unmatched_teams)
        return 1
    return 0
        

In [23]:
# only keep numerical variables, don't worry about player name, team name, etc
def get_numerical_df(player_season_final_df):
    numerical_df = player_season_final_df.select_dtypes(include=['number'])
    return numerical_df

In [24]:
def get_mvp_by_year(year):
    mvps = pd.read_csv("data/mvps.csv")
    result = mvps.loc[mvps['Season'].str[:4].astype(int) == int(year) - 1, 'Player'].values

    if(len(result) == 1):
        return result[0]
    else:
        print("found ", len(result), " MVPs in year " + year)

In [25]:
def get_final_df_by_year(year):
    # get the data
    og_data = get_data(year)
    top_75 = get_top_75_filtered(og_data)
    team_data = get_team_data(year)

    # merge it
    player_team_combined = merge_data(top_75, team_data)

    # add year and MVP attribute
    player_team_combined['Year'] = int(year)

    # mvp = get_mvp_by
    player_team_combined['MVP'] = (player_team_combined['Player'] == get_mvp_by_year(year)).astype(int)    

    # verify merge worked correctly
    if verify_team_match(top_75, team_data) > 0:
        print("Team merge did not work correctly. Aborting")
        sys.exit()

    # print(player_team_combined.head())

    # get only numeric variables
    numerical_df = get_numerical_df(player_team_combined)
    return numerical_df

# get_final_df_by_year("2025")

In [26]:
def get_all_years_arr():
    scaler = StandardScaler()
    
    all_years_df = []
    # get all the data from all the years
    for year_dir in os.listdir("data"):

        #skip current year
        if year_dir == "2025":
            continue
            
        # verify that the item is a directory and not a file. the mvps.csv file will be ignored
        if os.path.isdir(os.path.join("data", year_dir)):
            year_df = get_final_df_by_year(year_dir)
            # scaled_year = scaler.fit_transform(year_df)
            # scaled_year_df = pd.DataFrame(scaled_year, columns=year_df.columns)
            all_years_df.append(year_df)

    # all_data = pd.concat(all_years_df, ignore_index=True)
    
    # print(all_years_df)
    return np.array(all_years_df)
       
    


**MAKE MODEL**

In [45]:
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=[75, 31]))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(300, activation="relu"))
model.add(tf.keras.layers.Dense(100, activation="relu"))
model.add(tf.keras.layers.Dense(75, activation="softmax"))



In [46]:
model.summary()

In [47]:
model.compile(loss="categorical_crossentropy",
              optimizer="sgd",
              metrics=["binary_accuracy"])

**TRAIN MODEL**

In [48]:
# get data for model
all_years_arr = get_all_years_arr()

#split into training/validation
train_arr, val_arr = train_test_split(all_years_arr, test_size=0.1, random_state=42)

X_train = train_arr[:, :, :-1]
y_train = train_arr[:, :, -1]
X_val = val_arr[:, :, :-1]
y_val = val_arr[:, :, -1]

# print(X_train.shape)
# print(y_train.shape)
# print(X_val)
# print(y_val)





In [None]:
history = model.fit(X_train, y_train, epochs=250, batch_size=1, validation_data=(X_val, y_val))

Epoch 1/250
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - binary_accuracy: 0.9867 - loss: 4.3129 - val_binary_accuracy: 0.9867 - val_loss: 4.3207
Epoch 2/250
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - binary_accuracy: 0.9867 - loss: 4.2807 - val_binary_accuracy: 0.9867 - val_loss: 4.3241
Epoch 3/250
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - binary_accuracy: 0.9867 - loss: 4.2487 - val_binary_accuracy: 0.9867 - val_loss: 4.3276
Epoch 4/250
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.9867 - loss: 4.2170 - val_binary_accuracy: 0.9867 - val_loss: 4.3312
Epoch 5/250
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.9867 - loss: 4.1855 - val_binary_accuracy: 0.9867 - val_loss: 4.3349
Epoch 6/250
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - binary_accuracy: 0.9867 - loss: 4.154

In [None]:
predict_yr = "2025"

data = get_final_df_by_year(predict_yr).to_numpy()[:, :-1]
shaped_data = data.reshape(1, 75, 31)
prediction_arr = model.predict(shaped_data)
mvp_idx = prediction_arr[0].argmax()
top_10_indices = prediction_arr[0].argsort()[-10:][::-1]
# print(top_10_indices)
# print(get_top_75_filtered(get_data(predict_yr)).head(10))

mvp = get_top_75_filtered(get_data(predict_yr))['Player'][mvp_idx]
top_10_mvps = get_top_75_filtered(get_data(predict_yr))['Player'][top_10_indices]
print(mvp)
print(top_10_mvps)
print(prediction_arr[0])
