# Project Model Overview

## Steps in Our Model

---
* Normalize data across seasons
* Find 10 most similar player seasons historically
* Rank and weight each of those 10 players' season stats
* Look at those 10 players' following-season stats
* Use weighted averages to predict the current player's next season
* Rinse and repeat for every player in 2017-18
---

# Cleaning Our NBA Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # matplotlib (optionally together with seaborn) plots graphs inside the Jupyter Notebook.
%matplotlib inline

In [None]:
stats = [
    'pts',
    'min',
    'fgm',
    'fga',
    'fg3m',
    'fg3a',
    'ftm',
    'fta',
    'oreb',
    'dreb',
    'ast',
    'stl',
    'tov',
    'blk'
]

In [None]:
# Using Pandas we read in per game data from csv folder.
df = pd.read_csv('../nba-stats-csv/player_general_traditional_per_game_data.csv', header = 0)

In [None]:
df.head(10)

In [None]:
df.tail(10)

In [None]:
df.sample(10)

In [None]:
list(df)

In [None]:
df_cleaned = df.dropna(how='all') # delete all rows in which each col contains NaN
# 2nd param: axis=1 -> delete all columns in which each row contains NaN

In [None]:
df_cleaned['gp'].describe()

In [None]:
min_gp = df_cleaned['gp'].mean() - (df_cleaned['gp'].std() * 3)

In [None]:
min_gp

In [None]:
# Histogram of games played in our data frame.
# The last bin edge has to lie beyond the largest games-played value: with a
# fixed stop of 82 the edges ended at 80, so seasons of 81+ games fell outside
# every bin and were silently left out of the plot.
bin_step = 2
bin_values = np.arange(start=0, stop=df_cleaned['gp'].max() + bin_step + 1, step=bin_step)
df_cleaned['gp'].hist(bins=bin_values, figsize=[14,6])

In [None]:
min_gp = 10
df_filter = df_cleaned[df_cleaned['gp'] > min_gp]

In [None]:
x = df_cleaned['player_id'].count()
y = df_filter['player_id'].count()
print(x-y)

# Normalizing Data Across Seasons

In [None]:
season = df_filter['season_id'] == '2017-18'

In [None]:
# Take an independent copy: columns are added to df_2018 later, and assigning
# onto a filtered slice of df_filter triggers pandas' SettingWithCopyWarning.
df_2018 = df_filter[season].copy()

In [None]:
df_2018.sample(10)

In [None]:
df_2018['pts_normalized'] = (df_2018['pts'] - df_2018['pts'].min()) / (df_2018['pts'].max() - df_2018['pts'].min())

In [None]:
df_2018.sort_values('pts_normalized', ascending=False).head(10)

In [None]:
# We can find each player by his id in: https://www.nba.com/stats/player/ + [player_id]

In [None]:
def normalize(col):
    """Min-max scale a numeric column to the range [0, 1].

    Args:
        col: pandas Series (or NumPy array) of numeric values.

    Returns:
        An object of the same shape in which the smallest value maps to 0
        and the largest to 1. A column whose values are all identical maps
        to all zeros rather than the NaN a 0/0 division would produce.
    """
    lowest = col.min()
    spread = col.max() - lowest
    shifted = col - lowest
    if spread == 0:
        # Every value is identical, so there is nothing to scale.
        return shifted.astype(float)
    return shifted / spread

In [None]:
normalize(df_2018['pts'])

In [None]:
normalize(df_2018['fgm'])

In [None]:
normalize(df_2018['ast'])

In [None]:
cols_to_normalize = stats

In [None]:
def vorp(df):
    """Return ``df`` with a min-max normalized column added for every stat.

    NOTE(review): despite the name, nothing here computes a "value over
    replacement player" figure; the function only rescales columns.

    Relies on the module-level ``cols_to_normalize`` list and ``normalize``
    helper.

    Args:
        df: DataFrame containing every column named in ``cols_to_normalize``.

    Returns:
        A new DataFrame with one extra ``<stat>_normalize`` column per stat.
        The frame passed in is left untouched, which keeps the function safe
        to use on filtered slices and inside ``groupby(...).apply``.
    """
    normalized_cols = {
        f"{col_name}_normalize": normalize(df[col_name])
        for col_name in cols_to_normalize
    }
    return df.assign(**normalized_cols)

In [None]:
df_2018_normalized = vorp(df_2018)

In [None]:
df_2018_normalized.sample(10)

In [None]:
grouped = df_filter.groupby('season_id')
grouped['pts'].mean()

In [None]:
df_new = df_filter.groupby(['season_id']).apply(vorp)

In [None]:
df_new.sample(10)

# Player Distance Function

In [None]:
def calc_distance(u, v):
    """Return the Euclidean distance between ``u`` and ``v``.

    Works for plain numbers as well as equal-length NumPy arrays; for two
    numbers the result is simply the absolute difference.
    """
    squared_gap = np.square(u - v)
    return np.sqrt(np.sum(squared_gap))

In [None]:
df1 = pd.read_csv('../nba-stats-csv/player_id_player_name.csv')

In [None]:
df2 = pd.read_csv('../nba-stats-csv/player_general_traditional_per_game_data.csv', usecols=['player_id', 'season_id', 'pts'])

In [None]:
df1.sample(5)

In [None]:
df2.sample(5)

In [None]:
df3 = pd.merge(df1, df2, on='player_id', how='left')

In [None]:
df3.sample(5)

In [None]:
df_2019 = df3[df3['season_id'] == '2018-19']

In [None]:
df_2019.sample(10)

In [None]:
dame_df = df_2019[df_2019['player_name'] == 'Damian Lillard']
steph_df = df_2019[df_2019['player_name'] == 'Stephen Curry']
rudy_df = df_2019[df_2019['player_name'] == 'Rudy Gobert']

In [None]:
dame_ppg = dame_df.pts.tolist()[0]
steph_ppg = steph_df.pts.tolist()[0]
rudy_ppg = rudy_df.pts.tolist()[0]

In [None]:
calc_distance(dame_ppg, steph_ppg) # the order of the params doesn't change the result

In [None]:
calc_distance(dame_ppg, rudy_ppg)

In [None]:
calc_distance(steph_ppg, rudy_ppg)

# Find Player Function

In [None]:
nba_stats = {
    'first_name': ['Jaylen','Jayson','Jrue','Derrick','Al'],
    'last_name': ['Brown','Tatum','Holiday','White','Horford'],
    'ppg': [28.2, 30.3, 14.1, 12.5, 7.8],
    'apg': [5.2, 6.1, 4.4, 4.5, 1.5],
    'rpg': [6.0, 8.4, 4.9, 3.7, 6.1]
}
celtics_df = pd.DataFrame(nba_stats, columns = ['first_name','last_name','ppg','apg','rpg'])

In [None]:
for row in celtics_df.itertuples():
    print(row.first_name, row.last_name, row.ppg)

In [None]:
# read in per game data from csv folder
dataframe = pd.read_csv('../nba-stats-csv/player_general_traditional_per_game_data.csv', header = 0)

In [None]:
df_example = dataframe.sample(10)

In [None]:
for row in df_example.itertuples():
    print(row.season_id, row.player_id)

In [None]:
def find_player(player_id, player_season):
    """Look up one player season in the module-level ``df_example`` sample.

    Returns the first row tuple whose ``season_id`` and ``player_id`` both
    match, or ``None`` when the sample holds no such row.
    """
    matches = (
        record
        for record in df_example.itertuples()
        if player_season == record.season_id and player_id == record.player_id
    )
    return next(matches, None)

In [None]:
df_example

In [None]:
find_player(1504, '2001-02')

In [None]:
def find_player(df, player_id, player_season):
    """Return the first row of ``df`` for the given player and season.

    Rows are visited in frame order via ``itertuples``; the matching row
    tuple is returned, or ``None`` if no row matches both values.
    """
    for candidate in df.itertuples():
        if candidate.season_id != player_season:
            continue
        if candidate.player_id != player_id:
            continue
        return candidate
    return None

In [None]:
dataframe.dtypes # show the type of each field in the DF

In [None]:
player = 2544 # Lebron James
season = '2012-13'

In [None]:
find_player(dataframe, player, season)

# Calculating Player Similarity

In [None]:
current_player_season = '2016-17'
current_player_id = 201950

In [None]:
# read in per game data from csv folder
df = pd.read_csv('../nba-stats-csv/player_general_traditional_per_game_data.csv', header = 0)

In [None]:
df_cleaned = df.dropna(how='all')

In [None]:
min_gp = 10
df_filter = df_cleaned[df_cleaned['gp'] > min_gp]

In [None]:
cols_to_normalize = [
    'pts',
    'min',
    'fgm',
    'fga',
    'fg3m',
    'fg3a',
    'ftm',
    'fta',
    'oreb',
    'dreb',
    'ast',
    'stl',
    'tov',
    'blk'
]

In [None]:
def vorp(df):
    """Return ``df`` with a min-max normalized ``<stat>_norm`` column per stat.

    NOTE(review): despite the name, nothing here computes a "value over
    replacement player" figure; the function only rescales columns.

    Relies on the module-level ``cols_to_normalize`` list and ``normalize``
    helper.

    Args:
        df: DataFrame (typically one season's group) containing every column
            named in ``cols_to_normalize``.

    Returns:
        A new DataFrame with the extra ``<stat>_norm`` columns. The frame
        passed in is not modified, because mutating the object handed to a
        ``groupby(...).apply`` callback is not supported by pandas.
    """
    normalized_cols = {
        '{}_norm'.format(col_name): normalize(df[col_name])
        for col_name in cols_to_normalize
    }
    return df.assign(**normalized_cols)

In [None]:
df_norm = df_filter.groupby(['season_id']).apply(vorp)

In [None]:
jrue_pts_norm = (df_norm.loc[(df_norm['player_id'] == current_player_id) & (df_norm['season_id'] == current_player_season), 'pts_norm']).item()

In [None]:
jrue_pts_norm

In [None]:
# We compare only 3 of the normalized fields here, but any other normalized field can be added to this list
jrue_player_stats = [
    (df_norm.loc[(df_norm['player_id'] == current_player_id) & (df_norm['season_id'] == current_player_season), 'pts_norm']).item(),
    (df_norm.loc[(df_norm['player_id'] == current_player_id) & (df_norm['season_id'] == current_player_season), 'min_norm']).item(),
    (df_norm.loc[(df_norm['player_id'] == current_player_id) & (df_norm['season_id'] == current_player_season), 'ast_norm']).item()
]

In [None]:
print(jrue_player_stats)

In [None]:
current_player_vector = np.array(jrue_player_stats)

In [None]:
current_player_vector

In [None]:
compared_player_season = '2013-14'
compared_player_id = 203077

In [None]:
mkg_player_stats = [
    (df_norm.loc[(df_norm['player_id'] == compared_player_id) & (df_norm['season_id'] == compared_player_season), 'pts_norm']).item(),
    (df_norm.loc[(df_norm['player_id'] == compared_player_id) & (df_norm['season_id'] == compared_player_season), 'min_norm']).item(),
    (df_norm.loc[(df_norm['player_id'] == compared_player_id) & (df_norm['season_id'] == compared_player_season), 'ast_norm']).item()
]

In [None]:
compared_player_vector = np.array(mkg_player_stats)

In [None]:
compared_player_vector

In [None]:
vfunc = np.vectorize(calc_distance)

In [None]:
distance_vect = vfunc(current_player_vector, compared_player_vector)

In [None]:
distance_vect # We obtained the distance between each field for our 2 example players.
# [a1 b1] - [a2 b2] = [(a1-a2), (b1-b2), (c1-c2), (d1-d2)]
# [c1 d1]   [c2 d2]

In [None]:
number = np.sum(np.abs(distance_vect) / len(distance_vect))

In [None]:
number # This number measures the similarity in these 3 fields [pts,min,ast] between jrue and mkg.

# Comparing Multiple Players in a For Loop

In [None]:
player_distance = []

In [None]:
df_sample = df_norm.sample(10)

In [None]:
df_sample.head(10)

In [None]:
# Compare the current player against every sampled row and record each distance.
# The vectorized helper never changes between rows, so it is built once up front.
vfunc = np.vectorize(calc_distance)
# Start from an empty list so re-running this cell cannot append a second batch
# of distances (which would no longer match the number of rows in df_sample).
player_distance = []
for row in df_sample.itertuples():
    compared_player_vector = np.array([
        row.pts_norm,
        row.min_norm,
        row.ast_norm
    ])

    # Per-stat distance, averaged into a single number for this row
    distance_vect = vfunc(current_player_vector, compared_player_vector)
    number = np.sum(np.abs(distance_vect)) / len(distance_vect)
    player_distance.append(number)
    player = row.player_id
    print('Done with ' + str(player) + '. Percent error was ' + str(round(1 - number, 1)))

In [None]:
player_distance

In [None]:
df_sample['distance'] = player_distance # the shorter the distance the better

In [None]:
ranked_df = df_sample.sort_values('distance')

In [None]:
ranked_df.head(10)

In [None]:
import pandas as pd

In [None]:
df1 = pd.read_csv('../nba-stats-csv/player_info.csv')

In [None]:
df_final = pd.merge(ranked_df, df1, on=['player_id'], how='left').drop_duplicates()

In [None]:
df_final = df_final[['player_name', 'player_id', 'distance']]

In [None]:
df_final

# Weighting Stat Projections

In [None]:
season_list = [
    '1996-97',
    '1997-98',
    '1998-99',
    '1999-00',
    '2000-01',
    '2001-02',
    '2002-03',
    '2003-04',
    '2004-05',
    '2005-06',
    '2006-07',
    '2007-08',
    '2008-09',
    '2009-10',
    '2010-11',
    '2011-12',
    '2012-13',
    '2013-14',
    '2014-15',
    '2015-16',
    '2016-17',
    '2017-18',
    '2018-19'
]

In [None]:
season_id = season_list[18]
print(season_id)

In [None]:
season_id = '2014-15'
value = season_list.index(season_id)
print(value)

In [None]:
current_season = season_list[(season_list.index(season_id))]
print(current_season)

In [None]:
next_season = season_list[(season_list.index(season_id) + 1)]
print(next_season)

In [None]:
ranked_df.head(10)

In [None]:
print(current_player_id)
print(current_player_season)

In [None]:
test_df = ranked_df.iloc[0]

In [None]:
weight = (1 / test_df.distance)
print(weight)

In [None]:
stats = [
    'pts',
    'min',
    'fgm',
    'fga',
    'fg3m',
    'fg3a',
    'ftm',
    'fta',
    'oreb',
    'dreb',
    'ast',
    'stl',
    'tov',
    'blk'
]

In [None]:
projected_stats = {}

In [None]:
for col in stats:
    sum_stat = test_df[col] * weight
    projected_stats['proj_' + col] = (sum_stat / weight)

In [None]:
projected_stats

In [None]:
projected_stats = {}

In [None]:
sum_stat = 0
sum_weight = 0

In [None]:
# Accumulate the numerator and denominator of a weighted average of points.
# Each row's points are MULTIPLIED by its weight (1 / distance), so closer
# players count for more; adding the weight instead would not weight anything.
for row in ranked_df.itertuples():
    weight = (1 / row.distance)
    sum_stat += row.pts * weight
    sum_weight += weight

In [None]:
proj_points = (sum_stat / sum_weight)

In [None]:
proj_points

In [None]:
# These two entries do not depend on the stat, so they are set once up front.
projected_stats['player_id'] = current_player_id
projected_stats['proj_season_id'] = season_list[(season_list.index(current_player_season) + 1)]
for col in stats:
    sum_stat = 0
    sum_weight = 0
    for index, row in ranked_df.iterrows():
        weight = (1 / row.distance)
        # Weighted average: multiply the stat by its weight before summing
        sum_stat += row[col] * weight
        sum_weight += weight
    projected_stats['proj_' + col] = (sum_stat / sum_weight)

In [None]:
projected_stats # Stats are in a dictionary but we can convert it into a DF

# Weighted Average Using Multiple Players Next Season

In [None]:
# Project each stat as the weighted average of what the most similar players
# did in the season FOLLOWING their matched season.
projected_stats['player_id'] = current_player_id
projected_stats['proj_season_id'] = season_list[(season_list.index(current_player_season) + 1)]
for col in stats:
    sum_stat = 0
    sum_weight = 0
    # iloc[1:11] skips the best match (presumably the player's own row)
    for index, row in ranked_df.iloc[1:11].iterrows():
        season_position = season_list.index(row.season_id)
        # Skip 2017-18 rows, and rows from the last listed season, because
        # there is no following season to look up for them.
        if row.season_id == '2017-18' or season_position + 1 >= len(season_list):
            continue
        weight = (1 / row.distance)
        next_season = season_list[season_position + 1]
        player_next_season = find_player(ranked_df, row.player_id, next_season)
        if player_next_season is None:
            continue
        sum_stat += getattr(player_next_season, col) * weight
        sum_weight += weight
    # Store the result; NaN marks a stat for which no comparable player had a
    # following season in ranked_df.
    projected_stats['proj_' + col] = (sum_stat / sum_weight) if sum_weight else np.nan

# Player Comparison Tool Function

In [None]:
# Let's create the main event

In [None]:
def player_comparison_tool(df, current_player_season, current_player_id):
    """Project a player's next-season per-game stats from similar player seasons.

    The player's normalized stat line for ``current_player_season`` is compared
    with every row of ``df``. The ten closest rows after the best match (which
    is expected to be the player's own row) are looked up one season later, and
    those following-season stats are averaged with weights of ``1 / distance``.

    Relies on the module-level ``season_list`` (season ids in chronological
    order) and the ``find_player`` helper.

    Args:
        df: DataFrame holding ``player_id``, ``season_id``, the raw stat
            columns and their ``<stat>_norm`` counterparts. It is not modified.
        current_player_season: Season id of the stat line to project from,
            e.g. ``'2017-18'``.
        current_player_id: Id of the player to project.

    Returns:
        A dict with ``player_id``, ``proj_season_id`` and one ``proj_<stat>``
        entry per stat, or ``None`` when the player/season is not in ``df`` or
        when none of the similar players has a following season in ``df``.

    Raises:
        ValueError: If ``current_player_season`` is not in ``season_list``.
        IndexError: If ``current_player_season`` is the last entry of
            ``season_list``, so there is no season to project.
    """
    stats = [
        'pts',
        'min',
        'fgm',
        'fga',
        'fg3m',
        'fg3a',
        'ftm',
        'fta',
        'oreb',
        'dreb',
        'ast',
        'stl',
        'tov',
        'blk',
    ]
    norm_cols = [col + '_norm' for col in stats]

    is_current = (df['season_id'] == current_player_season) & (df['player_id'] == current_player_id)
    if not is_current.any():
        print('Can\'t find player with id {} and season {}'.format(current_player_id, current_player_season))
        return None

    proj_season_id = season_list[season_list.index(current_player_season) + 1]
    print('Projecting player_id {0} for season {1}'.format(current_player_id, proj_season_id))

    # Normalized stat line of the player being projected, read from the frame
    # and player/season that were passed in (first match if there are several).
    current_player_vector = df.loc[is_current, norm_cols].iloc[0].to_numpy(dtype=float)

    # The distance to every row is the mean absolute difference across the
    # normalized stats, computed for the whole frame in one NumPy expression.
    all_vectors = df[norm_cols].to_numpy(dtype=float)
    player_distance = np.abs(all_vectors - current_player_vector).mean(axis=1)

    # assign() returns a new frame, so the caller's DataFrame gains no column.
    ranked_df = df.assign(distance=player_distance).sort_values('distance')

    # Collect (following-season row, weight) once; it is the same for every stat.
    min_distance = 1e-9  # floor so an identical stat line cannot divide by zero
    comparables = []
    # iloc[1:11] skips the best match, presumably the player's own row.
    for row in ranked_df.iloc[1:11].itertuples():
        # skip over the row if it was 2017-18 season because we can't take the next
        if row.season_id == '2017-18':
            continue
        season_position = season_list.index(row.season_id)
        if season_position + 1 >= len(season_list):
            continue
        # the season after the COMPARED player's matched season
        next_season = season_list[season_position + 1]
        player_next_season = find_player(ranked_df, row.player_id, next_season)
        # if player_next_season doesn't exist then skip
        if player_next_season is None:
            continue
        weight = 1 / max(row.distance, min_distance)
        comparables.append((player_next_season, weight))

    if not comparables:
        print('No following-season data for players similar to id {} in season {}'.format(current_player_id, current_player_season))
        return None

    sum_weight = sum(weight for _, weight in comparables)

    projected_stats = {
        'player_id': current_player_id,
        'proj_season_id': proj_season_id,
    }
    for col in stats:
        sum_stat = sum(getattr(next_row, col) * weight for next_row, weight in comparables)
        projected_stats['proj_' + col] = (sum_stat / sum_weight)

    return projected_stats

# Projecting 2018-19 Season Stats

In [None]:
ten_players = [
    201935,
    203081,
    201942,
    201937,
    202339,
    203496,
    203497,
    201567,
    202331,
    202691
]

In [None]:
current_player_season = '2017-18'

In [None]:
final_projections = []

In [None]:
# read in per game data from csv folder
df = pd.read_csv('../nba-stats-csv/player_general_traditional_per_game_data.csv', header=0)

In [None]:
df_new = df[df.season_id != '2018-19']

In [None]:
df_new.sample(10)

In [None]:
df_cleaned = df_new.dropna(how='any')

In [None]:
min_gp = 10
df_filter = df_cleaned[df_cleaned['gp'] > min_gp]

In [None]:
df_final = df_filter.groupby(['season_id']).apply(vorp)

In [None]:
df_final.tail(5)

In [None]:
season_list = [
    '1996-97',
    '1997-98',
    '1998-99',
    '1999-00',
    '2000-01',
    '2001-02',
    '2002-03',
    '2003-04',
    '2004-05',
    '2005-06',
    '2006-07',
    '2007-08',
    '2008-09',
    '2009-10',
    '2010-11',
    '2011-12',
    '2012-13',
    '2013-14',
    '2014-15',
    '2015-16',
    '2016-17',
    '2017-18',
    '2018-19'
]

In [None]:
# Euclidean distance between two points (scalars or equal-length arrays)
def calc_distance(u,v):
    """Return the square root of the summed squared differences of ``u`` and ``v``."""
    return np.sqrt(np.square(u - v).sum())

# Find the row of a given player in a given season
def find_player(df, player_id, season):
    """Return the first row of ``df`` matching ``player_id`` and ``season``.

    Args:
        df: DataFrame with ``player_id`` and ``season_id`` columns.
        player_id: Id of the player to look up.
        season: Season id to look up, e.g. ``'2016-17'``.

    Returns:
        The matching row as an ``itertuples`` named tuple, or ``None`` when no
        row matches both values.
    """
    # A boolean mask lets pandas do the scan, replacing the Python-level loop
    # over every row; only the matching rows are turned into tuples.
    is_match = (df['season_id'] == season) & (df['player_id'] == player_id)
    return next(df[is_match].itertuples(), None)

In [None]:
for baller_id in ten_players:
    current_player_id = baller_id
    current_player_season = '2017-18'
    projections = player_comparison_tool(df_final, current_player_season, current_player_id)
    if (projections == None):
        continue
        
    final_projections.append(projections)

In [None]:
# **This will be the list where we storage our projections**
final_projections = []

In [None]:
test_projections = pd.DataFrame(final_projections)

In [None]:
test_projections.sample(5)

In [None]:
df_names = pd.read_csv('../nba-stats-csv/player_id_player_name.csv')

In [None]:
final_stat_df = pd.merge(test_projections, df_names, left_on=['player_id'], right_on=['player_id'], how='inner')

In [None]:
round(final_stat_df, 1)

In [None]:
player_df = pd.read_csv('../nba-stats-csv/player_info.csv')

In [None]:
player_filter = player_df[player_df['season_id'] == '2017-18']

In [None]:
player_ids_2018 = player_filter['player_id'].tolist()

In [None]:
player_ids_2018

In [None]:
# Start from an empty list so that projections gathered earlier (or by a
# previous run of this cell) are not included a second time.
final_projections = []
for baller_id in player_ids_2018:
    current_player_id = baller_id
    current_player_season = '2017-18'
    projections = player_comparison_tool(df_final, current_player_season, current_player_id)
    # The tool returns None for players it cannot project; skip those
    if projections is None:
        continue
    final_projections.append(projections)

In [None]:
player_info_columns = [
    'player_name',
    'player_id',
    'proj_season_id'
]

In [None]:
# final_projections is a plain list of dicts and carries no player_name, so
# turn it into a DataFrame and attach the names before selecting the columns.
player_info = pd.merge(pd.DataFrame(final_projections), df_names, on='player_id', how='inner')[player_info_columns]


In [None]:
# Finally we're gonna save our final projections to CSV.
# final_projections is a list of dicts, which has no to_csv method, so it is
# converted to a DataFrame for the export.
pd.DataFrame(final_projections).to_csv('../nba-stats-csv/player_proj_df.csv', index=False)
player_info.to_csv('../nba-stats-csv/player_info_df.csv', index=False)