# Using RMSE to Evaluate Our Model

In [None]:
import pandas as pd
import numpy as np

In [None]:
# RMSE = Root Mean Square Error

In [None]:
proj_df = pd.read_csv('../nba-stats-csv/player_proj_df.csv')
proj_df.head()

In [None]:
proj_df['squared_error'] = (proj_df['proj_pts'] - proj_df['pts'])**(2) 

In [None]:
mse = proj_df['squared_error'].mean()  # MEAN SQUARED ERROR

In [None]:
rmse = mse ** (1/2)

In [None]:
rmse

In [None]:
proj_df['squared_error'] = (proj_df['proj_ast'] - proj_df['ast'])**(2)

In [None]:
mse = proj_df['squared_error'].mean()

In [None]:
rmse = mse ** (1/2)

In [None]:
rmse

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
player_proj = pd.read_csv('../nba-stats-csv/player_proj_df.csv')
player_info = pd.read_csv('../nba-stats-csv/player_info_df.csv')

In [None]:
player_proj.sample(5)

In [None]:
df_real = player_proj.loc[:, ['pts','min','fgm','fga','fg3m','fg3a','ftm','fta','oreb','dreb','ast','tov','blk']]

In [None]:
df_proj = player_proj.loc[:, ['proj_pts','proj_min','proj_fgm','proj_fga','proj_fg3m','proj_fg3a','proj_ftm','proj_fta','proj_oreb','proj_dreb','proj_ast','proj_tov','proj_blk']]

In [None]:
df_real.sample(5)

In [None]:
df_proj.sample(5)

In [None]:
# With its default settings mean_squared_error averages over the rows and then
# uniformly over the stat columns, so despite the name `se` this is already a
# single overall MSE value rather than per-row squared errors.
se = mean_squared_error(df_real, df_proj)

In [None]:
mse = np.mean(se)

In [None]:
rmse = np.sqrt(mse)

In [None]:
confidence = rmse

In [None]:
print('{0} percent confidence in projected {1} per game stats'.format(100 - round(confidence, 2), '2018-19'))

In [None]:
df_real_stats = pd.concat([player_info, df_real], axis=1)

In [None]:
df_real_stats.sample(5)

In [None]:
df_real_stats.rename(columns =
                    {'proj_season_id' : 'season_id'}, inplace=True)

In [None]:
df_proj_stats = pd.concat([player_info, df_proj], axis=1)

In [None]:
df_proj_stats.rename(columns =
                     {
                         'proj_season_id' : 'season_id',
                         'proj_pts' : 'pts',
                         'proj_min' : 'min',
                         'proj_fgm' : 'fgm',
                         'proj_fga' : 'fga',
                         'proj_fg3m' : 'fg3m',
                         'proj_fg3a' : 'fg3a',
                         'proj_ftm' : 'ftm',
                         'proj_fta' : 'fta',
                         'proj_oreb' : 'oreb',
                         'proj_dreb' : 'dreb',
                         'proj_ast' : 'ast',
                         'proj_stl' : 'stl',
                         'proj_tov' : 'tov',
                         'proj_blk' : 'blk'
                     }, inplace=True
                    )

In [None]:
display(df_real_stats.head(5)) # First five rows of player info joined with the real (non-projected) stat columns

In [None]:
display(round(df_proj_stats, 1).head(5))

# Comparing to Competitors

In [None]:
import pandas as pd
import numpy as np

In [None]:
player_proj = pd.read_csv('../nba-stats-csv/player_proj_df.csv')

In [None]:
columns = [
    'player_name',
    'player_id',
    'proj_pts',
    'proj_min',
    'proj_fgm',
    'proj_fga',
    'proj_fg3m',
    'proj_fg3a',
    'proj_ftm',
    'proj_fta',
    'proj_oreb',
    'proj_dreb',
    'proj_ast',
    'proj_stl',
    'proj_tov',
    'proj_blk'                     
]

In [None]:
df_proj_final = player_proj.loc[:, columns]

In [None]:
df_proj_final['proj_reb'] = df_proj_final['proj_oreb'] + df_proj_final['proj_dreb']

In [None]:
df_proj_final['proj_fg%'] = df_proj_final['proj_fgm'] / df_proj_final['proj_fga']
df_proj_final['proj_ft%'] = df_proj_final['proj_ftm'] / df_proj_final['proj_fta']

In [None]:
df_proj_final.tail(5)

In [None]:
final_columns = [
    'player_name',
    'player_id',
    'proj_pts',
    'proj_reb',
    'proj_ast',
    'proj_blk',
    'proj_stl',
    'proj_fg%',
    'proj_ft%',
    'proj_fg3m',
    'proj_min',
    'proj_tov'                 
]

In [None]:
df_proj_final = df_proj_final[final_columns]

In [None]:
df_competitor = pd.read_csv('../nba-stats-csv/ESPN_CBS_FantasyPros_Fantasy_Basketball_Overall_2018_Average_Projections.csv')

In [None]:
df_competitor.sample(5)

In [None]:
columns_to_drop = [
'Team',
'Positions',
'GP'
]

In [None]:
df_competitor.drop(columns = columns_to_drop, inplace = True)

In [None]:
df_competitor.sample(5)

In [None]:
lowercase_names = df_competitor['Player'].str.lower()

In [None]:
df_competitor['Player'] = lowercase_names

In [None]:
df_competitor.sample(5)

In [None]:
player_info = pd.read_csv('../nba-stats-csv/player_info_df.csv')


In [None]:
season = player_info['proj_season_id'] == '2018-19'

In [None]:
# Take an explicit copy of the filtered rows: player_df gets a column
# reassigned later, and writing into a boolean-mask selection of player_info
# would otherwise trigger pandas' SettingWithCopyWarning.
player_df = player_info[season].copy()

In [None]:
lowercase = player_df['player_name'].str.lower()

In [None]:
player_df['player_name'] = lowercase

In [None]:
player_df.sample(5)

In [None]:
# Look up a player_id for every competitor row by matching on player name.
# This is a left join, so competitor rows without a name match are kept.
competitor_merged = pd.merge(
    left=df_competitor,
    right=player_df[['player_name', 'player_id']],
    left_on='Player',
    right_on='player_name',
    how='left',
)
# Remove exact duplicate rows, then renumber the index from 0.
competitor_merged = competitor_merged.drop_duplicates()
competitor_merged = competitor_merged.reset_index(drop=True)

In [None]:
competitor_merged.dropna(how='any', inplace=True)

In [None]:
competitor_merged.sample(5)

In [None]:
competitor_merged.dtypes

In [None]:
player_ids = competitor_merged['player_id'].astype(int)

In [None]:
competitor_merged['player_id'] = player_ids

In [None]:
competitor_merged

In [None]:
del competitor_merged['Player']

In [None]:
competitor_merged

In [None]:
ordered_columns = [
    'player_name',
    'player_id',
    'PTS',
    'REB',
    'AST',
    'BLK',
    'STL',
    'FG%',
    'FT%',
    '3PM',
    'MIN',
    'TO'              
]

In [None]:
competitor_merged = competitor_merged[ordered_columns]

In [None]:
competitor_merged

In [None]:
columns = [
    'player_name',
    'player_id',
    'pts',
    'min',
    'fgm',
    'fga',
    'fg3m',
    'fg3a',
    'ftm',
    'fta',
    'oreb',
    'dreb',
    'ast',
    'stl',
    'tov',
    'blk'                     
]


In [None]:
df_real = player_proj.loc[:, columns]

In [None]:
df_real['fg%'] = df_real['fgm'] / df_real['fga']
df_real['ft%'] = df_real['ftm'] / df_real['fta']
df_real['reb'] = df_real['oreb'] + df_real['dreb']

In [None]:
final_real_columns = [
    'player_name',
    'player_id',
    'pts',
    'reb',
    'ast',
    'blk',
    'stl',
    'fg%',
    'ft%',
    'fg3m',
    'min',
    'tov'                 
]

In [None]:
df_real_final = df_real[final_real_columns]

In [None]:
df_real_final.head(5)

In [None]:
competitor_final = pd.merge(competitor_merged, df_real_final, how='left', on='player_id')

In [None]:
competitor_final.sample(5)

In [None]:
competitor_proj = competitor_final.dropna(how='any')

In [None]:
ordered_cols = [
    'pts',
    'reb',
    'ast',
    'blk',
    'stl',
    'fg%',
    'ft%',
    'fg3m',
    'min',
    'tov'                 
]
df_real = competitor_proj.loc[:, ordered_cols]

In [None]:
ordered_columns_uppercase = [
    'PTS',
    'REB',
    'AST',
    'BLK',
    'STL',
    'FG%',
    'FT%',
    '3PM',
    'MIN',
    'TO'              
]
df_proj = competitor_proj.loc[:, ordered_columns_uppercase]

In [None]:
# Calculate the competitors' root mean squared error (real stats vs. their projections).
line_mse = mean_squared_error(df_real, df_proj)
line_rmse = np.sqrt(line_mse)
# Store the result under the name the print below actually reads. The previous
# misspelling ("confidente") left `confidence` holding an earlier value, so the
# wrong number was printed here.
confidence = line_rmse
print('{0} percent confidence in projected {1} per game stats'.format(100 - round(confidence, 2), '2018-19'))

In [None]:
df_proj_final.sample(5)

In [None]:
df_real_final.sample(5)

In [None]:
# NOTE(review): at this point df_real and df_proj are the column slices taken
# from competitor_proj, which was built with dropna(how='any'), and the error
# metric that uses them has already been computed above -- these two fills
# look redundant; confirm before relying on them.
df_real.fillna(0, inplace=True)
df_proj.fillna(0, inplace=True)

# Left-join our own projections with the real stats on player_id; every row of
# df_proj_final is kept even when there is no matching real-stat row.
model_final = pd.merge(df_proj_final, df_real_final, how='left', on='player_id')

In [None]:
model_final

# Modifying Variables

In [None]:
def player_comparison_tool(df, current_player_season, current_player_id, weights=None):
    """Project a player's next-season stats from the most similar player-seasons.

    Every row of ``df`` is compared with the target player-season on the 14
    ``<stat>_norm`` columns: the module-level ``calc_distance`` is applied
    element-wise, the results are multiplied by ``weights`` and summed into a
    single distance. The rows ranked 2nd to 11th by that distance are used as
    comparables (the closest row is skipped; presumably it is the target
    itself). For each comparable, the row of the same player in the following
    season (per the module-level ``season_list``, looked up with
    ``find_player``) contributes its raw stats, weighted by ``1 / distance``.

    Side effect: a ``'distance'`` column is written to ``df``.

    Args:
        df: DataFrame with ``season_id``, ``player_id``, the 14 raw stat
            columns and their ``<stat>_norm`` counterparts.
        current_player_season: season id of the player-season to project from.
        current_player_id: player id of the player-season to project from.
        weights: optional sequence of 14 per-stat multipliers, in the order
            pts, min, fgm, fga, fg3m, fg3a, ftm, fta, oreb, dreb, ast, stl,
            tov, blk. Defaults to all ones.

    Returns:
        A dict with ``player_id``, ``proj_season_id`` and one ``proj_<stat>``
        entry per stat, or ``None`` (after printing a message) when the
        requested player-season is not in ``df``. The ``proj_<stat>`` values
        are NaN when no comparable has a usable following season.

    Raises:
        ValueError: if ``weights`` does not have 14 entries, or if more than
            one row matches the requested player-season.
        IndexError: if ``current_player_season`` is the last entry of
            ``season_list`` (there is no following season to project).
    """
    stats = ['pts', 'min', 'fgm', 'fga', 'fg3m', 'fg3a', 'ftm',
             'fta', 'oreb', 'dreb', 'ast', 'stl', 'tov', 'blk']
    norm_cols = [stat + '_norm' for stat in stats]

    if weights is None:
        weights = [1] * len(norm_cols)
    else:
        weights = list(weights)
        if len(weights) != len(norm_cols):
            raise ValueError(
                'weights must have {0} entries, got {1}'.format(len(norm_cols), len(weights))
            )

    # Build the selection mask once and reuse it for the guard and the lookups.
    is_current = (df['season_id'] == current_player_season) & (df['player_id'] == current_player_id)
    if not is_current.any():
        print('Can\'t find player with id {0} and season {1}'.format(current_player_id, current_player_season))
        return None

    # .item() insists on exactly one matching row.
    current_player_vector = np.array([df.loc[is_current, col].item() for col in norm_cols])

    proj_season = season_list[season_list.index(current_player_season) + 1]
    print('Projecting player_id {0} for season {1}'.format(current_player_id, proj_season))

    # Distance from the target to every row; the vectorized wrapper is
    # loop-invariant, so it is created once.
    vfunc = np.vectorize(calc_distance)
    player_distance = []
    for row in df.itertuples():
        compared_player_vector = np.array([getattr(row, col) for col in norm_cols])
        distance_vect = vfunc(current_player_vector, compared_player_vector)
        player_distance.append(np.sum(distance_vect * weights))

    # create a new column with the distance and rank by smallest distance
    df['distance'] = player_distance
    ranked_df = df.sort_values('distance')

    # Resolve each comparable's following season once (not once per stat).
    comparables = []
    for _, neighbour in ranked_df.iloc[1:11].iterrows():
        # skip over the row if it was 2017-18 season because we can't take the next
        if neighbour.season_id == '2017-18':
            continue
        next_season = season_list[season_list.index(neighbour.season_id) + 1]
        player_next_season = find_player(ranked_df, neighbour.player_id, next_season)
        # `is None` rather than `== None`: an equality test against an
        # array-like row would be element-wise and not usable in an `if`.
        if player_next_season is None:
            continue
        # NOTE(review): a comparable at distance exactly 0 makes this weight
        # infinite (or raises); confirm whether that can occur in the data.
        comparables.append((1 / neighbour.distance, player_next_season))

    projected_stats = {
        'player_id': current_player_id,
        'proj_season_id': proj_season,
    }
    sum_weight = sum(weight for weight, _ in comparables)
    for col in stats:
        if not comparables:
            # Nothing to average: report NaN instead of dividing by zero.
            projected_stats['proj_' + col] = float('nan')
            continue
        sum_stat = sum(getattr(nxt, col) * weight for weight, nxt in comparables)
        projected_stats['proj_' + col] = sum_stat / sum_weight
    return projected_stats

In [None]:
# read in per game data from csv folder
df = pd.read_csv('../nba-stats-csv/player_general_traditional_per_game_data.csv', header = 0)

In [None]:
df