# 3pt data preprocessing

In [1]:
import numpy as np
import pandas as pd
import sys
from copy import deepcopy

In [2]:
# sys.path.append(r"/Users/fordfishman/GitHub/3pt-shooting/code/python/")
sys.path.append(r"/home/fordfishman/GitHub/3pt-shooting/code/python/")
from basketball_reference_scraper.teams import get_roster_stats



Set the seasons we desire and the teams as well. The seasons are 2010-2021, with each season going by the year at the time of the playoffs.

In [3]:
year_range = list( range(2010, 2022) ) 
team_abr = [
    'ATL',
    'BRK',
    'BOS',
    'CHA',
    'CHI',
    'CLE',
    'DAL',
    'DEN',
    'DET',
    'GSW',
    'HOU',
    'IND',
    'LAC',
    'LAL',
    'MEM',
    'MIA',
    'MIL',
    'MIN',
    'NOP',
    'NYK',
    'OKC',
    'ORL',
    'PHI',
    'PHO',
    'POR',
    'SAC',
    'SAS',
    'TOR',
    'UTA',
    'WAS',
    ]




In [4]:
def player_index(names, seasons):

    indices = list()

    for i in range(len(names)):

        name = names[i].replace(' ', '')
        indices.append( '%s%s' % (name, seasons[i]) )

    return indices
    
def get_data(mode):

    df = get_roster_stats('GSW', 2021, mode)
    df = pd.DataFrame().reindex(columns=df.columns)

    for year in year_range:
        for team in team_abr:

            df_i = get_roster_stats(team, year, mode)
            df_i['index'] = player_index(df_i.PLAYER, df_i.SEASON)
            df = df.append(df_i, ignore_index=True)

    return df


Run API and save raw data as .csv files.

In [5]:
df_adv = get_data('ADVANCED')
df_tot = get_data('TOTALS')
df_poss = get_data('PER_POSS')
df_adv.to_csv("~/GitHub/3pt-shooting/data/allplayer_adv.csv", index=False)
df_poss.to_csv("~/GitHub/3pt-shooting/data/allplayer_poss.csv", index=False)
df_tot.to_csv("~/GitHub/3pt-shooting/data/allplayer_tot.csv", index=False)

Read in data (if not already done with API)

In [71]:
# df_poss = pd.read_csv("~/GitHub/3pt-shooting/data/allplayer_poss.csv")
# df_adv = pd.read_csv("~/GitHub/3pt-shooting/data/allplayer_adv.csv")
# df_tot = pd.read_csv("~/GitHub/3pt-shooting/data/allplayer_tot.csv")

Remove columns with no data.

In [72]:
df_tot = df_tot.drop(['Unnamed: 29', 'DRtg', 'ORtg'], axis=1)
df_poss = df_poss.drop(['Unnamed: 29'], axis=1)
df_adv = df_adv.drop(['Unnamed: 19', 'Unnamed: 24'], axis=1)

remove duplicate columns, print out duplicates

In [73]:
for col in df_tot:
    
    if col in df_poss and df_tot[col].equals(df_poss[col]):

        df_poss = df_poss.drop([col], axis=1)
        
    if col in df_adv and df_tot[col].equals(df_adv[col]):
        
        df_adv = df_adv.drop([col], axis=1)


rename possession columns

In [11]:
df_poss = df_poss.add_suffix('_poss')


combine data frames

In [12]:
df_totposs = df_tot.join(df_poss, how='outer')
df_all = df_totposs.join(df_adv, how='outer')
df_all.columns


Index(['PLAYER', 'POS', 'AGE', 'TEAM', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'SEASON', 'eFG%',
       'index', 'FG_poss', 'FGA_poss', '3P_poss', '3PA_poss', '2P_poss',
       '2PA_poss', 'FT_poss', 'FTA_poss', 'ORB_poss', 'DRB_poss', 'TRB_poss',
       'AST_poss', 'STL_poss', 'BLK_poss', 'TOV_poss', 'PF_poss', 'PTS_poss',
       'ORtg_poss', 'DRtg_poss', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS',
       'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [13]:
df_trade = deepcopy(df_all)

idents = dict() # dictionary of id to list of indexes

for i, row in df_all.iterrows():

    ident = row['index']

    if not ident in idents:
        idents[ident] = [i]

    else:
        idents[ident].append(i)




need to fix so that each player year is a single row

In [14]:
num = df_trade.select_dtypes(include=['float64','int64']).columns
char = df_trade.select_dtypes(include=['object']).columns
totals = pd.Index(['G', 'GS', 'MP', 'FG', 'FGA', 
       '3P', '3PA', '2P', '2PA',  'FT', 'FTA', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'])
aves_list = list()

for i in num:
    if i not in totals:
        aves_list.append(i)
aves = pd.Index(aves_list)

all_inds = list()

i = 0
for ident, inds in idents.items():

    inds = list(inds)

    if len(inds) > 1: # if a player was moved
        all_inds += inds
        
        char_i = df_trade.loc[inds, char].reset_index(drop=True)
        tot_i = df_trade.loc[inds, totals].reset_index(drop=True)
        ave_i = df_trade.loc[inds, aves].reset_index(drop=True)

        char_new = char_i.loc[0,:]
        tot_new = tot_i.sum(axis=0)

        games = np.sum(tot_new.G)
        games_r = tot_i.G/games # ratio of games played
        ave_new = np.matmul(games_r, ave_i)

        new_row = pd.concat([char_new, tot_new, ave_new], axis=0)
        # print(new_row)
        
        df_trade = df_trade.append(new_row, ignore_index = True)

df_trade = df_trade.drop(index=all_inds).reset_index(drop=True)



# print(m)

  ave_new = np.matmul(games_r, ave_i)


remove seasons below 100 3PTA

In [33]:
df_limit = df_trade.loc[df_trade['3PA']>=100].reset_index(drop=True)

Add information about next year's shooting

In [35]:

seasons_sorted = sorted(list(set(df_limit.SEASON)))
player_dict = dict()

next_3pt = list()
next_imp = list()

for i in range(len(df_limit)):

    player = df_limit.loc[i, "PLAYER"]
    season = df_limit.loc[i, "SEASON"]
    pct = df_limit.loc[i, "3P%"]

    if player not in player_dict:
        player_dict[player] = {sea: np.nan for sea in seasons_sorted}
        player_dict[player][season] = pct
    
    else:
        player_dict[player][season] = pct

seasons_to_keep = dict() # a dictionary of season IDs we will keep

for player, seasons in player_dict.items():

    for i, season in enumerate(seasons_sorted): # iterate over all seasons in order

        if i == len(seasons_sorted) - 1: #if its the last season
            break

        season_1 = seasons_sorted[i+1]

        if not np.isnan(seasons[season]) and not np.isnan(seasons[season_1]):  

           name = player.replace(' ', '')
           index = '%s%s' % (name, season)
           seasons_to_keep[index] = seasons[season_1]

            


Add in next season values. This, along with the differential will be the targets.

In [68]:
df_limit['next_3P%'] = np.nan

for ident, pct in seasons_to_keep.items():
    i = df_limit[df_limit['index']==ident].index
    df_limit.loc[i, 'next_3P%'] = pct

df_final = df_limit.dropna(subset=['next_3P%'])
df_final['3pt_dif'] = df_final['next_3P%'] - df_final['3P%'] # differential between this and last seasons shooting percentage

change season to be a numerical

In [69]:
def season_num(season):
    year = season[-2:]
    return int( '20%s' % year )

season_v = map(season_num, df_final.SEASON)
df_final['SEASON'] = list(season_v)

Save final data frame

In [74]:
df_final.to_csv("~/GitHub/3pt-shooting/data/preprocessed.csv", index=False)