In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os, re
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [None]:
# Mount Google Drive so the project datasets are reachable from this Colab runtime.
from google.colab import drive
drive.mount("/content/drive")
# NOTE(review): this is a relative path while the mount point above is absolute,
# so it only resolves when the working directory is /content -- confirm.
GOOGLE_PATH = "drive/MyDrive/SI671/project/"

In [None]:
path = GOOGLE_PATH + "datasets/pbp_dataset"

In [None]:
# Load every per-season play-by-play CSV found in `path` and tag each row with
# the season taken from the file name ("<season>_pbp.csv").
pbp_data = []
for file_name in sorted(os.listdir(path)):  # sorted => deterministic concat order
    if file_name.endswith("_pbp.csv"):
        season = file_name.split("_")[0]
        # os.listdir returns bare file names, so the directory must be joined
        # back on; reading `file_name` alone looks in the working directory.
        df = pd.read_csv(os.path.join(path, file_name))
        df['season'] = season
        pbp_data.append(df)

In [None]:
pbp_data = pd.concat(pbp_data, ignore_index=True)

In [None]:
player = pd.read_csv(path + "/NBA-playerlist.csv")
player_data = pd.read_csv(path + "/player_data.csv")

In [None]:
player = player[["DISPLAY_FIRST_LAST", "PERSON_ID"]]
player.columns = ["name", "id"]

In [None]:
# Single-season frames used for the exploratory cells below.
pbp_2000_01 = pd.read_csv(path + "/2000-01_pbp.csv")
# The exploration cells that follow reference `pbp_2001_02`, which was never
# defined, so they raise NameError on a fresh kernel. Load it explicitly.
# NOTE(review): assumes a "2001-02_pbp.csv" file exists alongside the 2000-01 one.
pbp_2001_02 = pd.read_csv(path + "/2001-02_pbp.csv")

In [None]:
pbp_2001_02['HOMEDESCRIPTION'].dropna().unique()

In [None]:
interest_action = ['PTS', 'MISS']
pattern = '|'.join(interest_action)
pbp_2001_02_shoot = pbp_2001_02[pbp_2001_02['HOMEDESCRIPTION'].astype(str).str.contains(pattern, na=False) |
                                pbp_2001_02['VISITORDESCRIPTION'].astype(str).str.contains(pattern, na=False)]


If one player blocks another player's shot, player1 is the shooter and player3 is the blocker.

We have six different score types: ['Shot', 'Throw', 'Layup', 'Dunk', 'Fadeaway', 'Finger Roll'].
Shot (usually recorded as a jump shot) and Fadeaway carry distance information, and we treat Layup and Finger Roll as the same type.


In [None]:
score_type = ['Shot', 'Throw', 'Layup', 'Dunk', 'Fadeaway', 'Finger Roll']
miss = 'MISS'
make = 'PTS'

In [None]:
interest_action = ['PTS', 'MISS']
pattern = '|'.join(interest_action)
def extract_pattern(dataset, pattern):
    """Keep the rows whose home or visitor description matches `pattern`.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'HOMEDESCRIPTION' and 'VISITORDESCRIPTION' columns.
    pattern : str
        Regular expression passed to ``Series.str.contains``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index labels kept).
        Returning a copy lets later steps add columns without writing into a
        slice of the caller's frame (SettingWithCopyWarning).
    """
    # astype(str) turns missing descriptions into plain strings so .str works
    # even when a column is entirely empty.
    in_home = dataset['HOMEDESCRIPTION'].astype(str).str.contains(pattern, na=False)
    in_visitor = dataset['VISITORDESCRIPTION'].astype(str).str.contains(pattern, na=False)
    return dataset[in_home | in_visitor].copy()

In [None]:
def extract_shot_player(dataset):
    """Expose the PLAYER1_* columns under shorter shooter-centric names.

    Adds 'player_name', 'player_id', 'team_abbr', 'team_id' and
    'team_nickname' to `dataset` (in place) and returns it.

    The columns are copied directly instead of through row-wise ``apply``:
    the values are the same, it is far faster on large frames, and it also
    works on an empty frame.
    """
    shooter_columns = {
        'player_name': 'PLAYER1_NAME',
        'player_id': 'PLAYER1_ID',
        'team_abbr': 'PLAYER1_TEAM_ABBREVIATION',
        'team_id': 'PLAYER1_TEAM_ID',
        'team_nickname': 'PLAYER1_TEAM_NICKNAME',
    }
    for target, source in shooter_columns.items():
        dataset[target] = dataset[source]
    return dataset

In [None]:
def extract_score_type(dataset, score_type):
    """Add one 0/1 indicator column per entry of `score_type`.

    A row gets 1 in column `play` when the literal text `play` occurs in its
    home or visitor description. Columns are added to `dataset` in place and
    the frame is returned.

    Uses vectorised, literal (non-regex) substring matching instead of a
    Python-level ``apply`` over every row.
    """
    home = dataset['HOMEDESCRIPTION'].astype(str)
    visitor = dataset['VISITORDESCRIPTION'].astype(str)
    for play in score_type:
        mentioned = (home.str.contains(play, regex=False, na=False)
                     | visitor.str.contains(play, regex=False, na=False))
        dataset[play] = mentioned.astype(int)
    return dataset

In [None]:
def extract_distance(desc):
    """Return the first distance in feet (digits followed by `'`) found in `desc`.

    Returns 0 when `desc` is empty, is not a string (e.g. None or NaN), or
    contains no such token. Previously a truthy non-string such as NaN reached
    ``re.search`` and raised TypeError.
    """
    if not isinstance(desc, str):
        return 0
    match = re.search(r'(\d+)\'', desc)
    return int(match.group(1)) if match else 0

def extract_distance_df(dataset):
    """Add an integer 'shot_distance' column (feet) parsed from the descriptions.

    The home description is used when it yields a non-zero distance, otherwise
    the visitor description; rows with no distance in either get 0. The column
    is added to `dataset` in place and the frame is returned.

    Parsing is vectorised with ``Series.str.extract`` rather than a row-wise
    ``apply``, using the same pattern (digits followed by `'`, first match).
    """
    def _feet(column):
        # First "<digits>'" token per row; rows without one become 0.
        digits = dataset[column].astype(str).str.extract(r"(\d+)'", expand=False)
        return pd.to_numeric(digits, errors='coerce').fillna(0).astype(int)

    home_feet = _feet('HOMEDESCRIPTION')
    visitor_feet = _feet('VISITORDESCRIPTION')
    # Same fallback as `home or visitor` on the scalar values.
    dataset['shot_distance'] = home_feet.where(home_feet != 0, visitor_feet)
    return dataset

In [None]:
def extract_make_or_miss(dataset):
    """Flag each row as a made and/or missed attempt.

    'make' is 1 when either description contains 'PTS'; 'miss' is 1 when
    either contains 'MISS'. Both columns are written onto `dataset`, which is
    returned.
    """
    home = dataset['HOMEDESCRIPTION']
    visitor = dataset['VISITORDESCRIPTION']
    for column, keyword in (('make', 'PTS'), ('miss', 'MISS')):
        hit = home.str.contains(keyword, na=False) | visitor.str.contains(keyword, na=False)
        dataset[column] = hit.astype(int)
    return dataset

In [None]:
def extract_pts_type(dataset):
    """Classify each row as a 3-pointer, a free throw, or (otherwise) a 2-pointer.

    '3PT' requires both '3PT' and 'Shot' in the same description; 'FT'
    requires 'Free Throw' in either description; '2PT' is every remaining
    row. The three 0/1 columns are written onto `dataset`, which is returned.
    """
    def _has(column, text):
        return dataset[column].str.contains(text, na=False)

    is_three = ((_has('HOMEDESCRIPTION', '3PT') & _has('HOMEDESCRIPTION', 'Shot'))
                | (_has('VISITORDESCRIPTION', '3PT') & _has('VISITORDESCRIPTION', 'Shot')))
    is_free_throw = _has('HOMEDESCRIPTION', 'Free Throw') | _has('VISITORDESCRIPTION', 'Free Throw')

    dataset['3PT'] = is_three.astype(int)
    dataset['FT'] = is_free_throw.astype(int)
    dataset['2PT'] = (~is_three & ~is_free_throw).astype(int)
    return dataset


In [None]:
def extract_interest_col(dataset, interest_cols, pattern, score_type):
    """Run the full shot-extraction pipeline on one play-by-play frame.

    Works on a copy of `dataset`: keeps rows matching `pattern`, then adds the
    make/miss flags, shot-type indicators, shooter columns, shot distance and
    point-type flags, and finally returns only `interest_cols`.
    """
    shots = (
        dataset.copy()
        .pipe(extract_pattern, pattern)
        .pipe(extract_make_or_miss)
        .pipe(extract_score_type, score_type)
        .pipe(extract_shot_player)
        .pipe(extract_distance_df)
        .pipe(extract_pts_type)
    )
    return shots[interest_cols]

In [None]:
interest_cols = ['GAME_ID', 'HOMEDESCRIPTION', 'VISITORDESCRIPTION', 'make','miss', 'player_name', 'player_id', 'team_abbr', 'team_id', 'team_nickname', 'Shot', 'Throw', 'Layup', 'Dunk', 'Fadeaway', 'Finger Roll', 'shot_distance', '2PT', '3PT', 'FT']

# Extract all season's play detail

The 2019-20 data is in a different format, so that season is excluded below.

In [None]:
# Full paths of the per-season play-by-play files, in chronological order.
# os.listdir returns entries in arbitrary order, but later cells rely on the
# position in this list (season = index + 2000, shooting_list[8], ...), so the
# names ("2000-01_pbp.csv", "2001-02_pbp.csv", ...) must be sorted.
pbp_filepath = sorted(
    path + "/" + file_name
    for file_name in os.listdir(path)
    if file_name.endswith("_pbp.csv")
)

In [None]:
# Exclude the 2019-20 season. Filtering (rather than list.remove) keeps this
# cell re-runnable: remove() raises ValueError once the entry is already gone.
pbp_filepath = [p for p in pbp_filepath if p != path + "/2019-20_pbp.csv"]

In [None]:
# One extracted shot table per season file; the season label is the file's
# position in `pbp_filepath` offset from 2000.
shooting_list = []
for season_offset, csv_path in enumerate(pbp_filepath):
    season_shots = extract_interest_col(pd.read_csv(csv_path), interest_cols, pattern, score_type)
    season_shots['season'] = 2000 + season_offset
    shooting_list.append(season_shots)

In [None]:
# players[k] holds the rows of player_data active in year 2000 + k, i.e.
# year_start <= year <= year_end. The duplicate-name count is printed per year.
players = []
for year in range(2000, 2020):
    print(year)
    is_active = (player_data['year_end'] >= year) & (player_data['year_start'] <= year)
    # .copy(): these frames are edited with .loc / drop further down, which
    # must not write into a slice of player_data. A distinct loop name also
    # avoids clobbering the `player` id/name table loaded earlier.
    season_players = player_data[is_active].copy()
    players.append(season_players)
    print(len(season_players[season_players['name'].duplicated()]))

## Deal with duplicate values for 08, 09, 13, 14

In [None]:
shooting_list[8].loc[(shooting_list[8]['player_name']=='Marcus Williams')&(shooting_list[8]['team_abbr']=='GSW'),'player_name']='Marcus Williams_UConn'

In [None]:
players[8].loc[(players[8]['name']=='Marcus Williams')&(players[8]['college']=='University of Connecticut'),'name']='Marcus Williams_UConn'

In [None]:
players[8][players[8]['name']=='Marcus Williams']

In [None]:
shooting_list[9][shooting_list[9]['player_name']=='Marcus Williams']['team_abbr'].unique()

In [None]:
players[9][players[9]['name']=='Marcus Williams']

In [None]:
players[9]

In [None]:
players[9]=players[9].drop(4393)

In [None]:
players[9][players[9]['name']=='Marcus Williams']

In [None]:
players[13][players[13]['name'].duplicated()]

In [None]:
players[13][players[13]['name']=='Chris Wright']

In [None]:
players[13][players[13]['name']=='Chris Johnson']

In [None]:
players[13]=players[13].drop(1999)

In [None]:
players[13]=players[13].drop(4496)

In [None]:
#only leave Chris Johnson from Dayton
shooting_list[13][shooting_list[13]['player_name']=='Chris Johnson']['team_abbr'].unique()

In [None]:
shooting_list[13][shooting_list[13]['player_name']=='Chris Wright']['team_abbr'].unique()

In [None]:
players[14][players[14]['name'].duplicated()]

In [None]:
shooting_list[14][shooting_list[14]['player_name']=='Tony Mitchell']['team_abbr'].unique()

In [None]:
shooting_list[0][shooting_list[0]['Fadeaway']==1]

In [None]:
shooting_list[1][shooting_list[1]['Fadeaway']==1]

In [None]:
shooting_list[0]

# All cleaned, start analysis

In [None]:
# Attach each shooter's position for that season by matching on player name.
# zip pairs the season tables positionally and stops at the shorter list, so
# no season count is hard-coded.
# NOTE(review): any name still duplicated in a season's player table will
# duplicate the matching shot rows in this left join.
all_season = [
    pd.merge(season_shots, season_players[['name', 'position']],
             how='left', left_on='player_name', right_on='name')
    for season_shots, season_players in zip(shooting_list, players)
]

In [None]:
df = pd.concat(all_season,ignore_index=True)

In [None]:
df['position'].info()

In [None]:
df.columns

In [None]:
# Columns needed for the season / position aggregates. .copy() makes this an
# independent frame so the later edits to 'position' do not write into `df`.
all_shootings = df[['make', 'miss','Shot','Throw','Layup','Dunk','Fadeaway','Finger Roll','shot_distance','2PT', '3PT', 'FT','season', 'name', 'position']].copy()

In [None]:
all_shootings.describe()

In [None]:
# Shots whose shooter could not be matched to a position get the label 'N'.
# Assign the result back: an in-place fillna on a selected column acts on a
# temporary under pandas copy-on-write and leaves the frame unchanged.
all_shootings['position'] = all_shootings['position'].fillna('N')

In [None]:
all_shootings.columns

In [None]:
all_shootings[(all_shootings['Shot']==1) & (all_shootings['Layup']==1) &(all_shootings['Dunk']==1) & (all_shootings['Fadeaway']==1) &(all_shootings['Finger Roll']==1) & (all_shootings['Throw']==0)]

In [None]:
all_shootings[(all_shootings['Finger Roll']==0) & (all_shootings['Layup']==1)]

In [None]:
# Reduce combined positions such as "F-C" to the primary (first) one.
# The vectorised .str form tolerates missing values (still labelled 'N' here),
# whereas calling x.split on a NaN raised AttributeError.
all_shootings['position'] = all_shootings['position'].fillna('N').str.split('-').str[0]

In [None]:
# League totals per season. numeric_only=True keeps the text columns
# ('name', 'position') out of the sum; summing them is meaningless and, on
# recent pandas, concatenates strings or raises on missing values.
shooting_byseason = all_shootings.groupby(['season']).sum(numeric_only=True)

In [None]:
shooting_byseason.reset_index(inplace=True)

In [None]:
shooting_byseason.set_index('season',inplace=True)

In [None]:
shooting_byseason['Field_Goal']=shooting_byseason['2PT']+shooting_byseason['3PT']

In [None]:
shooting_byseason['Shot']+shooting_byseason['Layup']+shooting_byseason['Dunk']+shooting_byseason['Fadeaway']+shooting_byseason['Finger Roll']

In [None]:
shooting_byseason

In [None]:
# Totals per (season, position). numeric_only=True excludes the text column
# 'name' from the sum.
shootings_byposition = all_shootings.groupby(['season', 'position']).sum(numeric_only=True)

In [None]:
shootings_byposition.reset_index(inplace=True)

In [None]:
shootings_byposition

In [None]:
shootings_byposition

In [None]:
shootings_byposition['layup_all']=shootings_byposition['Layup']+shootings_byposition['Finger Roll']
shootings_byposition['shot_all']=shootings_byposition['Shot']+shootings_byposition['Fadeaway']


In [None]:
shootings_byposition['all_attempt']=shootings_byposition['2PT']+shootings_byposition['3PT']

In [None]:
# Number of distinct players per (season, position). This is the denominator
# of the "attempts per player" ratios plotted later. Counting rows of 'Shot'
# (as before) gives the number of shot events instead, which turns
# all_attempt / count into a share of events rather than a per-player figure.
# The result column keeps the name 'Shot' because the following merge/rename
# cell maps 'Shot_y' -> 'count'.
# NOTE(review): rows with no matched player (position 'N') have a missing
# name and therefore a count of 0.
position_count = (
    all_shootings.groupby(['season', 'position'])['name']
    .nunique()
    .rename('Shot')
    .reset_index()
)
position_count

In [None]:
shootings_byposition=shootings_byposition.merge(position_count,on=['season','position'],how='left')
shootings_byposition.rename(columns={'Shot_y':'count','Shot_x':'Shot'},inplace=True)


In [None]:
shootings_byposition

In [None]:
# Made attempts per (season, position), broken down by shot type and point type.
# numeric_only=True keeps the text column 'name' out of the sum.
makebytype = all_shootings.groupby(['season', 'position', 'make']).sum(numeric_only=True).reset_index()
makebytype = makebytype[makebytype['make'] == 1].copy()
# Layup and Finger Roll are treated as one type, as are Shot and Fadeaway.
makebytype['layup_made'] = makebytype['Layup'] + makebytype['Finger Roll']
makebytype['shot_made'] = makebytype['Shot'] + makebytype['Fadeaway']
makebytype = makebytype[['season', 'position', 'shot_made', 'layup_made', 'Dunk', '2PT', '3PT', 'FT']]
# Rename by label instead of positionally; '3Pt_made' keeps its existing
# spelling because later cells reference it.
makebytype = makebytype.rename(columns={'Dunk': 'dunk_made', '2PT': '2PT_made',
                                        '3PT': '3Pt_made', 'FT': 'FT_made'})
makebytype

In [None]:
shootings_byposition=shootings_byposition.merge(makebytype,on=['season','position'],how='left')
shootings_byposition

In [None]:
shootings_byposition['2PT%']=shootings_byposition['2PT_made']/shootings_byposition['2PT']
shootings_byposition['3PT%']=shootings_byposition['3Pt_made']/shootings_byposition['3PT']
shootings_byposition['FT%']=shootings_byposition['FT_made']/shootings_byposition['FT']
shootings_byposition['layup%']=shootings_byposition['layup_made']/shootings_byposition['layup_all']
shootings_byposition['shot%']=shootings_byposition['shot_made']/shootings_byposition['shot_all']
shootings_byposition['dunk%']=shootings_byposition['dunk_made']/shootings_byposition['Dunk']


In [None]:
shootings_byposition['all%']=(shootings_byposition['2PT_made']+shootings_byposition['3Pt_made'])/(shootings_byposition['2PT']+shootings_byposition['3PT'])

In [None]:
shootings_byposition.columns

In [None]:
shootings_byposition

In [None]:
# Per-position views of the (season, position) summary: Center, Forward, Guard.
center, f, g = (
    shootings_byposition[shootings_byposition['position'] == code]
    for code in ('C', 'F', 'G')
)

In [None]:
def calc_wma(ser, wd_size, weights=1):
    """Trailing weighted moving average of `ser`.

    Parameters
    ----------
    ser : pandas.Series
        Values to smooth (accessed positionally through ``.iloc``).
    wd_size : int
        Window length. The first ``wd_size - 1`` points use a shorter window
        made of the values available so far.
    weights : scalar or sequence, default 1
        A scalar gives every point in the window the same weight. A sequence
        of length `wd_size` weights the window oldest-to-newest; shortened
        windows use its trailing entries.

    Returns
    -------
    numpy.ndarray
        One average per element of `ser`.
    """
    # Accept any scalar (int, float, numpy scalar), not only a Python int.
    if np.isscalar(weights):
        weights = np.full(wd_size, weights, dtype=float)
    else:
        weights = np.asarray(weights, dtype=float)

    wma = []
    for i in range(len(ser)):
        low, high = max(0, i - wd_size + 1), i + 1
        wma.append(np.average(ser.iloc[low: high], weights=weights[-(high - low):]))
    return np.array(wma)

In [None]:
def getRolling(wd_size,col):
    """Rolling mean and standard deviation of `col` over a trailing window.

    Both are derived from moving averages computed by `calc_wma`:
    std = sqrt(E[x^2] - E[x]^2).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(rolling_mean, rolling_std)``, one value per element of `col`.
    """
    first_mmt = calc_wma(col, wd_size)
    second_mmt = calc_wma(col ** 2, wd_size)
    # Floating-point rounding can push E[x^2] - E[x]^2 slightly below zero
    # for (near-)constant windows; clamp so sqrt does not return NaN.
    variance = np.maximum(second_mmt - first_mmt ** 2, 0.0)
    return first_mmt, np.sqrt(variance)

In [None]:
# 3-season rolling mean / std of league-wide attempts for each category.
rolling_stats = {
    column: getRolling(3, shooting_byseason[column].astype('float'))
    for column in ('Field_Goal', '2PT', '3PT', 'FT')
}
field_goal_rm, field_goal_rstd = rolling_stats['Field_Goal']
pt2_rm, pt2_rstd = rolling_stats['2PT']
pt3_rm, pt3_rstd = rolling_stats['3PT']
ft_rm, ft_rstd = rolling_stats['FT']

In [None]:
center['all_attempt']

In [None]:
center['layup_all']+center['Dunk']+center['shot_all']

# Overall trend

In [None]:
# League-wide attempts per season with their rolling mean, one panel per category.
fig, axes = plt.subplots(4, 1, figsize=(15, 10), sharex=True)
panels = (
    ("Field Goal Attempts", 'Field_Goal', 'Field Goal', field_goal_rm),
    ("2-Point Shoot Attempts", '2PT', '2PT', pt2_rm),
    ("3-Point Shoot Attemps", '3PT', '3PT', pt3_rm),
    ("Free Throw Attemps", 'FT', 'FT', ft_rm),
)
for ax, (title, column, label, rolling_mean) in zip(axes, panels):
    ax.set_title(title)
    ax.plot(shooting_byseason[column], label=label)
    ax.plot(pd.Series(rolling_mean, index=shooting_byseason.index), label="Rolling Mean")
    ax.set_ylabel("Attempts")
    ax.legend(loc='upper left')

axes[3].set_xticks(shooting_byseason.index)
fig.suptitle("Shooting Attempts League Total", x=0.513, y=0.95)
plt.savefig('1')

According to the observed series and the rolling-mean plot for each shooting category, since 2000 overall field goal attempts and 3-point attempts show an increasing trend, while 2-point attempts and free throw attempts show a decreasing trend. This indicates that overall shooting volume in the league is increasing, but players tend to shoot more 3-pointers instead of 2-pointers, and the decreasing trend in free throws might indicate that players are less inclined to attack the basket than before.

# draw time series plot for three positions

In [None]:
center.head()

In [None]:
# Index the per-position tables by season for plotting. The guard keeps the
# cell re-runnable: an in-place set_index raises KeyError the second time,
# once 'season' is already the index.
center, f, g = (
    frame.set_index('season') if 'season' in frame.columns else frame
    for frame in (center, f, g)
)

In [None]:
center

In [None]:
f

In [None]:
g

In [None]:
center['all_attempt']/center['count']

In [None]:
f['all_attempt']/f['count']

In [None]:
g['all_attempt']/g['count']

In [None]:
(g['all_attempt']/g['count'])-(f['all_attempt']/f['count'])

In [None]:
# Attempts divided by the per-position 'count', one panel per attempt category
# and one line per position.
fig, axes = plt.subplots(4, 1, figsize=(15, 10), sharex=True)
position_frames = (('Center', center), ('Forward', f), ('Guard', g))
panels = (
    ("Field Goal Attemps Per Player By Position", 'all_attempt'),
    ("2-Points Attemps Per Player By Position", '2PT'),
    ("3-Points Attemps Per player By Position", '3PT'),
    ("Free Throw Attemps Per player By Position", 'FT'),
)
for ax, (title, column) in zip(axes, panels):
    ax.set_title(title)
    for label, frame in position_frames:
        ax.plot(frame[column] / frame['count'], label=label)
    ax.set_ylabel("Attempts")
    ax.legend(loc='upper left')

axes[3].set_xticks(center.index)
fig.suptitle("Shooting Attempts Per Player By Postion", x=0.513, y=0.95)

We then look at the shooting attempts for each position and the tactical priority of each position. Because the player counts for each position are imbalanced, we use Attempts Per Player (APP) to represent the amount of shooting attempts for each position. As the plot shows, guards have the highest APP for field goals among all positions, while centers have the lowest. This matches common basketball sense, because guards dominate the ball and are usually the players responsible for scoring. An interesting finding is that the difference between guards and forwards is becoming smaller, which probably indicates a tactical change: more and more teams are inclined to let forwards do the finishing.
For all positions, there is a decreasing trend in 2-pointer APP and free throw APP and an increasing trend in 3-pointer APP, which also shows that NBA teams tend to take more 3-point attempts rather than attacking the basket.

In [None]:
center

In [None]:
# Hit rate (in percent) per season, one panel per attempt category and one
# line per position.
fig, axes = plt.subplots(4, 1, figsize=(15, 10), sharex=True)
position_frames = (('Center', center), ('Forward', f), ('Guard', g))
panels = (
    ("Field Goal hit% By Position", 'all%'),
    ("2-Points hit% By Position", '2PT%'),
    ("3-Points hit% By Position", '3PT%'),
    ("Free hit% By Position", 'FT%'),
)
for ax, (title, column) in zip(axes, panels):
    ax.set_title(title)
    for label, frame in position_frames:
        ax.plot(frame[column] * 100, label=label)
    ax.set_ylabel("Shooting%")
    ax.legend(loc='upper left')

axes[3].set_xticks(center.index)
fig.suptitle("Shooting hit% By Postion", x=0.513, y=0.95)

Among all positions, centers have the highest overall field goal and 2-point hit rates, followed by forwards and then guards. For 3-point shots, we can detect a clear increasing trend in the centers' hit rate: in the 2000-2001 season it is extremely low compared to the other two positions, it keeps increasing, and after the 2013-2014 season the 3-point hit rates of all three positions are very close to each other. This indicates that centers' 3-point shooting technique is much better than in earlier seasons.

In [None]:
(center['shot_all']/center['all_attempt'])*100,(center['layup_all']/center['all_attempt'])*100,(center['Dunk']/center['all_attempt'])*100

In [None]:
# Share (in percent) of field goal attempts taken as shots, layups and dunks,
# one panel per position.
fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
position_frames = (('Center', center), ('Forward', f), ('Guard', g))
shot_kinds = (('shot_all', 'Shot'), ('layup_all', 'Layup'), ('Dunk', 'Dunk'))
for ax, (position_name, frame) in zip(axes, position_frames):
    ax.set_title(f"Shooting type for {position_name}")
    for column, label in shot_kinds:
        ax.plot((frame[column] / frame['all_attempt']) * 100, label=label)
    ax.set_ylabel("%")
    ax.set_yticks([0, 20, 40, 60, 80])
    ax.legend(loc='upper left')

axes[2].set_xticks(center.index)
fig.suptitle("Type of shooting By Postion", x=0.513, y=0.95)

Then we analysed the 

In [None]:
center.columns

In [None]:
# Hit rate (in percent) for shots, layups and dunks, one panel per position.
fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
position_frames = (('Center', center), ('Forward', f), ('Guard', g))
rate_columns = (('shot%', 'Shot'), ('layup%', 'Layup'), ('dunk%', 'Dunk'))
for ax, (position_name, frame) in zip(axes, position_frames):
    ax.set_title(f"Hit Rate For {position_name}")
    for column, label in rate_columns:
        ax.plot(frame[column] * 100, label=label)
    ax.set_ylabel("%")
    ax.set_yticks([0, 20, 40, 60, 80])
    ax.legend(loc='upper left')

axes[2].set_xticks(center.index)
fig.suptitle("Hit rate of shooting By Type and Postion", x=0.513, y=0.95)

We split all field goal attempts into three categories, Shot, Layup and Dunk, and we plot the percentage of each shooting types for each position in order to see what's the difference between each position's way of playing. 

In [None]:
# Share (in percent) of field goal attempts that are 2-pointers vs 3-pointers,
# one panel per position.
fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
position_frames = (('Center', center), ('Forward', f), ('Guard', g))
for ax, (position_name, frame) in zip(axes, position_frames):
    ax.set_title(f"Shooting type for {position_name}")
    for column in ('2PT', '3PT'):
        ax.plot((frame[column] / frame['all_attempt']) * 100, label=column)
    # The plotted values are percentages of all attempts, not raw attempt
    # counts, so the axis is labelled "%" like the other share plots.
    ax.set_ylabel("%")
    ax.set_yticks([0, 20, 40, 60, 80])
    ax.legend(loc='upper left')

axes[2].set_xticks(center.index)
fig.suptitle("Type of shooting By Postion", x=0.513, y=0.95)

In [None]:
# 2-point and 3-point hit rate (in percent), one panel per position.
fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
position_frames = (('Center', center), ('Forward', f), ('Guard', g))
for ax, (position_name, frame) in zip(axes, position_frames):
    ax.set_title(f"Hit Rate For {position_name}")
    for label in ('2PT', '3PT'):
        ax.plot(frame[label + '%'] * 100, label=label)
    ax.set_ylabel("%")
    ax.set_yticks([0, 20, 40, 60])
    ax.legend(loc='upper left')

axes[2].set_xticks(center.index)
fig.suptitle("Hit Rate By Postion", x=0.513, y=0.95)

In [None]:
center

In [None]:
f

In [None]:
g

The plot on the left shows the distribution of shooting types for each position. We can see that for all positions there is a trend of players making fewer 2-point attempts and more 3-point attempts. The plot on the right shows the hit rate of the different shooting types for each position. We can see that the hit rates for forwards and guards are fairly stable, but the 3-point hit rate for centers has increased a lot.

In [None]:
all_shootings[(all_shootings['Throw']==1)&(all_shootings['Shot']==1)]

# Game by Game analysis

In [None]:
# Add per-category indicator columns to `df`:
#   "<category>_make" / "<category>_miss": 1 when the row is a make / miss of
#       that category, else 0;
#   "<category>_shot_distance": copy of 'shot_distance', created only for the
#       categories that carry a distance (Shot, Fadeaway).
suffixes = ['make', 'miss', 'shot_distance']
# Build the extended category list locally instead of appending to the shared
# `score_type` list, which grew by three entries every time this cell ran.
# dict.fromkeys drops duplicates while keeping order.
score_categories = list(dict.fromkeys(list(score_type) + ['2PT', '3PT', 'FT']))
for score in score_categories:
    for suffix in suffixes:
        col = f"{score}_{suffix}"
        if suffix in ('make', 'miss'):
            df[col] = ((df[suffix] == 1) & (df[score] == 1)).astype(int)
        elif suffix == 'shot_distance' and score in ('Shot', 'Fadeaway'):
            df[col] = df[suffix]

## score pattern

In [None]:
# Per-game totals. The columns are selected before summing so the text
# columns of `df` (descriptions, names, ...) are never passed to sum().
game_columns = ['make', 'miss', 'Shot', 'Throw', 'Layup', 'Dunk', 'Fadeaway', 'Finger Roll',
                'Shot_make', 'Shot_miss', 'Throw_make', 'Throw_miss', 'Layup_make', 'Layup_miss',
                'Dunk_make', 'Dunk_miss', 'Fadeaway_make', 'Fadeaway_miss',
                'Finger Roll_make', 'Finger Roll_miss',
                'Shot_shot_distance', 'Fadeaway_shot_distance',
                '2PT', '3PT', 'FT', '2PT_make', '2PT_miss', '3PT_make', '3PT_miss',
                'FT_make', 'FT_miss']
score_game = df.groupby('GAME_ID')[game_columns].sum().reset_index()

In [None]:
def calc_wma(ser, wd_size, weights=1):
    """Trailing weighted moving average of `ser`.

    NOTE(review): this notebook defines `calc_wma` twice; this later
    definition is the one in effect for the cells below.

    Parameters
    ----------
    ser : pandas.Series
        Values to smooth (accessed positionally through ``.iloc``).
    wd_size : int
        Window length. The first ``wd_size - 1`` points use a shorter window
        made of the values available so far.
    weights : scalar or sequence, default 1
        A scalar gives every point in the window the same weight. A sequence
        of length `wd_size` weights the window oldest-to-newest; shortened
        windows use its trailing entries.

    Returns
    -------
    numpy.ndarray
        One average per element of `ser`.
    """
    # Accept any scalar (int, float, numpy scalar), not only a Python int.
    if np.isscalar(weights):
        weights = np.full(wd_size, weights, dtype=float)
    else:
        weights = np.asarray(weights, dtype=float)

    wma = []
    for i in range(len(ser)):
        low, high = max(0, i - wd_size + 1), i + 1
        wma.append(np.average(ser.iloc[low: high], weights=weights[-(high - low):]))
    return np.array(wma)

In [None]:
def draw_game_pattern(data, window_size, pattern, y_lim_inf = 0, y_lim_sup = 0, log_ret = False):
    """Plot a per-game series together with its rolling mean.

    Parameters
    ----------
    data : pandas.Series
        One value per game, in plotting order.
    window_size : int
        Trailing window length (in games) passed to `calc_wma`.
    pattern : str
        Name of the plotted quantity, used in the title.
    y_lim_inf, y_lim_sup : number, default 0
        Y-axis limits for the level plot. They are applied only when both are
        non-zero; the defaults leave matplotlib's autoscaling in place.
    log_ret : bool, default False
        When True, a second panel shows the log return of `data` with its
        rolling mean and rolling standard deviation.

    Notes
    -----
    The limit check used to read ``y_lim_inf != 0 & y_lim_sup != 0``. Because
    ``&`` binds tighter than ``!=`` this is the chained comparison
    ``y_lim_inf != 0 != 0``, which is always False, so limits were never
    applied. They are now honoured, and set on the level axes rather than via
    ``plt.ylim`` (which acts on whichever axes is current).
    """
    def _plot_levels(ax, xlabel):
        # Original series plus its rolling mean on `ax`.
        rolling_mean = calc_wma(data, window_size)
        ax.set_xlabel(xlabel)
        ax.plot(data, label="Original")
        ax.plot(pd.Series(rolling_mean, index=data.index), label="Rolling Mean")
        if y_lim_inf != 0 and y_lim_sup != 0:
            ax.set_ylim(y_lim_inf, y_lim_sup)
        ax.set_title(f"{pattern} by match\n" + f"Rolling Stats with Window Size = {window_size} Matches")
        ax.legend()

    if not log_ret:
        fig, ax = plt.subplots(figsize=(10, 6))
        _plot_levels(ax, "Match")
        return

    fig, axes = plt.subplots(2, 1, figsize=(10, 12), sharex=True)
    _plot_levels(axes[0], "match")

    # Log return between consecutive games; the first element has no
    # predecessor and is dropped.
    log_returns = np.log(data).diff().dropna()
    first_mmt = calc_wma(log_returns, window_size)
    second_mmt = calc_wma(log_returns ** 2, window_size)
    # Clamp at zero so rounding error cannot feed a negative value to sqrt.
    rolling_std = np.sqrt(np.maximum(second_mmt - first_mmt ** 2, 0.0))
    axes[1].plot(log_returns, label="Log Return")
    axes[1].plot(pd.Series(first_mmt, index=log_returns.index), label="Rolling Mean")
    axes[1].plot(pd.Series(rolling_std, index=log_returns.index), label="Rolling Std")

    axes[1].set_xlabel("Match")
    axes[1].set_title(f"Pattern by match\n" + f"Rolling Stats with Window Size = {window_size} Matchs")
    axes[1].legend()

In [None]:
draw_game_pattern(score_game['2PT']+score_game['3PT'], 400, "Total Attempts", 150, 350)

In [None]:
# Field goal percentage per game. The series is a ratio of makes to attempts
# (between 0 and 1), so y-limits of (30, 60) can never frame it; leave the
# axis autoscaled.
draw_game_pattern((score_game['2PT_make'] + score_game['3PT_make'])/(score_game['2PT']+score_game['3PT']), 400, "Shooting Percentage")

In [None]:
draw_game_pattern(score_game['Layup']+score_game['Finger Roll'], 400, "Layup& Finger Roll Attempt", 30, 60)

In [None]:
draw_game_pattern((score_game['Layup_make'] + score_game['Finger Roll_make'])/(score_game['Layup']+score_game['Finger Roll']), 400, "Layup& Finger Roll Percentage", 0, 0)

In [None]:
# Layups and finger rolls as a proportion of the non-free-throw events per
# game. A proportion does not fit y-limits of (30, 60); leave the axis
# autoscaled.
draw_game_pattern((score_game['Layup'] + score_game['Finger Roll'])/(score_game['make'] + score_game['miss'] - score_game['FT']), 400, "Layup Scale")

In [None]:
# NOTE(review): confirm (30, 60) suits the per-game 2-point attempt counts.
draw_game_pattern(score_game['2PT'], 400, "2 Points Shooting Attempt", 30, 60)
draw_game_pattern(score_game['2PT_make']/score_game['2PT'], 400, "2 Points Shooting Percentage", 0, 0)
# Share of field goal attempts that are 2-pointers: a ratio between 0 and 1,
# so the (30, 60) limits are dropped and the axis is autoscaled.
draw_game_pattern(score_game['2PT']/(score_game['2PT'] + score_game['3PT']), 400, "2 Points Shooting Scale")

In [None]:
# NOTE(review): confirm (30, 60) suits the per-game 3-point attempt counts.
draw_game_pattern(score_game['3PT'], 400, "3 Points Shooting Attempt", 30, 60)
draw_game_pattern(score_game['3PT_make']/score_game['3PT'], 400, "3 Points Shooting Percentage", 0, 0)
# Share of field goal attempts that are 3-pointers: a ratio between 0 and 1,
# so the (30, 60) limits are dropped and the axis is autoscaled.
draw_game_pattern(score_game['3PT']/(score_game['2PT'] + score_game['3PT']), 400, "3 Points Shooting Scale")

## Shooting distance by game

In [None]:
distance = df[['GAME_ID', 'make', 'miss', 'player_name', 'Shot', 'Fadeaway', 'Shot_shot_distance', 'Fadeaway_shot_distance', '2PT', '3PT', '2PT_make', '2PT_miss', '3PT_make', '3PT_miss', 'position']].copy()
distance = distance[(distance['Shot']==1) | (distance['Fadeaway']==1)]

In [None]:
distance['Shot_distance'] = distance['Shot_shot_distance']
distance['make_distance'] = distance['Shot_distance'] * distance['make']
distance['miss_distance'] = distance['Shot_distance'] * distance['miss']
distance['2PT_make_distance'] = distance['Shot_distance'] * distance['make'] * distance['2PT']
distance['2PT_miss_distance'] = distance['Shot_distance'] * distance['miss'] * distance['2PT']
distance['3PT_make_distance'] = distance['Shot_distance'] * distance['make'] * distance['3PT']
distance['3PT_miss_distance'] = distance['Shot_distance'] * distance['miss'] * distance['3PT']

In [None]:
# remove those error statistics about the shot distance
# Remove rows whose recorded shot distance is erroneous.
# NOTE(review): these are hard-coded index labels of the concatenated frame;
# they only identify the intended rows if the season files were loaded in the
# same order as when the labels were looked up.
# errors='ignore' with reassignment keeps the cell re-runnable (the in-place
# drops raised KeyError on a second run).
distance = distance.drop(index=[547745, 569570, 525706], errors='ignore')


In [None]:
# Mean distance per game for each make/miss category, ignoring rows where the
# category distance is 0. The tables are inner-joined on GAME_ID, so only
# games present in every category remain. The overall mean 'Shot_distance'
# is joined last.
distance_game = None
for column in ('make_distance', 'miss_distance',
               '2PT_make_distance', '2PT_miss_distance',
               '3PT_make_distance', '3PT_miss_distance'):
    filtered_df = distance[distance[column] != 0]
    per_game = filtered_df.groupby('GAME_ID')[column].mean().reset_index()
    distance_game = per_game if distance_game is None else distance_game.merge(per_game)
distance_game = distance_game.merge(distance.groupby('GAME_ID')['Shot_distance'].mean().reset_index())

In [None]:
# Per-game mean shot distance with its rolling mean.
# NOTE(review): verify that the (30, 60) y-limits suit the scale of a
# per-game mean distance; limits outside the data range would leave the axes empty.
draw_game_pattern(distance_game['Shot_distance'], 400, "Shooting Distance", 30, 60)

In [None]:
draw_game_pattern(distance_game['make_distance'], 400, "Shooting Made Distance", 10, 30)

In [None]:
draw_game_pattern(distance_game['miss_distance'], 400, "Shooting Miss Distance", 0, 0)

In [None]:
draw_game_pattern(distance_game['2PT_make_distance'], 400, "2 Points Shooting Made Distance", 0, 0)

In [None]:
draw_game_pattern(distance_game['2PT_miss_distance'], 400, "2 Points Shooting Miss Distance", 0, 0)

In [None]:
draw_game_pattern(distance_game['3PT_make_distance'], 400, "3 Points Shooting Made Distance", 0, 0)

In [None]:
draw_game_pattern(distance_game['3PT_miss_distance'], 400, "3 Points Shooting Miss Distance", 0, 0)

## Shooting distance analysis by position in game by game data

In [None]:
# Copy of the distance table with combined positions such as "F-C" reduced to
# the first one; missing positions stay missing.
shooting_position = distance.copy()
shooting_position['position'] = shooting_position['position'].str.split('-').str[0]

In [None]:
def groupby_position(dataframe, position):
    """Per-game mean shot distances for the rows of one position.

    For each make/miss distance column, the mean is taken over that
    position's rows with a non-zero value. The result has one row per
    GAME_ID found in `dataframe` (all positions); games where the position
    has no qualifying rows get 0. 'Shot_distance' is the mean over all of the
    position's rows in the game.
    """
    subset = dataframe[dataframe['position'] == position]

    def _mean_nonzero(column):
        nonzero = subset[subset[column] != 0]
        return nonzero.groupby('GAME_ID')[column].mean()

    # Start from every game in the full frame so games without this position
    # still appear (filled with 0).
    all_games = dataframe['GAME_ID'].unique()
    result = _mean_nonzero('make_distance').reindex(all_games, fill_value=0).reset_index()

    for column in ('miss_distance', '2PT_make_distance', '2PT_miss_distance',
                   '3PT_make_distance', '3PT_miss_distance'):
        result = result.merge(_mean_nonzero(column).reset_index(), how="outer").fillna(0)

    overall = subset.groupby('GAME_ID')['Shot_distance'].mean().reset_index()
    return result.merge(overall, how="outer").fillna(0)


In [None]:
# Use `shooting_position`, whose 'position' column has been reduced to the
# primary position. Passing the raw `distance` frame compared "C" / "F" / "G"
# against combined labels such as "F-C", silently leaving those players out.
center_distance = groupby_position(shooting_position, "C")
forward_distance = groupby_position(shooting_position, "F")
guard_distance = groupby_position(shooting_position, "G")

In [None]:
draw_game_pattern(center_distance['3PT_make_distance'], 200, "Center Three points shooting distance")

In [None]:
draw_game_pattern(center_distance['make_distance'], 200, "Center Shooting Distance")

In [None]:
draw_game_pattern(forward_distance['make_distance'], 200, "Forward shooting distance")

In [None]:
draw_game_pattern(guard_distance['make_distance'], 200, "Guard shooting distance")