In [215]:
import torch
import numpy as np
import pandas as pd
import requests
import json


## Data Cleaning and Exploration

In [372]:
##Features to feed into NN
feature_list = [
    'AwayScore', 'HomeScore','GameTimeLeft', 'HomeFinalScore', 'AwayFinalScore', 'HomeWin',
    
    'hour_sin', 'hour_cos', 'playoff', 'regular', 'AwayPlayOneHot', 'HomePlayOneHot', 'AwayAssist', 'HomeAssist',
    'Home2-pt dunk', 'Away2-pt dunk', 'Home2-pt hook shot', 'Away2-pt hook shot', 'Home2-pt jump shot', 
    'Away2-pt jump shot', 'Home2-pt layup', 'Away2-pt layup', 'Home3-pt jump shot', 'Away3-pt jump shot', 
    'HomeMade2-pt dunk', 'AwayMade2-pt dunk', 'HomeMade2-pt hook shot', 'AwayMade2-pt hook shot', 'HomeMade2-pt jump shot', 
    'AwayMade2-pt jump shot', 'HomeMade2-pt layup', 'AwayMade2-pt layup', 'HomeMade3-pt jump shot', 'AwayMade3-pt jump shot',
    'AwayDRB', 'HomeDRB', 'AwayORB', 'HomeORB', 'AwayFoul', 'HomeFoul', 'AwayBlock', 'HomeBlock', 'AwayViolation', 
    'HomeViolation', 'AwayTurnover', 'HomeTurnover', 'AwayTimeout', 'HomeTimeout', 'AwayFreeThrow', 'HomeFreeThrow', 
    'AwayMadeFreeThrow', 'HomeMadeFreeThrow', 'AwayWonJumpball', 'HomeWonJumpball', 'AwaySub', 'HomeSub', 'GameTimeLeft',
    'AwayAssistTotal', 'HomeAssistTotal', 'Home2-pt dunkTotal', 'Away2-pt dunkTotal', 'Home2-pt hook shotTotal', 
    'Away2-pt hook shotTotal', 'Home2-pt jump shotTotal', 'Away2-pt jump shotTotal', 'Home2-pt layupTotal',
    'Away2-pt layupTotal', 'Home3-pt jump shotTotal', 'Away3-pt jump shotTotal', 'HomeMade2-pt dunkTotal', 
    'AwayMade2-pt dunkTotal', 'HomeMade2-pt hook shotTotal', 'AwayMade2-pt hook shotTotal', 'HomeMade2-pt jump shotTotal',
    'AwayMade2-pt jump shotTotal', 'HomeMade2-pt layupTotal', 'AwayMade2-pt layupTotal', 'HomeMade3-pt jump shotTotal', 
    'AwayMade3-pt jump shotTotal', 'AwayDRBTotal', 'HomeDRBTotal', 'AwayORBTotal', 'HomeORBTotal', 'AwayFoulTotal', 
    'HomeFoulTotal', 'AwayBlockTotal', 'HomeBlockTotal', 'AwayViolationTotal', 'HomeViolationTotal', 'AwayTurnoverTotal',
    'HomeTurnoverTotal', 'AwayTimeoutTotal', 'HomeTimeoutTotal', 'AwayFreeThrowTotal', 'HomeFreeThrowTotal',
    'AwayMadeFreeThrowTotal','HomeMadeFreeThrowTotal', 'AwayWonJumpballTotal', 'HomeWonJumpballTotal', 'AwaySubTotal', 
    'HomeSubTotal','Rk_x', 'Age_x', 'W_x', 'L_x', 'PW_x', 'PL_x', 'MOV_x', 
    'SOS_x', 'SRS_x', 'ORtg_x', 'DRtg_x', 'NRtg_x', 'Pace_x', 'FTr_x', '3PAr_x', 'TS%_x', 'eFG%_x', 'TOV%_x', 
    'ORB%_x', 'FT/FGA_x', 'eFG%.1_x', 'TOV%.1_x', 'DRB%_x', 'FT/FGA.1_x', 'Rk_y', 'Age_y', 'W_y', 'L_y', 
    'PW_y', 'PL_y', 'MOV_y', 'SOS_y', 'SRS_y', 'ORtg_y', 'DRtg_y', 'NRtg_y', 'Pace_y', 'FTr_y', '3PAr_y', 'TS%_y', 
    'eFG%_y', 'TOV%_y', 'ORB%_y', 'FT/FGA_y', 'eFG%.1_y', 'TOV%.1_y', 'DRB%_y', 'FT/FGA.1_y'
]
def process_df(path):
    df = pd.read_csv(path,  parse_dates=['Date','Time'])
    print('Finished reading in CSV')
    print(df.columns)
    ##Encoding time cyclicly 
    df['hour'] = df["Time"].dt.hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/23.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/23.0)

    ##Encoding game type
    game_type_one_hot = pd.get_dummies(df['GameType'])
    df = df.join(game_type_one_hot)

    ##In general it seems better if we make a play than if they make one so let's encode AwayPlay vs HomePlay
    ##0 if NaN and 1 if has some value
    df['AwayPlayOneHot'] = df['AwayPlay'].notnull().astype(int)
    df['HomePlayOneHot'] = df['HomePlay'].notnull().astype(int)

    ## 1 if assist for certain team 0 otherwise
    df['AwayAssist'] = df['Assister'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeAssist'] = df['Assister'].notnull().astype(int) * df['HomePlayOneHot']

    ##Split the shots into features for home and away team
    shot_type_oh = pd.get_dummies(df['ShotType'])
    for col in shot_type_oh.columns:
        df["Home" + col] = shot_type_oh[col] * df['HomePlayOneHot']
        df["Away" + col] = shot_type_oh[col] * df['AwayPlayOneHot']

    shot_made_oh = pd.get_dummies(df['ShotOutcome'])
    for col in shot_type_oh.columns:
        df["HomeMade" + col] = df["Home" + col] * shot_made_oh["make"]
        df["AwayMade" + col] = df["Away" + col] * shot_made_oh["make"]

    ## Same as assists above but for RBS  - also differentiate DRB and ORB
    rbs_type_oh = pd.get_dummies(df['ReboundType'])
    drbs = df['Rebounder'].notnull().astype(int) * rbs_type_oh["defensive"]
    orbs = df['Rebounder'].notnull().astype(int) * rbs_type_oh["offensive"]
    df['AwayDRB'] = drbs * df['AwayPlayOneHot']
    df['HomeDRB'] = drbs * df['HomePlayOneHot']

    df['AwayORB'] = orbs * df['AwayPlayOneHot']
    df['HomeORB'] = orbs * df['HomePlayOneHot']


    ## away foul is 1 if the away team committed a foul, and is 0 otherwise
    df['AwayFoul'] = df['Fouler'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeFoul'] = df['Fouler'].notnull().astype(int) * df['HomePlayOneHot']

    ## same as above for blocks
    df['AwayBlock'] = df['Blocker'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeBlock'] = df['Blocker'].notnull().astype(int) * df['HomePlayOneHot']

    ## same as above for violation
    df["AwayViolation"] = df["ViolationPlayer"].notnull().astype(int) * df["AwayPlayOneHot"]
    df["HomeViolation"] = df["ViolationPlayer"].notnull().astype(int) * df["HomePlayOneHot"]

    ## same as above for turnover
    df["AwayTurnover"] = df["TurnoverPlayer"].notnull().astype(int) * df["AwayPlayOneHot"]
    df["HomeTurnover"] = df["TurnoverPlayer"].notnull().astype(int) * df["HomePlayOneHot"]

    ## same as above for timeout
    df["AwayTimeout"] = df["TimeoutTeam"].notnull().astype(int) * df["AwayPlayOneHot"]
    df["HomeTimeout"] = df["TimeoutTeam"].notnull().astype(int) * df["HomePlayOneHot"]

    ## same as shot for free throw
    df['AwayFreeThrow'] = df['FreeThrowShooter'].notnull().astype(int) * df["AwayPlayOneHot"]
    df['HomeFreeThrow'] = df['FreeThrowShooter'].notnull().astype(int) * df["HomePlayOneHot"]

    ft_made_oh = pd.get_dummies(df['FreeThrowOutcome'])
    df['AwayMadeFreeThrow'] = df['AwayFreeThrow'] * ft_made_oh['make']
    df['HomeMadeFreeThrow'] = df['HomeFreeThrow'] * ft_made_oh['make']
    
    ## same for jumpball
    df['AwayWonJumpball'] = df['JumpballPoss'].notnull().astype(int) * df['AwayPlayOneHot'][1:].append(pd.Series([0]))
    df['HomeWonJumpball'] = df['JumpballPoss'].notnull().astype(int) * df['HomePlayOneHot'][1:].append(pd.Series([0])).reset_index().drop(['index'],1)[0]
    
    ## Same for substitution
    df['AwaySub'] = df['EnterGame'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeSub'] = df['EnterGame'].notnull().astype(int) * df['HomePlayOneHot']
    
    ##Normalize other columns that we are keeping
    df['GameTimeLeft'] = (48 * 60) - df['Quarter'] * (12 * 60) + df['SecLeft'] ## doesn't account for OT
    df['GameTimeLeft'] /= 2880
    
    ##Temp using mean 100 std deviation 15 for final scores - somewhat close to real averages esp across older data
    df['AwayScore'] = (df['AwayScore'] - 100) / 15
    df['HomeScore'] = (df['HomeScore'] - 100) / 15

    ##Aggregate game statistics
    df = df.sort_values(by=['Date','HomeTeam','GameTimeLeft'], ascending=[True,True,False], kind='mergesort').reset_index()
    df.drop(['index'], 1, inplace=True)
    to_accumulate = [
        'AwayAssist', 'HomeAssist', 'Home2-pt dunk', 'Away2-pt dunk', 'Home2-pt hook shot', 'Away2-pt hook shot', 
        'Home2-pt jump shot', 'Away2-pt jump shot', 'Home2-pt layup', 'Away2-pt layup', 'Home3-pt jump shot', 
        'Away3-pt jump shot', 'HomeMade2-pt dunk', 'AwayMade2-pt dunk', 'HomeMade2-pt hook shot', 
        'AwayMade2-pt hook shot', 'HomeMade2-pt jump shot', 'AwayMade2-pt jump shot', 
        'HomeMade2-pt layup', 'AwayMade2-pt layup', 'HomeMade3-pt jump shot', 'AwayMade3-pt jump shot', 
        'AwayDRB', 'HomeDRB', 'AwayORB', 'HomeORB','AwayFoul', 'HomeFoul', 'AwayBlock', 'HomeBlock', 
        'AwayViolation', 'HomeViolation', 'AwayTurnover', 'HomeTurnover', 'AwayTimeout', 
        'HomeTimeout', 'AwayFreeThrow', 'HomeFreeThrow', 'AwayMadeFreeThrow', 
        'HomeMadeFreeThrow', 'AwayWonJumpball', 'HomeWonJumpball', 'AwaySub', 'HomeSub'
    ]
    
    accumulated = []
    for col in to_accumulate:
        df[col + "Total"] = df.groupby(['URL'])[col].cumsum()
        accumulated.append(col + "Total")
    for col in accumulated:
        mean = np.mean(df.groupby(['URL'])[col].max())
        std = np.std(df.groupby(['URL'])[col].max())
        df[col] = (df[col] - mean) / std
        
    ## add ground truth labels we will try to predict
    finalHomeScores = df.groupby(['URL'])["HomeScore"].max()
    finalAwayScores = df.groupby(['URL'])["AwayScore"].max()

    sizes = df.groupby(['URL']).size()
    homeScoresCol = np.array([])
    awayScoresCol = np.array([])
    for i in range(len(sizes)):
        homeScoresCol = np.concatenate((homeScoresCol, np.full(sizes[i], finalHomeScores[i])))
        awayScoresCol = np.concatenate((awayScoresCol, np.full(sizes[i], finalAwayScores[i])))

    df['HomeFinalScore'] = homeScoresCol
    df['AwayFinalScore'] = awayScoresCol
    
    df['HomeWin'] = (df['HomeFinalScore'] > df['AwayFinalScore']).astype(int)
    
    print('Finished processsing PBP Data')
    return df

def add_team_stats(df, team_path):
    print('Adding Team Stats')
    teams_df = pd.read_csv(team_path)
    abbrevs = []
    teams = []
    with open('teams.json', 'r') as f:
        teams = json.load(f)
    for teamname in teams_df['Team']:
        if teamname[-1] == '*':
            teamname = teamname[:-1]
        for i in range(len(teams)):
            if teams[i]['teamName'] == teamname:
                abbrev = teams[i]['abbreviation']
                if abbrev == 'BKN':
                    abbrevs.append('BRK')
                elif abbrev == 'PHX':
                    abbrevs.append('PHO')
                elif abbrev == 'CHA':
                    abbrevs.append('CHO')
                else:
                    abbrevs.append(teams[i]['abbreviation'])
    teams_df['Names'] = abbrevs
    teams_df.drop(['Team','Arena','Attend.','Attend./G'],1, inplace=True)
    columns = teams_df.columns
    df = df.merge(teams_df,left_on = 'AwayTeam',right_on = 'Names',how = 'left').reset_index()
    df = df.merge(teams_df,left_on = 'HomeTeam',right_on = 'Names',how = 'left').reset_index()
    for col in columns:
        if col == "Names":
            continue
        mean = np.mean(teams_df[col])
        std = np.std(teams_df[col])
        df[col + '_x'] = (df[col + '_x'] - mean) / std
        df[col + '_y'] = (df[col + '_y'] - mean) / std
    return df

In [373]:
df = process_df("NBA_PBP_2019-20.csv")
df = add_team_stats(df, "team2019.csv")
df.drop(df.columns.difference(feature_list+ ['HomeTeam','AwayTeam','Date']), 1, inplace=True)

Finished reading in CSV
Index(['URL', 'GameType', 'Location', 'Date', 'Time', 'WinningTeam', 'Quarter',
       'SecLeft', 'AwayTeam', 'AwayPlay', 'AwayScore', 'HomeTeam', 'HomePlay',
       'HomeScore', 'Shooter', 'ShotType', 'ShotOutcome', 'ShotDist',
       'Assister', 'Blocker', 'FoulType', 'Fouler', 'Fouled', 'Rebounder',
       'ReboundType', 'ViolationPlayer', 'ViolationType', 'TimeoutTeam',
       'FreeThrowShooter', 'FreeThrowOutcome', 'FreeThrowNum', 'EnterGame',
       'LeaveGame', 'TurnoverPlayer', 'TurnoverType', 'TurnoverCause',
       'TurnoverCauser', 'JumpballAwayPlayer', 'JumpballHomePlayer',
       'JumpballPoss', 'Unnamed: 40'],
      dtype='object')
Finished processsing PBP Data
Adding Team Stats


In [376]:
df.to_csv("2019pbpfeatures.csv",index=False)

In [371]:
df['HomePlayOneHot'][1:].append(pd.Series([0])).reset_index().drop(['index'],1)[0]


0         0
1         1
2         0
3         0
4         1
         ..
539260    0
539261    1
539262    0
539263    0
539264    0
Name: 0, Length: 539265, dtype: int64

In [375]:
df.isna().sum()

Date          0
AwayTeam      0
AwayScore     0
HomeTeam      0
HomeScore     0
             ..
FT/FGA_y      0
eFG%.1_y      0
TOV%.1_y      0
DRB%_y        0
FT/FGA.1_y    0
Length: 151, dtype: int64

In [377]:
df

Unnamed: 0,Date,AwayTeam,AwayScore,HomeTeam,HomeScore,hour_sin,hour_cos,playoff,regular,AwayPlayOneHot,...,3PAr_y,TS%_y,eFG%_y,TOV%_y,ORB%_y,FT/FGA_y,eFG%.1_y,TOV%.1_y,DRB%_y,FT/FGA.1_y
0,2019-10-22,LAL,-6.666667,LAC,-6.666667,-0.269797,0.962917,0,1,1,...,-0.215859,0.896664,0.479271,-0.271315,0.653578,1.786168,-1.383920,-0.537824,0.081352,0.227588
1,2019-10-22,LAL,-6.533333,LAC,-6.666667,-0.269797,0.962917,0,1,1,...,-0.215859,0.896664,0.479271,-0.271315,0.653578,1.786168,-1.383920,-0.537824,0.081352,0.227588
2,2019-10-22,LAL,-6.533333,LAC,-6.666667,-0.269797,0.962917,0,1,0,...,-0.215859,0.896664,0.479271,-0.271315,0.653578,1.786168,-1.383920,-0.537824,0.081352,0.227588
3,2019-10-22,LAL,-6.533333,LAC,-6.666667,-0.269797,0.962917,0,1,1,...,-0.215859,0.896664,0.479271,-0.271315,0.653578,1.786168,-1.383920,-0.537824,0.081352,0.227588
4,2019-10-22,LAL,-6.333333,LAC,-6.666667,-0.269797,0.962917,0,1,1,...,-0.215859,0.896664,0.479271,-0.271315,0.653578,1.786168,-1.383920,-0.537824,0.081352,0.227588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539260,2020-10-11,LAL,0.200000,MIA,-0.666667,-0.887885,0.460065,1,0,0,...,0.851311,1.600851,1.341957,0.805965,-1.396862,1.841928,-0.379938,-0.190841,1.240617,0.566550
539261,2020-10-11,LAL,0.400000,MIA,-0.666667,-0.887885,0.460065,1,0,1,...,0.851311,1.600851,1.341957,0.805965,-1.396862,1.841928,-0.379938,-0.190841,1.240617,0.566550
539262,2020-10-11,LAL,0.400000,MIA,-0.466667,-0.887885,0.460065,1,0,0,...,0.851311,1.600851,1.341957,0.805965,-1.396862,1.841928,-0.379938,-0.190841,1.240617,0.566550
539263,2020-10-11,LAL,0.400000,MIA,-0.466667,-0.887885,0.460065,1,0,1,...,0.851311,1.600851,1.341957,0.805965,-1.396862,1.841928,-0.379938,-0.190841,1.240617,0.566550


In [333]:
for name in pd.get_dummies(df['AwayTeam']).columns:
    if name not in set(teams_df['Names']):
        print(name)

BRK
CHO
PHO
