In [2]:
import torch
import numpy as np
import pandas as pd

## Data Cleaning and Exploration

In [108]:
##Features to feed into NN
## Every name must match a column that exists after process_df has run:
## process_df keeps only the columns listed here and silently ignores names it
## cannot find, so a misspelt entry just drops that feature.
feature_list = [
        'AwayScore', 'HomeScore', 'SecLeft', 'Quarter',
       'hour_sin', 'hour_cos',
       'playoff', 'regular', 'AwayPlayOneHot', 'HomePlayOneHot', 'AwayAssist',
       'HomeAssist', 'Home2-pt dunk', 'Away2-pt dunk', 'Home2-pt hook shot',
       'Away2-pt hook shot', 'Home2-pt jump shot', 'Away2-pt jump shot',
       'Home2-pt layup', 'Away2-pt layup', 'Home3-pt jump shot',
       'Away3-pt jump shot', 'HomeMade2-pt dunk', 'AwayMade2-pt dunk',
       'HomeMade2-pt hook shot', 'AwayMade2-pt hook shot',
       'HomeMade2-pt jump shot', 'AwayMade2-pt jump shot',
       'HomeMade2-pt layup', 'AwayMade2-pt layup', 'HomeMade3-pt jump shot',
       'AwayMade3-pt jump shot', 'AwayDRB', 'HomeDRB', 'AwayORB', 'HomeORB',
       'AwayFoul', 'HomeFoul', 'AwayBlock', 'HomeBlock', 'AwayViolation',
       'HomeViolation', 'AwayTurnover', 'HomeTurnover', 'AwayTimeout',
       'HomeTimeout', 'AwayFreeThrow', 'HomeFreeThrow', 'AwayMadeFreeThrow',
       'HomeMadeFreeThrow', 'AwayWonJumpball', 'HomeWonJumpball', 'AwaySub',
       'HomeSub'
]
def process_df(path, features=None):
    """Read a play-by-play CSV and build the numeric feature table for the NN.

    Parameters
    ----------
    path : str or path-like
        CSV file handed to ``pd.read_csv``. It must provide every column read
        below ('Time', 'GameType', 'AwayPlay', 'HomePlay', 'ShotType', ...).
    features : list of str, optional
        Names of the columns to keep in the result. Defaults to the
        module-level ``feature_list``.

    Returns
    -------
    pandas.DataFrame
        One row per CSV row, restricted to the columns named in ``features``.
        Columns keep the order in which they appear in / were added to the
        frame (not the order of ``features``); names in ``features`` that do
        not exist after processing are silently ignored.

    Notes
    -----
    Prints the raw column index of the CSV as a side effect.
    """
    if features is None:
        features = feature_list

    df = pd.read_csv(path,  parse_dates=['Time'])

    print(df.columns)

    ##Encoding time cyclicly. The period is 24 hours: dividing by 23 would map
    ##hour 23 onto exactly the same point as hour 0.
    df['hour'] = df["Time"].dt.hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24.0)

    ##Encoding game type (dtype=int keeps 0/1 integers on every pandas
    ##version; pandas >= 2.0 would otherwise produce bool columns)
    game_type_one_hot = pd.get_dummies(df['GameType'], dtype=int)
    df = df.join(game_type_one_hot)

    ##In general it seems better if we make a play than if they make one so let's encode AwayPlay vs HomePlay
    ##0 if NaN and 1 if has some value
    df['AwayPlayOneHot'] = df['AwayPlay'].notnull().astype(int)
    df['HomePlayOneHot'] = df['HomePlay'].notnull().astype(int)

    ## 1 if assist for certain team 0 otherwise
    df['AwayAssist'] = df['Assister'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeAssist'] = df['Assister'].notnull().astype(int) * df['HomePlayOneHot']

    ##Split the shots into features for home and away team
    shot_type_oh = pd.get_dummies(df['ShotType'], dtype=int)
    for col in shot_type_oh.columns:
        df["Home" + col] = shot_type_oh[col] * df['HomePlayOneHot']
        df["Away" + col] = shot_type_oh[col] * df['AwayPlayOneHot']

    ## 1 where the shot went in. A direct comparison (instead of indexing a
    ## one-hot frame with ["make"]) also works for files without any made shot.
    shot_made = (df['ShotOutcome'] == 'make').astype(int)
    for col in shot_type_oh.columns:
        df["HomeMade" + col] = df["Home" + col] * shot_made
        df["AwayMade" + col] = df["Away" + col] * shot_made

    ## Same as assists above but for RBS  - also differentiate DRB and ORB
    ## (comparisons again avoid a KeyError when one rebound type is absent)
    has_rebounder = df['Rebounder'].notnull().astype(int)
    drbs = has_rebounder * (df['ReboundType'] == 'defensive').astype(int)
    orbs = has_rebounder * (df['ReboundType'] == 'offensive').astype(int)
    df['AwayDRB'] = drbs * df['AwayPlayOneHot']
    df['HomeDRB'] = drbs * df['HomePlayOneHot']

    df['AwayORB'] = orbs * df['AwayPlayOneHot']
    df['HomeORB'] = orbs * df['HomePlayOneHot']


    ## away foul is 1 if the row has a Fouler and an AwayPlay entry, 0 otherwise
    df['AwayFoul'] = df['Fouler'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeFoul'] = df['Fouler'].notnull().astype(int) * df['HomePlayOneHot']

    ## same as above for blocks
    df['AwayBlock'] = df['Blocker'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeBlock'] = df['Blocker'].notnull().astype(int) * df['HomePlayOneHot']

    ## same as above for violation
    df["AwayViolation"] = df["ViolationPlayer"].notnull().astype(int) * df["AwayPlayOneHot"]
    df["HomeViolation"] = df["ViolationPlayer"].notnull().astype(int) * df["HomePlayOneHot"]

    ## same as above for turnover
    df["AwayTurnover"] = df["TurnoverPlayer"].notnull().astype(int) * df["AwayPlayOneHot"]
    df["HomeTurnover"] = df["TurnoverPlayer"].notnull().astype(int) * df["HomePlayOneHot"]

    ## same as above for timeout
    df["AwayTimeout"] = df["TimeoutTeam"].notnull().astype(int) * df["AwayPlayOneHot"]
    df["HomeTimeout"] = df["TimeoutTeam"].notnull().astype(int) * df["HomePlayOneHot"]

    ## same as shot for free throw
    df['AwayFreeThrow'] = df['FreeThrowShooter'].notnull().astype(int) * df["AwayPlayOneHot"]
    df['HomeFreeThrow'] = df['FreeThrowShooter'].notnull().astype(int) * df["HomePlayOneHot"]

    ft_made = (df['FreeThrowOutcome'] == 'make').astype(int)
    df['AwayMadeFreeThrow'] = df['AwayFreeThrow'] * ft_made
    df['HomeMadeFreeThrow'] = df['HomeFreeThrow'] * ft_made

    ## same for jumpball
    df['AwayWonJumpball'] = df['JumpballPoss'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeWonJumpball'] = df['JumpballPoss'].notnull().astype(int) * df['HomePlayOneHot']

    ## Same for substitution
    df['AwaySub'] = df['EnterGame'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeSub'] = df['EnterGame'].notnull().astype(int) * df['HomePlayOneHot']

    ##Normalize other columns that we are keeping
    df['SecLeft'] = df['SecLeft'] / 720
    df['Quarter'] = df['Quarter'] / 4             ##max quarters is 6 for 2OT but mean should be around 1

    ##Temp using mean 100 std deviation 15 for final scores - somewhat close to real averages esp across older data
    df['AwayScore'] = (df['AwayScore'] - 100) / 15
    df['HomeScore'] = (df['HomeScore'] - 100) / 15

    ## Keep only the requested features. `columns=` replaces the positional
    ## axis argument, which pandas 2.0 no longer accepts.
    return df.drop(columns=df.columns.difference(features))
## Build the feature table from NBA_PBP_2019-20.csv (relative path, so the file
## must sit in the notebook's working directory). process_df also prints the raw
## CSV column index, which is the output shown below this cell.
df = process_df("NBA_PBP_2019-20.csv")

Index(['URL', 'GameType', 'Location', 'Date', 'Time', 'WinningTeam', 'Quarter',
       'SecLeft', 'AwayTeam', 'AwayPlay', 'AwayScore', 'HomeTeam', 'HomePlay',
       'HomeScore', 'Shooter', 'ShotType', 'ShotOutcome', 'ShotDist',
       'Assister', 'Blocker', 'FoulType', 'Fouler', 'Fouled', 'Rebounder',
       'ReboundType', 'ViolationPlayer', 'ViolationType', 'TimeoutTeam',
       'FreeThrowShooter', 'FreeThrowOutcome', 'FreeThrowNum', 'EnterGame',
       'LeaveGame', 'TurnoverPlayer', 'TurnoverType', 'TurnoverCause',
       'TurnoverCauser', 'JumpballAwayPlayer', 'JumpballHomePlayer',
       'JumpballPoss', 'Unnamed: 40'],
      dtype='object')


In [103]:
## One-hot view of the Quarter column (inspection only; the result is not stored).
## NOTE(review): process_df divides Quarter by 4, yet the saved output below is
## labelled 1-6 and this cell's execution count (103) precedes the cell that
## builds df (108) - presumably the output comes from an earlier, unscaled df.
## TODO: confirm by re-running after Restart Kernel -> Run All.
pd.get_dummies(df['Quarter'])

Unnamed: 0,1,2,3,4,5,6
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
...,...,...,...,...,...,...
539260,0,0,0,1,0,0
539261,0,0,0,1,0,0
539262,0,0,0,1,0,0
539263,0,0,0,1,0,0
