In [2]:
import torch
import numpy as np
import pandas as pd
import requests
import json


## Data Cleaning and Exploration

In [34]:
##Features to feed into NN
feature_list = [
    'AwayScore', 'HomeScore','GameTimeLeft', 'HomeFinalScore', 'AwayFinalScore', 'HomeWin',
    
    'hour_sin', 'hour_cos', 'playoff', 'regular', 'AwayPlayOneHot', 'HomePlayOneHot', 'AwayAssist', 'HomeAssist',
    'Home2-pt dunk', 'Away2-pt dunk', 'Home2-pt hook shot', 'Away2-pt hook shot', 'Home2-pt jump shot', 
    'Away2-pt jump shot', 'Home2-pt layup', 'Away2-pt layup', 'Home3-pt jump shot', 'Away3-pt jump shot', 
    'HomeMade2-pt dunk', 'AwayMade2-pt dunk', 'HomeMade2-pt hook shot', 'AwayMade2-pt hook shot', 'HomeMade2-pt jump shot', 
    'AwayMade2-pt jump shot', 'HomeMade2-pt layup', 'AwayMade2-pt layup', 'HomeMade3-pt jump shot', 'AwayMade3-pt jump shot',
    'AwayDRB', 'HomeDRB', 'AwayORB', 'HomeORB', 'AwayFoul', 'HomeFoul', 'AwayBlock', 'HomeBlock', 'AwayViolation', 
    'HomeViolation', 'AwayTurnover', 'HomeTurnover', 'AwayTimeout', 'HomeTimeout', 'AwayFreeThrow', 'HomeFreeThrow', 
    'AwayMadeFreeThrow', 'HomeMadeFreeThrow', 'AwayWonJumpball', 'HomeWonJumpball', 'AwaySub', 'HomeSub', 'GameTimeLeft',
    'AwayAssistTotal', 'HomeAssistTotal', 'Home2-pt dunkTotal', 'Away2-pt dunkTotal', 'Home2-pt hook shotTotal', 
    'Away2-pt hook shotTotal', 'Home2-pt jump shotTotal', 'Away2-pt jump shotTotal', 'Home2-pt layupTotal',
    'Away2-pt layupTotal', 'Home3-pt jump shotTotal', 'Away3-pt jump shotTotal', 'HomeMade2-pt dunkTotal', 
    'AwayMade2-pt dunkTotal', 'HomeMade2-pt hook shotTotal', 'AwayMade2-pt hook shotTotal', 'HomeMade2-pt jump shotTotal',
    'AwayMade2-pt jump shotTotal', 'HomeMade2-pt layupTotal', 'AwayMade2-pt layupTotal', 'HomeMade3-pt jump shotTotal', 
    'AwayMade3-pt jump shotTotal', 'AwayDRBTotal', 'HomeDRBTotal', 'AwayORBTotal', 'HomeORBTotal', 'AwayFoulTotal', 
    'HomeFoulTotal', 'AwayBlockTotal', 'HomeBlockTotal', 'AwayViolationTotal', 'HomeViolationTotal', 'AwayTurnoverTotal',
    'HomeTurnoverTotal', 'AwayTimeoutTotal', 'HomeTimeoutTotal', 'AwayFreeThrowTotal', 'HomeFreeThrowTotal',
    'AwayMadeFreeThrowTotal','HomeMadeFreeThrowTotal', 'AwayWonJumpballTotal', 'HomeWonJumpballTotal', 'AwaySubTotal', 
    'HomeSubTotal','Rk_x', 'Age_x', 'W_x', 'L_x', 'PW_x', 'PL_x', 'MOV_x', 
    'SOS_x', 'SRS_x', 'ORtg_x', 'DRtg_x', 'NRtg_x', 'Pace_x', 'FTr_x', '3PAr_x', 'TS%_x', 'eFG%_x', 'TOV%_x', 
    'ORB%_x', 'FT/FGA_x', 'eFG%.1_x', 'TOV%.1_x', 'DRB%_x', 'FT/FGA.1_x', 'Rk_y', 'Age_y', 'W_y', 'L_y', 
    'PW_y', 'PL_y', 'MOV_y', 'SOS_y', 'SRS_y', 'ORtg_y', 'DRtg_y', 'NRtg_y', 'Pace_y', 'FTr_y', '3PAr_y', 'TS%_y', 
    'eFG%_y', 'TOV%_y', 'ORB%_y', 'FT/FGA_y', 'eFG%.1_y', 'TOV%.1_y', 'DRB%_y', 'FT/FGA.1_y'
]
def process_df(df):
#     df = pd.read_csv(path,  parse_dates=['Date','Time'])
    print('Finished reading in CSV')
#     print(df.columns)
    cols = ['Date', 'URL', 'Location', 'Time', 'WinningTeam', 'Quarter',
       'SecLeft', 'AwayPlay', 'HomePlay','AwayTeam', 'AwayScore', 'HomeTeam',
       'HomeScore', 'ShotType', 'ShotOutcome', 'FoulType', 'Fouler', 'Fouled',
       'FreeThrowShooter', 'FreeThrowOutcome', 'FreeThrowNum', 'TurnoverPlayer', 'TurnoverType', 'TurnoverCause',
       'TurnoverCauser', 'Assister', 'ReboundType', 'Rebounder']
    
    df = df[cols]
    ##Encoding time cyclicly 
    df['hour'] = df["Time"].dt.hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/23.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/23.0)

#     ##Encoding game type
#     game_type_one_hot = pd.get_dummies(df['GameType'])
#     df = df.join(game_type_one_hot)

    ##In general it seems better if we make a play than if they make one so let's encode AwayPlay vs HomePlay
    ##0 if NaN and 1 if has some value
    df['AwayPlayOneHot'] = df['AwayPlay'].notnull().astype(int)
    df['HomePlayOneHot'] = df['HomePlay'].notnull().astype(int)

#     ## 1 if assist for certain team 0 otherwise
    df['AwayAssist'] = df['Assister'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeAssist'] = df['Assister'].notnull().astype(int) * df['HomePlayOneHot']

#     ##Split the shots into features for home and away team
#     shot_type_oh = pd.get_dummies(df['ShotType'])
#     for col in shot_type_oh.columns:
#         df["Home" + col] = shot_type_oh[col] * df['HomePlayOneHot']
#         df["Away" + col] = shot_type_oh[col] * df['AwayPlayOneHot']
    
    df['2-pt shot'] = df['ShotType'].str.contains('2-pt').fillna(False).astype(int)
    df['3-pt shot'] = df['ShotType'].str.contains('3-pt').fillna(False).astype(int)
    
    for col in ['2-pt shot', '3-pt shot']:
        df["Home " + col] = df[col] * df['HomePlayOneHot']
        df["Away " + col] = df[col] * df['AwayPlayOneHot']
    
    shot_made_oh = pd.get_dummies(df['ShotOutcome'])
    for col in ['2-pt shot', '3-pt shot']:
        df["HomeMade " + col] = df["Home " + col] * shot_made_oh["make"]
        df["AwayMade " + col] = df["Away " + col] * shot_made_oh["make"]

#     ## Same as assists above but for RBS  - also differentiate DRB and ORB
    rbs_type_oh = pd.get_dummies(df['ReboundType'])
    drbs = df['Rebounder'].notnull().astype(int) * rbs_type_oh["defensive"]
    orbs = df['Rebounder'].notnull().astype(int) * rbs_type_oh["offensive"]
    df['AwayDRB'] = drbs * df['AwayPlayOneHot']
    df['HomeDRB'] = drbs * df['HomePlayOneHot']

    df['AwayORB'] = orbs * df['AwayPlayOneHot']
    df['HomeORB'] = orbs * df['HomePlayOneHot']


#     ## away foul is 1 if the away team committed a foul, and is 0 otherwise
    df['AwayFoul'] = df['Fouler'].notnull().astype(int) * df['AwayPlayOneHot']
    df['HomeFoul'] = df['Fouler'].notnull().astype(int) * df['HomePlayOneHot']

#     ## same as above for blocks
#     df['AwayBlock'] = df['Blocker'].notnull().astype(int) * df['AwayPlayOneHot']
#     df['HomeBlock'] = df['Blocker'].notnull().astype(int) * df['HomePlayOneHot']

#     ## same as above for violation
#     df["AwayViolation"] = df["ViolationPlayer"].notnull().astype(int) * df["AwayPlayOneHot"]
#     df["HomeViolation"] = df["ViolationPlayer"].notnull().astype(int) * df["HomePlayOneHot"]

    ## same as above for turnover
    df["AwayTurnover"] = df["TurnoverPlayer"].notnull().astype(int) * df["AwayPlayOneHot"]
    df["HomeTurnover"] = df["TurnoverPlayer"].notnull().astype(int) * df["HomePlayOneHot"]

#     ## same as above for timeout
#     df["AwayTimeout"] = df["TimeoutTeam"].notnull().astype(int) * df["AwayPlayOneHot"]
#     df["HomeTimeout"] = df["TimeoutTeam"].notnull().astype(int) * df["HomePlayOneHot"]

    ## same as shot for free throw
    df['AwayFreeThrow'] = df['FreeThrowShooter'].notnull().astype(int) * df["AwayPlayOneHot"]
    df['HomeFreeThrow'] = df['FreeThrowShooter'].notnull().astype(int) * df["HomePlayOneHot"]

    ft_made_oh = pd.get_dummies(df['FreeThrowOutcome'])
    df['AwayMadeFreeThrow'] = df['AwayFreeThrow'] * ft_made_oh['make']
    df['HomeMadeFreeThrow'] = df['HomeFreeThrow'] * ft_made_oh['make']
    
#     ## same for jumpball
#     df['AwayWonJumpball'] = df['JumpballPoss'].notnull().astype(int) * df['AwayPlayOneHot'][1:].append(pd.Series([0]))
#     df['HomeWonJumpball'] = df['JumpballPoss'].notnull().astype(int) * df['HomePlayOneHot'][1:].append(pd.Series([0])).reset_index().drop(['index'],1)[0]
    
#     ## Same for substitution
#     df['AwaySub'] = df['EnterGame'].notnull().astype(int) * df['AwayPlayOneHot']
#     df['HomeSub'] = df['EnterGame'].notnull().astype(int) * df['HomePlayOneHot']
    
    ##Normalize other columns that we are keeping
    df['GameTimeLeft'] = (48 * 60) - df['Quarter'] * (12 * 60) + df['SecLeft'] ## doesn't account for OT
#     df['GameTimeLeft'] /= 2880
    max_time = 48*60
    df['GameTimeLeftSin'] = df['GameTimeLeft'].apply(lambda x: np.sin(2 * np.pi*x / max_time))
    df['GameTimeLeftCos'] = df['GameTimeLeft'].apply(lambda x: np.cos(2 * np.pi*x / max_time))
    
    ##Temp using mean 100 std deviation 15 for final scores - somewhat close to real averages esp across older data
    df['AwayScore'] = (df['AwayScore'] - 100) / 15
    df['HomeScore'] = (df['HomeScore'] - 100) / 15

    ##Aggregate game statistics
    df = df.sort_values(by=['Date','HomeTeam','GameTimeLeft'], ascending=[True,True,False], kind='mergesort').reset_index()
    df.drop(['index'], 1, inplace=True)
#     to_accumulate = [
#         'AwayAssist', 'HomeAssist', 'Home2-pt dunk', 'Away2-pt dunk', 'Home2-pt hook shot', 'Away2-pt hook shot', 
#         'Home2-pt jump shot', 'Away2-pt jump shot', 'Home2-pt layup', 'Away2-pt layup', 'Home3-pt jump shot', 
#         'Away3-pt jump shot', 'HomeMade2-pt dunk', 'AwayMade2-pt dunk', 'HomeMade2-pt hook shot', 
#         'AwayMade2-pt hook shot', 'HomeMade2-pt jump shot', 'AwayMade2-pt jump shot', 
#         'HomeMade2-pt layup', 'AwayMade2-pt layup', 'HomeMade3-pt jump shot', 'AwayMade3-pt jump shot', 
#         'AwayDRB', 'HomeDRB', 'AwayORB', 'HomeORB','AwayFoul', 'HomeFoul', 'AwayBlock', 'HomeBlock', 
#         'AwayViolation', 'HomeViolation', 'AwayTurnover', 'HomeTurnover', 'AwayTimeout', 
#         'HomeTimeout', 'AwayFreeThrow', 'HomeFreeThrow', 'AwayMadeFreeThrow', 
#         'HomeMadeFreeThrow', 'AwayWonJumpball', 'HomeWonJumpball', 'AwaySub', 'HomeSub'
#     ]
    to_accumulate = [
        'Home 2-pt shot', 'Away 2-pt shot', 'HomeMade 2-pt shot', 'AwayMade 2-pt shot',
        'Home 3-pt shot', 'Away 3-pt shot', 'HomeMade 3-pt shot', 'AwayMade 3-pt shot',
        'AwayTurnover', 'HomeTurnover', 'AwayFreeThrow', 'HomeFreeThrow', 'AwayMadeFreeThrow', 
        'HomeMadeFreeThrow','AwayAssist', 'HomeAssist','AwayDRB', 'HomeDRB', 'AwayORB', 'HomeORB','AwayFoul', 'HomeFoul',
    ]
    
    accumulated = []
    for col in to_accumulate:
        df[col + "Total"] = df.groupby(['URL'])[col].cumsum()
        accumulated.append(col + "Total")
    for col in accumulated:
        mean = np.mean(df.groupby(['URL'])[col].max())
        std = np.std(df.groupby(['URL'])[col].max())
        df[col] = (df[col] - mean) / std
        
    ## add ground truth labels we will try to predict
    finalHomeScores = df.groupby(['URL'])["HomeScore"].max()
    finalAwayScores = df.groupby(['URL'])["AwayScore"].max()

    sizes = df.groupby(['URL']).size()
    homeScoresCol = np.array([])
    awayScoresCol = np.array([])
    for i in range(len(sizes)):
        homeScoresCol = np.concatenate((homeScoresCol, np.full(sizes[i], finalHomeScores[i])))
        awayScoresCol = np.concatenate((awayScoresCol, np.full(sizes[i], finalAwayScores[i])))

    df['HomeFinalScore'] = homeScoresCol
    df['AwayFinalScore'] = awayScoresCol
    
    df['HomeWin'] = (df['HomeFinalScore'] > df['AwayFinalScore']).astype(int)
    
    print('Finished processsing PBP Data')
    return df

def add_team_stats(df, team_path):
    return df

    print('Adding Team Stats')
    teams_df = pd.read_csv(team_path)
    abbrevs = []
    teams = []
    with open('teams.json', 'r') as f:
        teams = json.load(f)
    for teamname in teams_df['Team']:
        if teamname[-1] == '*':
            teamname = teamname[:-1]
        for i in range(len(teams)):
            if teams[i]['teamName'] == teamname:
                abbrev = teams[i]['abbreviation']
                if abbrev == 'BKN':
                    abbrevs.append('BRK')
                elif abbrev == 'PHX':
                    abbrevs.append('PHO')
                elif abbrev == 'CHA':
                    abbrevs.append('CHO')
                else:
                    abbrevs.append(teams[i]['abbreviation'])
    teams_df['Names'] = abbrevs
    teams_df.drop(['Team','Arena','Attend.','Attend./G'],1, inplace=True)
    columns = teams_df.columns
    df = df.merge(teams_df,left_on = 'AwayTeam',right_on = 'Names',how = 'left').reset_index()
    df = df.merge(teams_df,left_on = 'HomeTeam',right_on = 'Names',how = 'left').reset_index()
    for col in columns:
        if col == "Names":
            continue
        mean = np.mean(teams_df[col])
        std = np.std(teams_df[col])
        df[col + '_x'] = (df[col + '_x'] - mean) / std
        df[col + '_y'] = (df[col + '_y'] - mean) / std
    return df

In [35]:
original_df = pd.read_csv("NBA_PBP_2019-20.csv",  parse_dates=['Date','Time'])

In [36]:
df = process_df(original_df)
df = add_team_stats(df, "team2019.csv")
# df.drop(df.columns.difference(feature_list+ ['HomeTeam','AwayTeam','Date']), 1, inplace=True)
df

Finished reading in CSV


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Finished processsing PBP Data


Unnamed: 0,Date,URL,Location,Time,WinningTeam,Quarter,SecLeft,AwayPlay,HomePlay,AwayTeam,...,HomeAssistTotal,AwayDRBTotal,HomeDRBTotal,AwayORBTotal,HomeORBTotal,AwayFoulTotal,HomeFoulTotal,HomeFinalScore,AwayFinalScore,HomeWin
0,2019-10-22,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,720,Jump ball: J. McGee vs. I. Zubac (L. James gai...,,LAL,...,-5.081362,-6.347203,-6.707985,-3.386662,-3.530054,-4.566096,-4.487027,0.800000,0.133333,1
1,2019-10-22,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,707,L. James makes 2-pt layup from 3 ft (assist by...,,LAL,...,-5.081362,-6.347203,-6.707985,-3.386662,-3.530054,-4.566096,-4.487027,0.800000,0.133333,1
2,2019-10-22,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,700,,L. Shamet misses 3-pt jump shot from 27 ft,LAL,...,-5.081362,-6.347203,-6.707985,-3.386662,-3.530054,-4.566096,-4.487027,0.800000,0.133333,1
3,2019-10-22,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,698,Defensive rebound by A. Davis,,LAL,...,-5.081362,-6.173513,-6.707985,-3.386662,-3.530054,-4.566096,-4.487027,0.800000,0.133333,1
4,2019-10-22,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,683,D. Green makes 3-pt jump shot from 25 ft (assi...,,LAL,...,-5.081362,-6.173513,-6.707985,-3.386662,-3.530054,-4.566096,-4.487027,0.800000,0.133333,1
5,2019-10-22,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,660,,P. Beverley misses 3-pt jump shot from 27 ft,LAL,...,-5.081362,-6.173513,-6.707985,-3.386662,-3.530054,-4.566096,-4.487027,0.800000,0.133333,1
6,2019-10-22,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,657,Defensive rebound by L. James,,LAL,...,-5.081362,-5.999823,-6.707985,-3.386662,-3.530054,-4.566096,-4.487027,0.800000,0.133333,1
7,2019-10-22,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,647,A. Davis misses 2-pt jump shot from 19 ft,,LAL,...,-5.081362,-5.999823,-6.707985,-3.386662,-3.530054,-4.566096,-4.487027,0.800000,0.133333,1
8,2019-10-22,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,645,,Defensive rebound by Team,LAL,...,-5.081362,-5.999823,-6.529161,-3.386662,-3.530054,-4.566096,-4.487027,0.800000,0.133333,1
9,2019-10-22,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,629,,K. Leonard misses 2-pt jump shot from 14 ft,LAL,...,-5.081362,-5.999823,-6.529161,-3.386662,-3.530054,-4.566096,-4.487027,0.800000,0.133333,1


In [37]:
cols = ['URL', 'Location', 'Time', 'WinningTeam', 'Quarter',
       'SecLeft', 'AwayTeam', 'AwayScore', 'HomeTeam',
       'HomeScore', 'ShotType', 'ShotOutcome', 'FoulType', 'Fouler', 'Fouled',
       'FreeThrowShooter', 'FreeThrowOutcome', 'FreeThrowNum', 'TurnoverPlayer', 'TurnoverType', 'TurnoverCause',
       'TurnoverCauser','Assister', 'ReboundType', 'Rebounder']
df[cols]

Unnamed: 0,URL,Location,Time,WinningTeam,Quarter,SecLeft,AwayTeam,AwayScore,HomeTeam,HomeScore,...,FreeThrowShooter,FreeThrowOutcome,FreeThrowNum,TurnoverPlayer,TurnoverType,TurnoverCause,TurnoverCauser,Assister,ReboundType,Rebounder
0,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,720,LAL,-6.666667,LAC,-6.666667,...,,,,,,,,,,
1,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,707,LAL,-6.533333,LAC,-6.666667,...,,,,,,,,A. Davis - davisan02,,
2,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,700,LAL,-6.533333,LAC,-6.666667,...,,,,,,,,,,
3,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,698,LAL,-6.533333,LAC,-6.666667,...,,,,,,,,,defensive,A. Davis - davisan02
4,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,683,LAL,-6.333333,LAC,-6.666667,...,,,,,,,,L. James - jamesle01,,
5,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,660,LAL,-6.333333,LAC,-6.666667,...,,,,,,,,,,
6,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,657,LAL,-6.333333,LAC,-6.666667,...,,,,,,,,,defensive,L. James - jamesle01
7,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,647,LAL,-6.333333,LAC,-6.666667,...,,,,,,,,,,
8,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,645,LAL,-6.333333,LAC,-6.666667,...,,,,,,,,,defensive,Team
9,/boxscores/201910220LAC.html,STAPLES Center Los Angeles California,2021-05-06 22:30:00,LAC,1,629,LAL,-6.333333,LAC,-6.666667,...,,,,,,,,,,


In [39]:
len(df.columns), df.columns

(85,
 Index(['Date', 'URL', 'Location', 'Time', 'WinningTeam', 'Quarter', 'SecLeft',
        'AwayPlay', 'HomePlay', 'AwayTeam', 'AwayScore', 'HomeTeam',
        'HomeScore', 'ShotType', 'ShotOutcome', 'FoulType', 'Fouler', 'Fouled',
        'FreeThrowShooter', 'FreeThrowOutcome', 'FreeThrowNum',
        'TurnoverPlayer', 'TurnoverType', 'TurnoverCause', 'TurnoverCauser',
        'Assister', 'ReboundType', 'Rebounder', 'hour', 'hour_sin', 'hour_cos',
        'AwayPlayOneHot', 'HomePlayOneHot', 'AwayAssist', 'HomeAssist',
        '2-pt shot', '3-pt shot', 'Home 2-pt shot', 'Away 2-pt shot',
        'Home 3-pt shot', 'Away 3-pt shot', 'HomeMade 2-pt shot',
        'AwayMade 2-pt shot', 'HomeMade 3-pt shot', 'AwayMade 3-pt shot',
        'AwayDRB', 'HomeDRB', 'AwayORB', 'HomeORB', 'AwayFoul', 'HomeFoul',
        'AwayTurnover', 'HomeTurnover', 'AwayFreeThrow', 'HomeFreeThrow',
        'AwayMadeFreeThrow', 'HomeMadeFreeThrow', 'GameTimeLeft',
        'GameTimeLeftSin', 'GameTimeLeftCos', '

In [40]:
df.to_csv("2019pbpfeatures_calvin.csv",index=False)

In [7]:
df['HomePlayOneHot'][1:].append(pd.Series([0])).reset_index().drop(['index'],1)[0]


0         0
1         1
2         0
3         0
4         1
5         0
6         0
7         1
8         1
9         1
10        1
11        0
12        1
13        0
14        1
15        1
16        0
17        1
18        0
19        0
20        0
21        0
22        0
23        1
24        1
25        0
26        0
27        0
28        1
29        1
         ..
539235    1
539236    1
539237    0
539238    1
539239    1
539240    0
539241    0
539242    0
539243    1
539244    0
539245    1
539246    0
539247    0
539248    0
539249    1
539250    0
539251    1
539252    0
539253    1
539254    0
539255    1
539256    1
539257    1
539258    1
539259    1
539260    0
539261    1
539262    0
539263    0
539264    0
Name: 0, Length: 539265, dtype: int64

In [8]:
df.isna().sum()

Date                      0
AwayTeam                  0
AwayScore                 0
HomeTeam                  0
HomeScore                 0
hour_sin                  0
hour_cos                  0
playoff                   0
regular                   0
AwayPlayOneHot            0
HomePlayOneHot            0
AwayAssist                0
HomeAssist                0
Home2-pt dunk             0
Away2-pt dunk             0
Home2-pt hook shot        0
Away2-pt hook shot        0
Home2-pt jump shot        0
Away2-pt jump shot        0
Home2-pt layup            0
Away2-pt layup            0
Home3-pt jump shot        0
Away3-pt jump shot        0
HomeMade2-pt dunk         0
AwayMade2-pt dunk         0
HomeMade2-pt hook shot    0
AwayMade2-pt hook shot    0
HomeMade2-pt jump shot    0
AwayMade2-pt jump shot    0
HomeMade2-pt layup        0
                         ..
ORB%_x                    0
FT/FGA_x                  0
eFG%.1_x                  0
TOV%.1_x                  0
DRB%_x              

In [19]:
df['ShotType'].str.contains('2-pt').fillna(False).astype(int)

0         0
1         1
2         0
3         1
4         1
5         0
6         1
7         0
8         0
9         0
10        1
11        0
12        0
13        0
14        0
15        0
16        1
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        1
29        0
         ..
539235    1
539236    0
539237    0
539238    0
539239    0
539240    0
539241    0
539242    0
539243    1
539244    1
539245    0
539246    0
539247    0
539248    0
539249    0
539250    0
539251    0
539252    0
539253    0
539254    1
539255    0
539256    0
539257    0
539258    0
539259    0
539260    1
539261    0
539262    0
539263    0
539264    0
Name: ShotType, Length: 539265, dtype: int64

In [10]:
for name in pd.get_dummies(df['AwayTeam']).columns:
    if name not in set(teams_df['Names']):
        print(name)

NameError: name 'teams_df' is not defined

In [12]:
calvin = pd.read_csv('2019pbpfeatures.csv')
calvin.head()

Unnamed: 0,Date,AwayTeam,AwayScore,HomeTeam,HomeScore,hour_sin,hour_cos,playoff,regular,AwayPlayOneHot,...,3PAr_y,TS%_y,eFG%_y,TOV%_y,ORB%_y,FT/FGA_y,eFG%.1_y,TOV%.1_y,DRB%_y,FT/FGA.1_y
0,2019-10-22,LAL,-6.666667,LAC,-6.666667,-0.269797,0.962917,0,1,1,...,-0.215859,0.896664,0.479271,-0.271315,0.653578,1.786168,-1.38392,-0.537824,0.081352,0.227588
1,2019-10-22,LAL,-6.533333,LAC,-6.666667,-0.269797,0.962917,0,1,1,...,-0.215859,0.896664,0.479271,-0.271315,0.653578,1.786168,-1.38392,-0.537824,0.081352,0.227588
2,2019-10-22,LAL,-6.533333,LAC,-6.666667,-0.269797,0.962917,0,1,0,...,-0.215859,0.896664,0.479271,-0.271315,0.653578,1.786168,-1.38392,-0.537824,0.081352,0.227588
3,2019-10-22,LAL,-6.533333,LAC,-6.666667,-0.269797,0.962917,0,1,1,...,-0.215859,0.896664,0.479271,-0.271315,0.653578,1.786168,-1.38392,-0.537824,0.081352,0.227588
4,2019-10-22,LAL,-6.333333,LAC,-6.666667,-0.269797,0.962917,0,1,1,...,-0.215859,0.896664,0.479271,-0.271315,0.653578,1.786168,-1.38392,-0.537824,0.081352,0.227588


In [29]:
df['GameTimeLeft'].min(), df['GameTimeLeft'].max()

(-0.5, 1.0)