### Data Cleaning + Engineering

NBA Season Date Source: https://en.wikipedia.org/wiki/2017_NBA_Playoffs

In [15]:
import pandas as pd
import numpy as np

In [2]:
# Bring in the data
df = pd.read_csv("simple_game_data.txt")
df = df.drop("Unnamed: 0", axis = 1)
df.head(1)

Unnamed: 0,Team,Opponent,Team-Score,Opponent-Score,Win,Date,Time,FG%,3P%,FT%,PTS,TS%,eFG%,ORB%,DRB%,BLK%,TOV%,ORtg
0,Cleveland Cavaliers,Washington Wizards,94,84,1,10/30/12,7:00 PM,0.456,0.35,0.682,94,0.53,0.5,46.2,66.7,8.6,18.4,106.9


### Create 'Seasons' Variable

In [3]:
# Prepare the data
df.Date = pd.to_datetime(df.Date)

In [4]:
df['Season'] = ""

for i in range(0, len(df)):
    # 2012-13 Season: October 30, 2012 - June 20, 2013
    if df.iloc[i, 5] >= pd.Timestamp('2012-10-30 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2013-6-20 00:00:00'):
        df.iloc[i, 18] = '2012-13'
    
    # 2013-14 Season: October 29, 2013 - June 15, 2014
    elif df.iloc[i, 5] >= pd.Timestamp('2013-10-29 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2014-6-15 00:00:00'):
        df.iloc[i, 18] = '2013-14'
    
    # 2014-15 Season: October 29, 2014 - June 16, 2015
    elif df.iloc[i, 5] >= pd.Timestamp('2014-10-28 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2015-6-16 00:00:00'):
        df.iloc[i, 18] = '2014-15'
    
    # 2015-16 Season: October 27, 2015 - June 19, 2016
    elif df.iloc[i, 5] >= pd.Timestamp('2015-10-27 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2016-6-19 00:00:00'):
        df.iloc[i, 18] = '2015-16'
    
    # 2016-17 Season: October 25, 2016 - June 12, 2017
    elif df.iloc[i, 5] >= pd.Timestamp('2016-10-25 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2017-6-12 00:00:00'):
        df.iloc[i, 18] = '2016-17'
    else: 
        df.iloc[i, 18] = None

In [5]:
df['Season'].value_counts()

2013-14    2638
2015-16    2632
2012-13    2628
2014-15    2622
2016-17    2614
Name: Season, dtype: int64

In [6]:
df.head(1)

Unnamed: 0,Team,Opponent,Team-Score,Opponent-Score,Win,Date,Time,FG%,3P%,FT%,PTS,TS%,eFG%,ORB%,DRB%,BLK%,TOV%,ORtg,Season
0,Cleveland Cavaliers,Washington Wizards,94,84,1,2012-10-30,7:00 PM,0.456,0.35,0.682,94,0.53,0.5,46.2,66.7,8.6,18.4,106.9,2012-13


### Create 'Playoff' Game Dummy

In [7]:
df['Playoff'] = ""

for i in range(0, len(df)):
    # 2013 Playoffs: April 20 - June 20, 2013
    if df.iloc[i, 5] >= pd.Timestamp('2013-4-20 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2013-6-20 00:00:00'):
        df.iloc[i, 19] = 1
    
    # 2014 Playoffs: April 19 - June 15, 2014
    elif df.iloc[i, 5] >= pd.Timestamp('2014-4-19 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2014-6-15 00:00:00'):
        df.iloc[i, 19] = 1
    
    # 2015 Playoffs: April 18 - June 16, 2015
    elif df.iloc[i, 5] >= pd.Timestamp('2015-4-18 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2015-6-16 00:00:00'):
        df.iloc[i, 19] = 1
    
    # 2016 Playoffs: April 16 - June 19, 2016
    elif df.iloc[i, 5] >= pd.Timestamp('2016-4-16 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2016-6-19 00:00:00'):
        df.iloc[i, 19] = 1
    
    # 2017 Playoffs: April 15 - June 12, 2017
    elif df.iloc[i, 5] >= pd.Timestamp('2017-4-15 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2017-6-12 00:00:00'):
        df.iloc[i, 19] = 1
    else: 
        df.iloc[i, 19] = 0

In [8]:
df['Playoff'].value_counts()

0    12298
1      836
Name: Playoff, dtype: int64

### Filter by Season

In [9]:
# Regular Season (Non-Playoff) Games
df_12_13 = df[(df.Season == "2012-13") & (df.Playoff == 0)]
df_13_14 = df[(df.Season == "2013-14") & (df.Playoff == 0)]
df_14_15 = df[(df.Season == "2014-15") & (df.Playoff == 0)]
df_15_16 = df[(df.Season == "2015-16") & (df.Playoff == 0)]
df_16_17 = df[(df.Season == "2016-17") & (df.Playoff == 0)]

In [10]:
df_16_17.head(1)

Unnamed: 0,Team,Opponent,Team-Score,Opponent-Score,Win,Date,Time,FG%,3P%,FT%,PTS,TS%,eFG%,ORB%,DRB%,BLK%,TOV%,ORtg,Season,Playoff
10520,San Antonio Spurs,Golden State Warriors,129,100,1,2016-10-25,10:30 PM,0.48,0.5,0.885,129,0.589,0.541,43.8,81.0,5.8,10.6,131.3,2016-17,0


In [33]:
# Turn into function afterwards
nba = df_12_13.sample(1000)
#num_games = 5
games = []

for i in range(0, len(nba)):
    out = nba
    
    Team = out.iloc[i, 0]
    Date = out.iloc[i, 5]
    temp = out[(out.Team == Team) & (out.Date <= Date)].sort_values(by='Date', ascending = True)[-num_games:]
    temp = temp[:-1]

    out.iloc[i, 7] = np.mean(temp['FG%'])
    out.iloc[i, 8] = np.mean(temp['3P%'])
    out.iloc[i, 9] = np.mean(temp['FT%'])
    
    out.iloc[i, 11] = np.mean(temp['TS%'])
    out.iloc[i, 12] = np.mean(temp['eFG%'])
    out.iloc[i, 13] = np.mean(temp['ORB%'])
    out.iloc[i, 14] = np.mean(temp['DRB%'])
    out.iloc[i, 15] = np.mean(temp['BLK%'])
    out.iloc[i, 16] = np.mean(temp['TOV%'])
    out.iloc[i, 17] = np.mean(temp['ORtg'])

In [36]:
def get_averages(dataframe):
    for i in range(0, len(dataframe)):
        out = dataframe
    
        Team = out.iloc[i, 0]
        Date = out.iloc[i, 5]
        temp = out[(out.Team == Team) & (out.Date <= Date)].sort_values(by='Date', ascending = True)[-num_games:]
        temp = temp[:-1]
    
        out.iloc[i, 7] = np.mean(temp['FG%'])
        out.iloc[i, 8] = np.mean(temp['3P%'])
        out.iloc[i, 9] = np.mean(temp['FT%'])
    
        out.iloc[i, 11] = np.mean(temp['TS%'])
        out.iloc[i, 12] = np.mean(temp['eFG%'])
        out.iloc[i, 13] = np.mean(temp['ORB%'])
        out.iloc[i, 14] = np.mean(temp['DRB%'])
        out.iloc[i, 15] = np.mean(temp['BLK%'])
        out.iloc[i, 16] = np.mean(temp['TOV%'])
        out.iloc[i, 17] = np.mean(temp['ORtg'])
    
    return out

In [None]:
df_12_13_avg = get_averages(df_12_13)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
