### Data Cleaning + Engineering

NBA Season Date Source: https://en.wikipedia.org/wiki/2017_NBA_Playoffs

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [2]:
# Bring in the data
df = pd.read_csv("game-data-revised.csv")
df.head(1)

Unnamed: 0,Team,Opponent,Team-Score,Opponent-Score,Win,Date,Time,Location,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg
0,Cleveland Cavaliers,Washington Wizards,94,84,1,10/30/12,7:00 PM,"Quicken Loans Arena, Cleveland, Ohio",36,79,0.456,7,20,0.35,15,22,0.682,18,36,54,22,7,5,20,21,0.53,0.5,0.253,0.278,46.2,66.7,58.1,61.1,8.0,8.6,18.4,106.9,95.5


### Create 'Seasons' Variable

In [3]:
# Prepare the data
df.Date = pd.to_datetime(df.Date)

In [4]:
position = len(df.columns)
df['Season'] = ""

for i in range(0, len(df)):
    # 2012-13 Season: October 30, 2012 - June 20, 2013
    if df.iloc[i, 5] >= pd.Timestamp('2012-10-30 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2013-6-20 00:00:00'):
        df.iloc[i, position] = '2012-13'
    
    # 2013-14 Season: October 29, 2013 - June 15, 2014
    elif df.iloc[i, 5] >= pd.Timestamp('2013-10-29 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2014-6-15 00:00:00'):
        df.iloc[i, position] = '2013-14'
    
    # 2014-15 Season: October 29, 2014 - June 16, 2015
    elif df.iloc[i, 5] >= pd.Timestamp('2014-10-28 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2015-6-16 00:00:00'):
        df.iloc[i, position] = '2014-15'
    
    # 2015-16 Season: October 27, 2015 - June 19, 2016
    elif df.iloc[i, 5] >= pd.Timestamp('2015-10-27 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2016-6-19 00:00:00'):
        df.iloc[i, position] = '2015-16'
    
    # 2016-17 Season: October 25, 2016 - June 12, 2017
    elif df.iloc[i, 5] >= pd.Timestamp('2016-10-25 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2017-6-12 00:00:00'):
        df.iloc[i, position] = '2016-17'
    else: 
        df.iloc[i, position] = None

In [5]:
df['Season'].value_counts()

2013-14    2624
2016-17    2592
2015-16    2500
2012-13    2500
2014-15    2498
Name: Season, dtype: int64

In [6]:
df.head(1)

Unnamed: 0,Team,Opponent,Team-Score,Opponent-Score,Win,Date,Time,Location,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Season
0,Cleveland Cavaliers,Washington Wizards,94,84,1,2012-10-30,7:00 PM,"Quicken Loans Arena, Cleveland, Ohio",36,79,0.456,7,20,0.35,15,22,0.682,18,36,54,22,7,5,20,21,0.53,0.5,0.253,0.278,46.2,66.7,58.1,61.1,8.0,8.6,18.4,106.9,95.5,2012-13


### Create 'Playoff' Game Dummy

In [7]:
position = len(df.columns)
df['Playoff'] = ""

for i in range(0, len(df)):
    # 2013 Playoffs: April 20 - June 20, 2013
    if df.iloc[i, 5] >= pd.Timestamp('2013-4-20 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2013-6-20 00:00:00'):
        df.iloc[i, position] = 1
    
    # 2014 Playoffs: April 19 - June 15, 2014
    elif df.iloc[i, 5] >= pd.Timestamp('2014-4-19 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2014-6-15 00:00:00'):
        df.iloc[i, position] = 1
    
    # 2015 Playoffs: April 18 - June 16, 2015
    elif df.iloc[i, 5] >= pd.Timestamp('2015-4-18 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2015-6-16 00:00:00'):
        df.iloc[i, position] = 1
    
    # 2016 Playoffs: April 16 - June 19, 2016
    elif df.iloc[i, 5] >= pd.Timestamp('2016-4-16 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2016-6-19 00:00:00'):
        df.iloc[i, position] = 1
    
    # 2017 Playoffs: April 15 - June 12, 2017
    elif df.iloc[i, 5] >= pd.Timestamp('2017-4-15 00:00:00') and df.iloc[i, 5] <= pd.Timestamp('2017-6-12 00:00:00'):
        df.iloc[i, position] = 1
    else: 
        df.iloc[i, position] = 0

In [8]:
df['Playoff'].value_counts()

0    11878
1      836
Name: Playoff, dtype: int64

### Filter by Season

In [9]:
# Regular Season (Non-Playoff) Games
df_12_13 = df[(df.Season == "2012-13") & (df.Playoff == 0)]
df_13_14 = df[(df.Season == "2013-14") & (df.Playoff == 0)]
df_14_15 = df[(df.Season == "2014-15") & (df.Playoff == 0)]
df_15_16 = df[(df.Season == "2015-16") & (df.Playoff == 0)]
df_16_17 = df[(df.Season == "2016-17") & (df.Playoff == 0)]

In [10]:
df_16_17.head(1)

Unnamed: 0,Team,Opponent,Team-Score,Opponent-Score,Win,Date,Time,Location,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,ORtg,DRtg,Season,Playoff
10122,San Antonio Spurs,Golden State Warriors,129,100,1,2016-10-25,10:30 PM,"Oracle Arena, Oakland, California",47,98,0.48,12,24,0.5,23,26,0.885,21,34,55,25,13,3,13,19,0.589,0.541,0.245,0.265,43.8,81.0,61.1,53.2,13.2,5.8,10.6,131.3,101.8,2016-17,0


### Combine & Get Averages

In [34]:
def nba_jam(dataframe, number_of_games):
    games = []
    for i in range(0, len(dataframe)):
        team_df = dataframe[(dataframe.Team == dataframe.iloc[i,0]) & (dataframe.Date < dataframe.iloc[i,5])].sort_values(by='Date', ascending = True).iloc[-number_of_games:]
        opponent_df = dataframe[(dataframe.Team == dataframe.iloc[i,1]) & (dataframe.Date < dataframe.iloc[i,5])].sort_values(by='Date', ascending = True).iloc[-number_of_games:]
        
        if len(team_df) > 0 and len(opponent_df) > 0:
            # Team Data
            attributes = []
            metrics = []
            for column in team_df.columns:
                if column == 'Team' or column == 'Opponent' or column == 'Date' or column == 'Time' or column == 'Location' or column == 'Season' or column == 'Playoff':
                    attributes.append(team_df.iloc[-1][column]) # Grabbing first row of elements?
                else:
                    metrics.append(np.mean(team_df[column]))
            team = attributes + metrics
        
            # Opponent Data
            opponent = [] 
            for column in opponent_df.columns:
                if column == 'Team' or column == 'Opponent' or column == 'Date' or column == 'Time' or column == 'Location' or column == 'Season' or column == 'Playoff' or column == 'Team-Score' or column == "Opponent-Score":
                    None
                else:
                    opponent.append(np.mean(opponent_df[column]))
        
            # Target Data
            target = []
            for column in team_df.columns:
                if column == 'Win' or column == 'Team-Score' or column == 'Opponent-Score':
                    target.append(team_df.iloc[-1][column])
        
            row = team + opponent + target

            # Combine
            games.append(row)
        
    # Column Names
    attributes_c = list(dataframe.columns[[0, 1, 5, 6, 7, 38, 39, 2, 3, 4]])
    team_c = list(dataframe.columns[8:38])
    opponent_c = ["Win"] + list(dataframe.columns[8:38])
    for i in range(0, len(team_c)):
        team_c[i] = "team_avg_" + team_c[i]
    for i in range(0, len(opponent_c)):
        opponent_c[i] = "opp_avg_" + opponent_c[i]
    column_names = attributes_c + team_c + opponent_c + ["Team-Score", "Opponent-Score", "Win"]
    column_names[7:10] = ["team_avg_score", "opp_avg_score", "team_avg_win"]
        
    # Create DataFrame
    nba_df = pd.DataFrame(games, columns = column_names)
    nba_df = nba_df.drop(['Playoff'], axis = 1)
        
    return nba_df

### Export Data

In [35]:
# AVERAGE Regular Season (Non-Playoff) Games
number_of_games = 3
df_12_13_avg = nba_jam(df_12_13, number_of_games)
df_13_14_avg = nba_jam(df_13_14, number_of_games)
#df_14_15_avg = nba_jam(df_14_15, number_of_games)
#df_15_16_avg = nba_jam(df_15_16, number_of_games)
#df_16_17_avg = nba_jam(df_16_17, number_of_games)

In [36]:
# Write to csv files
df_12_13_avg.to_csv("df_12_13_avg.csv", index = False)
df_13_14_avg.to_csv("df_13_14_avg.csv", index = False)
#df_14_15_avg.to_csv("df_14_15_avg.csv", index = False)
#df_15_16_avg.to_csv("df_15_16_avg.csv", index = False)
#df_16_17_avg.to_csv("df_16_17_avg.csv", index = False)

### Data Modeling

In [53]:
# Core Packages
import seaborn as sns
import matplotlib.pyplot as plt
import os 
%matplotlib inline

# ML Packages
from sklearn.linear_model import SGDRegressor, ElasticNetCV, LogisticRegression
from sklearn.metrics import mean_squared_error, make_scorer, f1_score, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split, learning_curve, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [101]:
train = pd.read_csv("df_12_13_avg.csv")
test = pd.read_csv("df_13_14_avg.csv")

train = train.drop(['Team', 'Opponent', 'Team-Score', 'Location', 'Opponent-Score', 'Date', 'Time', 'Season'], axis = 1)
test = test.drop(['Team', 'Opponent', 'Team-Score', 'Location', 'Opponent-Score', 'Date', 'Time', 'Season'], axis = 1)

target_train = train['Win']
target_test = test['Win']
train = train.drop(['Win'], axis = 1)
test = test.drop(['Win'], axis = 1)

In [102]:
# 4. Scale the data
scaler = StandardScaler()

scaler.fit(train)
scaled_train = scaler.transform(train)

scaler.fit(test)
scaled_test = scaler.transform(test)

In [103]:
len(target_train) == len(scaled_train)

True

In [104]:
len(target_test) == len(scaled_test)

True

In [69]:
print("Train Dimensions:", scaled_train.shape, "\nTest Dimensions:", scaled_test.shape)

Train Dimensions: (2290, 64) 
Test Dimensions: (2412, 64)


In [70]:
X_train, X_test, y_train, y_test = train_test_split(scaled_train, target, test_size=0.2, random_state=42)

In [71]:
def evaluation_metrics(y_train, y_test, train_predictions, test_predictions):
    rounding = 2
    print('Accuracy:')
    print("  Train: {}".format(round(accuracy_score(y_train, train_predictions),rounding)), "  Test: {}".format(round(accuracy_score(y_test, test_predictions),rounding)))

    print('Precision:')
    print("  Train: {}".format(round(precision_recall_fscore_support(y_train, train_predictions)[0][0],rounding)), "  Test: {}".format(round(precision_recall_fscore_support(y_test, test_predictions)[0][0],rounding)))

    print('Recall:')
    print("  Train: {}".format(round(precision_recall_fscore_support(y_train, train_predictions)[1][0],rounding)), "  Test: {}".format(round(precision_recall_fscore_support(y_test, test_predictions)[1][0],rounding)))

    print('F1:')
    print("  Train: {}".format(round(precision_recall_fscore_support(y_train, train_predictions)[2][0],rounding)), "  Test: {}".format(round(precision_recall_fscore_support(y_test, test_predictions)[2][0],rounding)))

In [72]:
param_dist = {"n_estimators": [500], "max_depth": [3]}
model = RandomForestClassifier()
rf = GridSearchCV(model, param_grid = param_dist, scoring='f1')
rf.fit(X_train, y_train)
rf_train_predictions = rf.predict(X_train)
rf_test_predictions = rf.predict(X_test)
print("Best Params: {}".format(rf.best_params_))

Best Params: {'max_depth': 3, 'n_estimators': 500}


In [73]:
evaluation_metrics(y_train, y_test, rf_train_predictions, rf_test_predictions)

Accuracy:
  Train: 0.78   Test: 0.79
Precision:
  Train: 0.78   Test: 0.79
Recall:
  Train: 0.78   Test: 0.78
F1:
  Train: 0.78   Test: 0.78


In [105]:
accuracy_score(target_train, rf.predict(scaled_train))

0.7829694323144105

In [106]:
accuracy_score(target_test, rf.predict(scaled_test))

0.7599502487562189

In [96]:
len(target_test)

2290

In [93]:
len(scaled_test)

2412

In [77]:
len(scaled_test)

2412