In [102]:
# Imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Dataset and Preprocessing

Sources: 
- 2020-21 Season Game Stats: https://www.basketball-reference.com/leagues/NBA_2021_games.html

In [103]:
# Load the monthly game logs, harmonise their headers, and merge them into one frame
filenames = ['jan.csv','feb.csv','mar.csv','apr.csv','may.csv','jun.csv','jul.csv']

# dec.csv is the problematic file: two of its headers are spelled without the "/",
# so rename them to the 'Visitor/Neutral' / 'Home/Neutral' form before merging
december = pd.read_csv('gamedata/dec.csv', sep=',')
li = [december.rename(columns={"VisitorNeutral": "Visitor/Neutral", "HomeNeutral": "Home/Neutral"})]

# Read the remaining months in the order listed above
li.extend(pd.read_csv('gamedata/' + file, sep=',') for file in filenames)

# Stack all frames under a fresh 0..n-1 index, then discard every column whose
# name starts with 'Unnamed' or 'Notes'
df = pd.concat(li, axis=0, ignore_index=True)
is_unnamed = df.columns.str.contains('^Unnamed')
is_notes = df.columns.str.contains('^Notes')
df = df.loc[:, ~(is_unnamed | is_notes)]
print(df.shape)
df.head(100)

(1171, 7)


Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Attend.
0,Tue Dec 22 2020,700p,Golden State Warriors,99,Brooklyn Nets,125,0.0
1,Tue Dec 22 2020,1000p,Los Angeles Clippers,116,Los Angeles Lakers,109,0.0
2,Wed Dec 23 2020,700p,Charlotte Hornets,114,Cleveland Cavaliers,121,300.0
3,Wed Dec 23 2020,700p,New York Knicks,107,Indiana Pacers,121,0.0
4,Wed Dec 23 2020,700p,Miami Heat,107,Orlando Magic,113,3396.0
...,...,...,...,...,...,...,...
95,Mon Jan 4 2021,7:30p,Boston Celtics,126,Toronto Raptors,114,3740.0
96,Mon Jan 4 2021,8:00p,Dallas Mavericks,113,Houston Rockets,100,3070.0
97,Mon Jan 4 2021,8:00p,Detroit Pistons,115,Milwaukee Bucks,125,0.0
98,Mon Jan 4 2021,8:00p,Indiana Pacers,118,New Orleans Pelicans,116,750.0


In [104]:
# Remove rows that contain invalid values (missing or infinite entries)
# Reference: https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
print('Shape before cleaning', df.shape)

# Rebind instead of mutating in place so re-running the cell stays predictable
df = df.dropna()

# Flag rows where any cell is NaN or +/-inf. `axis` has to be passed by keyword:
# pandas 2.0 made it keyword-only, so the old `.any(1)` raises a TypeError there.
good_indices = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
df = df[good_indices]
print('Shape after cleaning', df.shape)

Shape before cleaning (1171, 7)
Shape after cleaning (1164, 7)


In [105]:
# Remove 'Start (ET)' and 'Attend.' since they are not relevant to the analysis
cols = ['Attend.', 'Start (ET)']

# Keep the removed columns around in their own frame before dropping them
df_removed = df[cols]

# Use the explicit `columns=` keyword: pandas 2.0 no longer accepts `axis` as a
# positional argument, so `df.drop(cols, 1, ...)` raises a TypeError there.
df = df.drop(columns=cols)
print('Shape after column removal', df.shape)
df.head(10)

Shape after column removal (1164, 5)


Unnamed: 0,Date,Visitor/Neutral,PTS,Home/Neutral,PTS.1
0,Tue Dec 22 2020,Golden State Warriors,99,Brooklyn Nets,125
1,Tue Dec 22 2020,Los Angeles Clippers,116,Los Angeles Lakers,109
2,Wed Dec 23 2020,Charlotte Hornets,114,Cleveland Cavaliers,121
3,Wed Dec 23 2020,New York Knicks,107,Indiana Pacers,121
4,Wed Dec 23 2020,Miami Heat,107,Orlando Magic,113
5,Wed Dec 23 2020,Washington Wizards,107,Philadelphia 76ers,113
6,Wed Dec 23 2020,New Orleans Pelicans,113,Toronto Raptors,99
7,Wed Dec 23 2020,Milwaukee Bucks,121,Boston Celtics,122
8,Wed Dec 23 2020,Atlanta Hawks,124,Chicago Bulls,104
9,Wed Dec 23 2020,San Antonio Spurs,131,Memphis Grizzlies,119


In [106]:
# Give the two score columns unambiguous names: 'PTS' holds the visitor's score
# and 'PTS.1' the home team's score
pts_column_names = {
    "PTS": "Visitor PTS",
    "PTS.1": "Home PTS",
}
df = df.rename(columns=pts_column_names)
df.head()

Unnamed: 0,Date,Visitor/Neutral,Visitor PTS,Home/Neutral,Home PTS
0,Tue Dec 22 2020,Golden State Warriors,99,Brooklyn Nets,125
1,Tue Dec 22 2020,Los Angeles Clippers,116,Los Angeles Lakers,109
2,Wed Dec 23 2020,Charlotte Hornets,114,Cleveland Cavaliers,121
3,Wed Dec 23 2020,New York Knicks,107,Indiana Pacers,121
4,Wed Dec 23 2020,Miami Heat,107,Orlando Magic,113


In [107]:
# Sanity check: count games where both teams scored the same; this should be 0
is_draw = df['Visitor PTS'] == df['Home PTS']
draw_count = int(is_draw.sum())

if draw_count > 0:
    print("Error, there is a draw!")

# Add an 'Outcome' column: 1 means the visitor won, 0 means the visitor did not win
# (i.e. the home team won; a draw would also be coded 0, hence the check above)
visitor_won = df['Visitor PTS'] > df['Home PTS']
df['Outcome'] = np.where(visitor_won, 1, 0)
df.head()

Unnamed: 0,Date,Visitor/Neutral,Visitor PTS,Home/Neutral,Home PTS,Outcome
0,Tue Dec 22 2020,Golden State Warriors,99,Brooklyn Nets,125,0
1,Tue Dec 22 2020,Los Angeles Clippers,116,Los Angeles Lakers,109,1
2,Wed Dec 23 2020,Charlotte Hornets,114,Cleveland Cavaliers,121,0
3,Wed Dec 23 2020,New York Knicks,107,Indiana Pacers,121,0
4,Wed Dec 23 2020,Miami Heat,107,Orlando Magic,113,0


# Supervised Learning

- Linear Regression Model
- Predictive Analysis
- Tuning Model Parameters
- Model Metrics