In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, date
import os

## Read in Games and Game Details datasets

In [2]:
games = pd.read_csv('nba-games/games1.csv')
game_details = pd.read_csv('nba-games/games_details.csv')
teams = pd.read_csv('nba-games/teams.csv')
teams.reset_index(drop=True)
teams.head()

Unnamed: 0,LEAGUE_ID,TEAM_ID,MIN_YEAR,MAX_YEAR,ABBREVIATION,NICKNAME,YEARFOUNDED,CITY,ARENA,ARENACAPACITY,OWNER,GENERALMANAGER,HEADCOACH,DLEAGUEAFFILIATION
0,0,1610612737,1949,2019,ATL,Hawks,1949,Atlanta,State Farm Arena,18729.0,Tony Ressler,Travis Schlenk,Lloyd Pierce,Erie Bayhawks
1,0,1610612738,1946,2019,BOS,Celtics,1946,Boston,TD Garden,18624.0,Wyc Grousbeck,Danny Ainge,Brad Stevens,Maine Red Claws
2,0,1610612740,2002,2019,NOP,Pelicans,2002,New Orleans,Smoothie King Center,,Tom Benson,Trajan Langdon,Alvin Gentry,No Affiliate
3,0,1610612741,1966,2019,CHI,Bulls,1966,Chicago,United Center,21711.0,Jerry Reinsdorf,Gar Forman,Jim Boylen,Windy City Bulls
4,0,1610612742,1980,2019,DAL,Mavericks,1980,Dallas,American Airlines Center,19200.0,Mark Cuban,Donnie Nelson,Rick Carlisle,Texas Legends


## Merge game dataframes, drop rows with NAN values in OREB, PLUS_MINUS columns

In [3]:
games_df = pd.merge(games, game_details, on='GAME_ID', how='inner') 
games_df = games_df.dropna(how='any', subset=['OREB', 'PLUS_MINUS'])

#Create "Meta-data" Dataframe for game IDs, Team IDs, Team Names, Dates 
games_meta = games[['GAME_DATE_EST', 'GAME_ID', 'HOME_TEAM_ID','VISITOR_TEAM_ID']].copy() 
games_meta['HOME_TEAM'] = ' ' 
games_meta['VISITOR_TEAM'] = ' ' 
games_meta.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,HOME_TEAM,VISITOR_TEAM
0,12/2/2019,21900293,1610612766,1610612756,,
1,12/2/2019,21900294,1610612755,1610612762,,
2,12/2/2019,21900295,1610612737,1610612744,,
3,12/2/2019,21900296,1610612763,1610612754,,
4,12/2/2019,21900297,1610612749,1610612752,,


In [4]:
## Drop string-value columns. Re-check with logistic equivalents?
games_df.drop(['GAME_DATE_EST','TEAM_ABBREVIATION','TEAM_CITY', 'PLAYER_NAME','START_POSITION', 'MIN'], axis=1, inplace=True)
games_df.head()

Unnamed: 0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,...,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS
0,21900293,1610612766,1610612756,2019,1610612766,104,0.452,0.808,0.28,27,...,0.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,23.0,11.0
1,21900293,1610612766,1610612756,2019,1610612766,104,0.452,0.808,0.28,27,...,3.0,7.0,10.0,2.0,0.0,1.0,0.0,3.0,16.0,6.0
2,21900293,1610612766,1610612756,2019,1610612766,104,0.452,0.808,0.28,27,...,1.0,5.0,6.0,4.0,0.0,2.0,1.0,2.0,12.0,3.0
3,21900293,1610612766,1610612756,2019,1610612766,104,0.452,0.808,0.28,27,...,1.0,3.0,4.0,4.0,1.0,1.0,5.0,3.0,23.0,8.0
4,21900293,1610612766,1610612756,2019,1610612766,104,0.452,0.808,0.28,27,...,0.0,1.0,1.0,13.0,4.0,1.0,0.0,1.0,9.0,6.0


In [5]:
games.drop(['TEAM_ID_home','TEAM_ID_away'], axis=1, inplace=True)
games['GAME_DATE_EST'] = (pd.to_datetime(games['GAME_DATE_EST']))
games.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2019-12-02,21900293,1610612766,1610612756,2019,104,0.452,0.808,0.28,27,53,109,0.443,0.733,0.214,30,37,0
1,2019-12-02,21900294,1610612755,1610612762,2019,103,0.407,0.696,0.5,24,47,94,0.469,0.65,0.227,23,42,1
2,2019-12-02,21900295,1610612737,1610612744,2019,104,0.461,0.933,0.276,21,38,79,0.398,0.833,0.176,17,46,1
3,2019-12-02,21900296,1610612763,1610612754,2019,104,0.427,0.947,0.263,23,38,117,0.5,0.867,0.421,31,44,0
4,2019-12-02,21900297,1610612749,1610612752,2019,132,0.558,0.625,0.457,30,57,88,0.308,0.667,0.256,13,42,1


In [6]:
home = input("Enter Home Team Abbreviation: ") 
visitor = input("Enter Visitor Team Abbreviation: ")
game_date = input ("Enter Month, Year of Prediction Request (MM/DD/YYYY)")

for i in range(len(teams)):
    if teams.iloc[i]['ABBREVIATION'] == home:
        home_id = teams.iloc[i]['TEAM_ID']
    if teams.iloc[i]['ABBREVIATION'] == visitor:
        visitor_id = teams.iloc[i]['TEAM_ID']
        
#Convert game_date to datetime format    
game_date = datetime.strptime(game_date, '%m/%d/%Y')

Enter Home Team Abbreviation: LAL
Enter Visitor Team Abbreviation: MIL
Enter Month, Year of Prediction Request (MM/DD/YYYY)11/11/2018


## Drop Rows to only teams and earlier dates specified

In [7]:
games = games[games['GAME_DATE_EST'] <=game_date]
#games = games[games['HOME_TEAM_ID'] == home_id]
games = games[games['VISITOR_TEAM_ID'] == visitor_id]
#games = games.drop['SEASON']
games = games.drop(['GAME_DATE_EST'], axis=1)
games.head()

Unnamed: 0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
1489,21800187,1610612743,1610612749,2018,114,0.461,0.7,0.474,27,39,121,0.571,0.615,0.5,27,41,0
1493,21800175,1610612746,1610612749,2018,128,0.465,0.815,0.412,26,54,126,0.448,0.696,0.348,31,52,1
1512,21800166,1610612744,1610612749,2018,111,0.483,0.773,0.357,31,38,134,0.515,0.778,0.257,33,46,0
1526,21800152,1610612757,1610612749,2018,118,0.527,0.625,0.395,23,51,103,0.435,0.875,0.381,26,42,1
1563,21800115,1610612738,1610612749,2018,117,0.446,0.846,0.436,30,42,113,0.482,0.733,0.31,20,45,1


## Assign X and y data frames

In [8]:
X = games.drop("HOME_TEAM_WINS", axis=1)
y = games["HOME_TEAM_WINS"]
print(X.shape, y.shape) 

(195, 16) (195,)


## Use SciKitLearn to Split into Train / Test datasets 

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

## SciKitLearn - Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Fitting our model with all of our features in X
model.fit(X, y)

score = model.score(X, y)
print(f"R2 Score: {score}")
model.coef_

R2 Score: 0.6855775163208304


array([ 2.95295403e-09, -3.67705493e-03, -1.74721349e-14, -1.23798283e-02,
        3.49994631e-02, -1.03050358e+00, -1.42138987e-01,  2.36930304e-01,
       -4.45635308e-03, -5.05737359e-03, -3.07395679e-02, -4.92568665e-01,
        1.11348094e-01, -1.06069543e-01,  9.92163855e-03, -2.80171975e-04])