In [284]:
# Import dependencies
import pandas as pd 
import os

import sqlalchemy as db
from sqlalchemy import create_engine

from config import db_password
import psycopg2 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### Processing for SQL Database

In [285]:
# Locations of the raw input CSVs, relative to the notebook's working directory
RESOURCES_DIR = "Resources"
games_data_to_load = os.path.join(RESOURCES_DIR, "games.csv")
game_details_data_to_load = os.path.join(RESOURCES_DIR, "game_details.csv")

In [286]:
# Load the raw games CSV into a Pandas DataFrame and preview the first five rows
games_df = pd.read_csv(filepath_or_buffer=games_data_to_load)
games_df.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,3/1/2020,21900895,Final,1610612766,1610612749,2019,1610612766,85.0,0.354,0.9,...,22.0,47.0,1610612749,93.0,0.402,0.762,0.226,20.0,61.0,0
1,3/1/2020,21900896,Final,1610612750,1610612742,2019,1610612750,91.0,0.364,0.4,...,19.0,57.0,1610612742,111.0,0.468,0.632,0.275,28.0,56.0,0
2,3/1/2020,21900897,Final,1610612746,1610612755,2019,1610612746,136.0,0.592,0.805,...,25.0,37.0,1610612755,130.0,0.505,0.65,0.488,27.0,37.0,1
3,3/1/2020,21900898,Final,1610612743,1610612761,2019,1610612743,133.0,0.566,0.7,...,38.0,41.0,1610612761,118.0,0.461,0.897,0.263,24.0,36.0,1
4,3/1/2020,21900899,Final,1610612758,1610612765,2019,1610612758,106.0,0.407,0.885,...,18.0,51.0,1610612765,100.0,0.413,0.667,0.429,23.0,42.0,1


In [287]:
# Check data types to ensure they can be plugged into an ML model
games_df.dtypes

GAME_DATE_EST        object
GAME_ID               int64
GAME_STATUS_TEXT     object
HOME_TEAM_ID          int64
VISITOR_TEAM_ID       int64
SEASON                int64
TEAM_ID_home          int64
PTS_home            float64
FG_PCT_home         float64
FT_PCT_home         float64
FG3_PCT_home        float64
AST_home            float64
REB_home            float64
TEAM_ID_away          int64
PTS_away            float64
FG_PCT_away         float64
FT_PCT_away         float64
FG3_PCT_away        float64
AST_away            float64
REB_away            float64
HOME_TEAM_WINS        int64
dtype: object

In [288]:
# Check for NaN rows
games_df.isna().sum()

GAME_DATE_EST        0
GAME_ID              0
GAME_STATUS_TEXT     0
HOME_TEAM_ID         0
VISITOR_TEAM_ID      0
SEASON               0
TEAM_ID_home         0
PTS_home            99
FG_PCT_home         99
FT_PCT_home         99
FG3_PCT_home        99
AST_home            99
REB_home            99
TEAM_ID_away         0
PTS_away            99
FG_PCT_away         99
FT_PCT_away         99
FG3_PCT_away        99
AST_away            99
REB_away            99
HOME_TEAM_WINS       0
dtype: int64

In [289]:
# Remove every row that contains at least one NaN, then re-count NaNs per column
# to confirm that none remain
games_df = games_df.dropna(axis=0, how="any")
games_df.isna().sum()

GAME_DATE_EST       0
GAME_ID             0
GAME_STATUS_TEXT    0
HOME_TEAM_ID        0
VISITOR_TEAM_ID     0
SEASON              0
TEAM_ID_home        0
PTS_home            0
FG_PCT_home         0
FT_PCT_home         0
FG3_PCT_home        0
AST_home            0
REB_home            0
TEAM_ID_away        0
PTS_away            0
FG_PCT_away         0
FT_PCT_away         0
FG3_PCT_away        0
AST_away            0
REB_away            0
HOME_TEAM_WINS      0
dtype: int64

In [290]:
# Drop unnecessary columns: the free-text date/status fields and the team-ID
# columns that duplicate TEAM_ID_home / TEAM_ID_away
redundant_game_columns = [
    'GAME_DATE_EST',
    'GAME_STATUS_TEXT',
    'HOME_TEAM_ID',
    'VISITOR_TEAM_ID',
]
games_df = games_df.drop(columns=redundant_game_columns)

In [291]:
games_df

Unnamed: 0,GAME_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,21900895,2019,1610612766,85.0,0.354,0.900,0.229,22.0,47.0,1610612749,93.0,0.402,0.762,0.226,20.0,61.0,0
1,21900896,2019,1610612750,91.0,0.364,0.400,0.310,19.0,57.0,1610612742,111.0,0.468,0.632,0.275,28.0,56.0,0
2,21900897,2019,1610612746,136.0,0.592,0.805,0.542,25.0,37.0,1610612755,130.0,0.505,0.650,0.488,27.0,37.0,1
3,21900898,2019,1610612743,133.0,0.566,0.700,0.500,38.0,41.0,1610612761,118.0,0.461,0.897,0.263,24.0,36.0,1
4,21900899,2019,1610612758,106.0,0.407,0.885,0.257,18.0,51.0,1610612765,100.0,0.413,0.667,0.429,23.0,42.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23190,11400007,2014,1610612737,93.0,0.419,0.821,0.421,24.0,50.0,1610612740,87.0,0.366,0.643,0.375,17.0,43.0,1
23191,11400004,2014,1610612741,81.0,0.338,0.719,0.381,18.0,40.0,1610612764,85.0,0.411,0.636,0.267,17.0,47.0,0
23192,11400005,2014,1610612747,98.0,0.448,0.682,0.500,29.0,45.0,1610612743,95.0,0.387,0.659,0.500,19.0,43.0,1
23193,11400002,2014,1610612761,99.0,0.440,0.771,0.333,21.0,30.0,1610612758,94.0,0.469,0.725,0.385,18.0,45.0,1


In [292]:
# Load the raw per-player game details CSV into a Pandas DataFrame and display it
game_details_df = pd.read_csv(filepath_or_buffer=game_details_data_to_load)
game_details_df

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,START_POSITION,COMMENT,MIN,FGM,...,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS
0,21900895,1610612749,MIL,Milwaukee,202083,Wesley Matthews,F,,27:08:00,3.0,...,4.0,4.0,8.0,2.0,2.0,0.0,0.0,0.0,8.0,11.0
1,21900895,1610612749,MIL,Milwaukee,203507,Giannis Antetokounmpo,F,,34:55:00,17.0,...,2.0,18.0,20.0,6.0,1.0,0.0,3.0,2.0,41.0,22.0
2,21900895,1610612749,MIL,Milwaukee,201572,Brook Lopez,C,,26:25:00,4.0,...,2.0,5.0,7.0,0.0,0.0,3.0,0.0,2.0,16.0,16.0
3,21900895,1610612749,MIL,Milwaukee,1628978,Donte DiVincenzo,G,,27:35:00,1.0,...,1.0,6.0,7.0,5.0,0.0,1.0,2.0,0.0,2.0,14.0
4,21900895,1610612749,MIL,Milwaukee,202339,Eric Bledsoe,G,,22:17,2.0,...,1.0,0.0,1.0,2.0,1.0,0.0,3.0,2.0,4.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576777,11200005,1610612743,DEN,Denver,202706,Jordan Hamilton,,,19,4.0,...,0.0,2.0,2.0,0.0,2.0,0.0,1.0,3.0,17.0,
576778,11200005,1610612743,DEN,Denver,202702,Kenneth Faried,,,23,7.0,...,1.0,0.0,1.0,1.0,1.0,0.0,3.0,3.0,18.0,
576779,11200005,1610612743,DEN,Denver,201585,Kosta Koufos,,,15,3.0,...,3.0,5.0,8.0,0.0,1.0,0.0,0.0,3.0,6.0,
576780,11200005,1610612743,DEN,Denver,202389,Timofey Mozgov,,,19,1.0,...,1.0,2.0,3.0,1.0,0.0,0.0,4.0,2.0,2.0,


In [293]:
# Check data types to ensure they can be plugged into an ML model
game_details_df.dtypes

GAME_ID                int64
TEAM_ID                int64
TEAM_ABBREVIATION     object
TEAM_CITY             object
PLAYER_ID              int64
PLAYER_NAME           object
START_POSITION        object
COMMENT               object
MIN                   object
FGM                  float64
FGA                  float64
FG_PCT               float64
FG3M                 float64
FG3A                 float64
FG3_PCT              float64
FTM                  float64
FTA                  float64
FT_PCT               float64
OREB                 float64
DREB                 float64
REB                  float64
AST                  float64
STL                  float64
BLK                  float64
TO                   float64
PF                   float64
PTS                  float64
PLUS_MINUS           float64
dtype: object

In [294]:
# Drop unnecessary columns (text fields and stats that are not carried forward)
unused_detail_columns = [
    'TEAM_ABBREVIATION',
    'START_POSITION',
    'COMMENT',
    'MIN',
    'PLUS_MINUS',
]
game_details_df = game_details_df.drop(columns=unused_detail_columns)
game_details_df

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_CITY,PLAYER_ID,PLAYER_NAME,FGM,FGA,FG_PCT,FG3M,FG3A,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS
0,21900895,1610612749,Milwaukee,202083,Wesley Matthews,3.0,11.0,0.273,2.0,7.0,...,0.000,4.0,4.0,8.0,2.0,2.0,0.0,0.0,0.0,8.0
1,21900895,1610612749,Milwaukee,203507,Giannis Antetokounmpo,17.0,28.0,0.607,1.0,4.0,...,0.857,2.0,18.0,20.0,6.0,1.0,0.0,3.0,2.0,41.0
2,21900895,1610612749,Milwaukee,201572,Brook Lopez,4.0,11.0,0.364,1.0,5.0,...,0.778,2.0,5.0,7.0,0.0,0.0,3.0,0.0,2.0,16.0
3,21900895,1610612749,Milwaukee,1628978,Donte DiVincenzo,1.0,5.0,0.200,0.0,3.0,...,0.000,1.0,6.0,7.0,5.0,0.0,1.0,2.0,0.0,2.0
4,21900895,1610612749,Milwaukee,202339,Eric Bledsoe,2.0,8.0,0.250,0.0,1.0,...,0.000,1.0,0.0,1.0,2.0,1.0,0.0,3.0,2.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576777,11200005,1610612743,Denver,202706,Jordan Hamilton,4.0,9.0,0.444,3.0,6.0,...,0.857,0.0,2.0,2.0,0.0,2.0,0.0,1.0,3.0,17.0
576778,11200005,1610612743,Denver,202702,Kenneth Faried,7.0,11.0,0.636,0.0,0.0,...,1.000,1.0,0.0,1.0,1.0,1.0,0.0,3.0,3.0,18.0
576779,11200005,1610612743,Denver,201585,Kosta Koufos,3.0,7.0,0.429,0.0,0.0,...,0.000,3.0,5.0,8.0,0.0,1.0,0.0,0.0,3.0,6.0
576780,11200005,1610612743,Denver,202389,Timofey Mozgov,1.0,1.0,1.000,0.0,0.0,...,0.000,1.0,2.0,3.0,1.0,0.0,0.0,4.0,2.0,2.0


In [295]:
# Check for NaN rows (these will be for players that did not play for that game)
game_details_df.isna().sum()

GAME_ID            0
TEAM_ID            0
TEAM_CITY          0
PLAYER_ID          0
PLAYER_NAME        0
FGM            92261
FGA            92261
FG_PCT         92261
FG3M           92261
FG3A           92261
FG3_PCT        92261
FTM            92261
FTA            92261
FT_PCT         92261
OREB           92261
DREB           92261
REB            92261
AST            92261
STL            92261
BLK            92261
TO             92261
PF             92261
PTS            92261
dtype: int64

In [296]:
# Remove every row that contains at least one NaN, then re-count NaNs per column
# to confirm that none remain
game_details_df = game_details_df.dropna(axis=0, how="any")
game_details_df.isna().sum()

GAME_ID        0
TEAM_ID        0
TEAM_CITY      0
PLAYER_ID      0
PLAYER_NAME    0
FGM            0
FGA            0
FG_PCT         0
FG3M           0
FG3A           0
FG3_PCT        0
FTM            0
FTA            0
FT_PCT         0
OREB           0
DREB           0
REB            0
AST            0
STL            0
BLK            0
TO             0
PF             0
PTS            0
dtype: int64

In [297]:
game_details_df

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_CITY,PLAYER_ID,PLAYER_NAME,FGM,FGA,FG_PCT,FG3M,FG3A,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS
0,21900895,1610612749,Milwaukee,202083,Wesley Matthews,3.0,11.0,0.273,2.0,7.0,...,0.000,4.0,4.0,8.0,2.0,2.0,0.0,0.0,0.0,8.0
1,21900895,1610612749,Milwaukee,203507,Giannis Antetokounmpo,17.0,28.0,0.607,1.0,4.0,...,0.857,2.0,18.0,20.0,6.0,1.0,0.0,3.0,2.0,41.0
2,21900895,1610612749,Milwaukee,201572,Brook Lopez,4.0,11.0,0.364,1.0,5.0,...,0.778,2.0,5.0,7.0,0.0,0.0,3.0,0.0,2.0,16.0
3,21900895,1610612749,Milwaukee,1628978,Donte DiVincenzo,1.0,5.0,0.200,0.0,3.0,...,0.000,1.0,6.0,7.0,5.0,0.0,1.0,2.0,0.0,2.0
4,21900895,1610612749,Milwaukee,202339,Eric Bledsoe,2.0,8.0,0.250,0.0,1.0,...,0.000,1.0,0.0,1.0,2.0,1.0,0.0,3.0,2.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576777,11200005,1610612743,Denver,202706,Jordan Hamilton,4.0,9.0,0.444,3.0,6.0,...,0.857,0.0,2.0,2.0,0.0,2.0,0.0,1.0,3.0,17.0
576778,11200005,1610612743,Denver,202702,Kenneth Faried,7.0,11.0,0.636,0.0,0.0,...,1.000,1.0,0.0,1.0,1.0,1.0,0.0,3.0,3.0,18.0
576779,11200005,1610612743,Denver,201585,Kosta Koufos,3.0,7.0,0.429,0.0,0.0,...,0.000,3.0,5.0,8.0,0.0,1.0,0.0,0.0,3.0,6.0
576780,11200005,1610612743,Denver,202389,Timofey Mozgov,1.0,1.0,1.000,0.0,0.0,...,0.000,1.0,2.0,3.0,1.0,0.0,0.0,4.0,2.0,2.0


In [298]:
# Export the cleaned DataFrames to CSV files in the project's Resources folder.
# The paths are relative to the notebook's working directory (the same convention
# used for the input files), so the notebook runs on any machine rather than only
# on the original author's C:\Users\... checkout.
games_df.to_csv(os.path.join("Resources", "games_clean.csv"), index=False, header=True)
game_details_df.to_csv(os.path.join("Resources", "game_details_clean.csv"), index=False, header=True)

### Processing for R - Regressions

In [299]:
# Read the combined NBA project data (NBA_project_data.csv) into a Pandas DataFrame
# and preview the first rows
nba_project_data_path = "Resources/NBA_project_data.csv"
NBA_data_df = pd.read_csv(nba_project_data_path)
NBA_data_df.head()

Unnamed: 0,game_id,season,team_id_home,pts_home,fg_pct_home,ft_pct_home,fg3_pct_home,ast_home,reb_home,team_id_away,...,ft_pct,oreb,dreb,reb,ast,stl,blk,tos,pf,pts
0,21900895,2019,1610612766,85,0.354,0.9,0.229,22,47,1610612749,...,0.0,4,4,8,2,2,0,0,0,8
1,21900895,2019,1610612766,85,0.354,0.9,0.229,22,47,1610612749,...,0.857,2,18,20,6,1,0,3,2,41
2,21900895,2019,1610612766,85,0.354,0.9,0.229,22,47,1610612749,...,0.778,2,5,7,0,0,3,0,2,16
3,21900895,2019,1610612766,85,0.354,0.9,0.229,22,47,1610612749,...,0.0,1,6,7,5,0,1,2,0,2
4,21900895,2019,1610612766,85,0.354,0.9,0.229,22,47,1610612749,...,0.0,1,0,1,2,1,0,3,2,4


In [300]:
NBA_data_df.columns

Index(['game_id', 'season', 'team_id_home', 'pts_home', 'fg_pct_home',
       'ft_pct_home', 'fg3_pct_home', 'ast_home', 'reb_home', 'team_id_away',
       'pts_away', 'fg_pct_away', 'ft_pct_away', 'fg3_pct_away', 'ast_away',
       'reb_away', 'home_team_wins', 'team_id', 'team_city', 'player_id',
       'player_name', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm',
       'fta', 'ft_pct', 'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'tos',
       'pf', 'pts'],
      dtype='object')

In [301]:
NBA_data_df.dtypes

game_id             int64
season              int64
team_id_home        int64
pts_home            int64
fg_pct_home       float64
ft_pct_home       float64
fg3_pct_home      float64
ast_home            int64
reb_home            int64
team_id_away        int64
pts_away            int64
fg_pct_away       float64
ft_pct_away       float64
fg3_pct_away      float64
ast_away            int64
reb_away            int64
home_team_wins      int64
team_id             int64
team_city          object
player_id           int64
player_name        object
fgm                 int64
fga                 int64
fg_pct            float64
fg3m                int64
fg3a                int64
fg3_pct           float64
ftm                 int64
fta                 int64
ft_pct            float64
oreb                int64
dreb                int64
reb                 int64
ast                 int64
stl                 int64
blk                 int64
tos                 int64
pf                  int64
pts         

In [302]:
NBA_data_df.count()

game_id           484521
season            484521
team_id_home      484521
pts_home          484521
fg_pct_home       484521
ft_pct_home       484521
fg3_pct_home      484521
ast_home          484521
reb_home          484521
team_id_away      484521
pts_away          484521
fg_pct_away       484521
ft_pct_away       484521
fg3_pct_away      484521
ast_away          484521
reb_away          484521
home_team_wins    484521
team_id           484521
team_city         484521
player_id         484521
player_name       484521
fgm               484521
fga               484521
fg_pct            484521
fg3m              484521
fg3a              484521
fg3_pct           484521
ftm               484521
fta               484521
ft_pct            484521
oreb              484521
dreb              484521
reb               484521
ast               484521
stl               484521
blk               484521
tos               484521
pf                484521
pts               484521
dtype: int64

In [303]:
# Remove the identifier and name columns, which are not used in the R regression analysis
regression_excluded_columns = [
    'game_id',
    'team_id_home',
    'team_id_away',
    'team_id',
    'team_city',
    'player_id',
    'player_name',
]
NBA_data_Regres_df = NBA_data_df.drop(columns=regression_excluded_columns)
NBA_data_Regres_df.head(30)

Unnamed: 0,season,pts_home,fg_pct_home,ft_pct_home,fg3_pct_home,ast_home,reb_home,pts_away,fg_pct_away,ft_pct_away,...,ft_pct,oreb,dreb,reb,ast,stl,blk,tos,pf,pts
0,2019,85,0.354,0.9,0.229,22,47,93,0.402,0.762,...,0.0,4,4,8,2,2,0,0,0,8
1,2019,85,0.354,0.9,0.229,22,47,93,0.402,0.762,...,0.857,2,18,20,6,1,0,3,2,41
2,2019,85,0.354,0.9,0.229,22,47,93,0.402,0.762,...,0.778,2,5,7,0,0,3,0,2,16
3,2019,85,0.354,0.9,0.229,22,47,93,0.402,0.762,...,0.0,1,6,7,5,0,1,2,0,2
4,2019,85,0.354,0.9,0.229,22,47,93,0.402,0.762,...,0.0,1,0,1,2,1,0,3,2,4
5,2019,85,0.354,0.9,0.229,22,47,93,0.402,0.762,...,0.5,2,3,5,1,0,0,1,2,6
6,2019,85,0.354,0.9,0.229,22,47,93,0.402,0.762,...,0.0,1,2,3,0,0,1,2,1,2
7,2019,85,0.354,0.9,0.229,22,47,93,0.402,0.762,...,0.0,0,3,3,0,0,0,1,0,3
8,2019,85,0.354,0.9,0.229,22,47,93,0.402,0.762,...,0.0,0,2,2,2,1,1,1,1,0
9,2019,85,0.354,0.9,0.229,22,47,93,0.402,0.762,...,0.667,2,3,5,2,2,0,3,1,11


In [304]:
NBA_data_Regres_df.columns

Index(['season', 'pts_home', 'fg_pct_home', 'ft_pct_home', 'fg3_pct_home',
       'ast_home', 'reb_home', 'pts_away', 'fg_pct_away', 'ft_pct_away',
       'fg3_pct_away', 'ast_away', 'reb_away', 'home_team_wins', 'fgm', 'fga',
       'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb',
       'dreb', 'reb', 'ast', 'stl', 'blk', 'tos', 'pf', 'pts'],
      dtype='object')

In [305]:
# Export the regression-ready DataFrame for the R analysis.
# The path is relative to the notebook's working directory so the export does not
# depend on one user's absolute C:\Users\... folder.
NBA_data_Regres_df.to_csv(os.path.join("Resources", "NBA_Data_R.csv"), index=False)

### Connecting to our Database and Preparing our Data for Modelling

In [306]:
# Create the connection string for the local PostgreSQL database.
# The dialect name must be "postgresql": SQLAlchemy 1.4+ no longer accepts the
# legacy "postgres://" alias and fails to load a dialect for it.
# The password is read from config.py so it is never written into the notebook.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/NBA_Analysis_Final_Project"

In [307]:
# Create the database engine, prepare a MetaData container for reflected tables,
# and open a connection.
# NOTE(review): the connection is never closed in the visible cells — consider
# calling connection.close() once the data has been loaded.
engine = db.create_engine(db_string)
metadata = db.MetaData()
connection = engine.connect()

In [308]:
# Reflect the existing "games" table definition from the database.
# `autoload_with` alone triggers reflection; the redundant `autoload=True` flag
# is deprecated in SQLAlchemy 1.4 and rejected by 2.0.
games_data = db.Table('games', metadata, autoload_with=engine)

In [309]:
# Build a SELECT over every column of the "games" table.
# Table.select() is used instead of db.select([games_data]) because the list
# calling form was removed in SQLAlchemy 2.0; Table.select() works in 1.x and 2.x.
query = games_data.select()

In [310]:
# Run the query on the open connection and fetch every row of the "games" table
result_proxy = connection.execute(query)
results = result_proxy.fetchall()

In [311]:
# Transform the queried rows into a DataFrame.
# Column names are taken from the reflected table instead of results[0].keys():
# indexing results[0] raises IndexError when the query returns no rows, and
# Row.keys() is no longer available in SQLAlchemy 2.0. The SELECT returns the
# table's columns in definition order, so the names line up with the row values.
games_columns = list(games_data.columns.keys())
games_df = pd.DataFrame(results, columns=games_columns)

games_df.head()

Unnamed: 0,game_id,season,team_id_home,pts_home,fg_pct_home,ft_pct_home,fg3_pct_home,ast_home,reb_home,team_id_away,pts_away,fg_pct_away,ft_pct_away,fg3_pct_away,ast_away,reb_away,home_team_wins
0,21900895,2019,1610612766,85.0,0.354,0.9,0.2289999999999999,22.0,47.0,1610612749,93.0,0.402,0.762,0.226,20.0,61.0,0
1,21900896,2019,1610612750,91.0,0.364,0.4,0.31,19.0,57.0,1610612742,111.0,0.4679999999999999,0.632,0.275,28.0,56.0,0
2,21900897,2019,1610612746,136.0,0.5920000000000001,0.805,0.542,25.0,37.0,1610612755,130.0,0.505,0.65,0.488,27.0,37.0,1
3,21900898,2019,1610612743,133.0,0.5660000000000001,0.7,0.5,38.0,41.0,1610612761,118.0,0.461,0.897,0.263,24.0,36.0,1
4,21900899,2019,1610612758,106.0,0.407,0.885,0.257,18.0,51.0,1610612765,100.0,0.413,0.667,0.429,23.0,42.0,1


In [312]:
# Check data types
games_df.dtypes

game_id            object
season             object
team_id_home       object
pts_home          float64
fg_pct_home        object
ft_pct_home        object
fg3_pct_home       object
ast_home          float64
reb_home          float64
team_id_away       object
pts_away          float64
fg_pct_away        object
ft_pct_away        object
fg3_pct_away       object
ast_away          float64
reb_away          float64
home_team_wins     object
dtype: object

In [313]:
# Convert the shooting-percentage columns (object dtype after the SQL load) to floats
pct_columns = [
    'fg_pct_home', 'ft_pct_home', 'fg3_pct_home',
    'fg_pct_away', 'ft_pct_away', 'fg3_pct_away',
]
for pct_column in pct_columns:
    games_df[pct_column] = games_df[pct_column].astype(float)

In [314]:
games_df

Unnamed: 0,game_id,season,team_id_home,pts_home,fg_pct_home,ft_pct_home,fg3_pct_home,ast_home,reb_home,team_id_away,pts_away,fg_pct_away,ft_pct_away,fg3_pct_away,ast_away,reb_away,home_team_wins
0,21900895,2019,1610612766,85.0,0.354,0.900,0.229,22.0,47.0,1610612749,93.0,0.402,0.762,0.226,20.0,61.0,0
1,21900896,2019,1610612750,91.0,0.364,0.400,0.310,19.0,57.0,1610612742,111.0,0.468,0.632,0.275,28.0,56.0,0
2,21900897,2019,1610612746,136.0,0.592,0.805,0.542,25.0,37.0,1610612755,130.0,0.505,0.650,0.488,27.0,37.0,1
3,21900898,2019,1610612743,133.0,0.566,0.700,0.500,38.0,41.0,1610612761,118.0,0.461,0.897,0.263,24.0,36.0,1
4,21900899,2019,1610612758,106.0,0.407,0.885,0.257,18.0,51.0,1610612765,100.0,0.413,0.667,0.429,23.0,42.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23091,11400007,2014,1610612737,93.0,0.419,0.821,0.421,24.0,50.0,1610612740,87.0,0.366,0.643,0.375,17.0,43.0,1
23092,11400004,2014,1610612741,81.0,0.338,0.719,0.381,18.0,40.0,1610612764,85.0,0.411,0.636,0.267,17.0,47.0,0
23093,11400005,2014,1610612747,98.0,0.448,0.682,0.500,29.0,45.0,1610612743,95.0,0.387,0.659,0.500,19.0,43.0,1
23094,11400002,2014,1610612761,99.0,0.440,0.771,0.333,21.0,30.0,1610612758,94.0,0.469,0.725,0.385,18.0,45.0,1


In [315]:
# Select the columns of games_df used by the ML models: ten per-game team
# statistics (features) followed by the home_team_wins label (target)
feature_columns = [
    'fg_pct_home', 'ft_pct_home', 'fg3_pct_home', 'ast_home', 'reb_home',
    'fg_pct_away', 'ft_pct_away', 'fg3_pct_away', 'ast_away', 'reb_away',
]
model_df_columns = feature_columns + ['home_team_wins']

model_df = games_df[model_df_columns]
model_df.head()

Unnamed: 0,fg_pct_home,ft_pct_home,fg3_pct_home,ast_home,reb_home,fg_pct_away,ft_pct_away,fg3_pct_away,ast_away,reb_away,home_team_wins
0,0.354,0.9,0.229,22.0,47.0,0.402,0.762,0.226,20.0,61.0,0
1,0.364,0.4,0.31,19.0,57.0,0.468,0.632,0.275,28.0,56.0,0
2,0.592,0.805,0.542,25.0,37.0,0.505,0.65,0.488,27.0,37.0,1
3,0.566,0.7,0.5,38.0,41.0,0.461,0.897,0.263,24.0,36.0,1
4,0.407,0.885,0.257,18.0,51.0,0.413,0.667,0.429,23.0,42.0,1


In [316]:
# Separate the dataset into the feature matrix (X) and the target vector (y)
target_column = "home_team_wins"
X = model_df.drop(columns=[target_column])
y = model_df[target_column]

In [317]:
# Min-max scale every feature column
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set minima/maxima influence the scaling (mild data leakage) —
# consider fitting on the training split only.
feature_scaler = MinMaxScaler()
X = feature_scaler.fit_transform(X)
X

array([[0.23963134, 0.88331389, 0.229     , ..., 0.226     , 0.38095238,
        0.67741935],
       [0.26267281, 0.29988331, 0.31      , ..., 0.275     , 0.57142857,
        0.59677419],
       [0.78801843, 0.77246208, 0.542     , ..., 0.488     , 0.54761905,
        0.29032258],
       ...,
       [0.4562212 , 0.62893816, 0.5       , ..., 0.5       , 0.35714286,
        0.38709677],
       [0.43778802, 0.7327888 , 0.333     , ..., 0.385     , 0.33333333,
        0.41935484],
       [0.41705069, 0.62543757, 0.333     , ..., 0.438     , 0.35714286,
        0.37096774]])

In [318]:
# Split into training and testing sets. The split is stratified on y so both
# sets keep the same win/loss ratio, and random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

## SVC Model

In [319]:
# Instantiate a linear-kernel support vector classifier and fit it on the
# training split (fit returns the estimator itself, which is then displayed)
svcmodel = SVC(kernel='linear').fit(X_train, y_train)
svcmodel

SVC(kernel='linear')

In [320]:
# Create predictions on the SVC model
y_pred_svc = svcmodel.predict(X_test)

In [321]:
# Test accuracy score
accuracy_score(y_test, y_pred_svc)

0.8326983027364045

In [322]:
# Generate a confusion matrix
confusion_matrix(y_test, y_pred_svc)

array([[1827,  512],
       [ 454, 2981]], dtype=int64)

In [323]:
# Print the classification report
print(classification_report(y_test, y_pred_svc))

              precision    recall  f1-score   support

           0       0.80      0.78      0.79      2339
           1       0.85      0.87      0.86      3435

    accuracy                           0.83      5774
   macro avg       0.83      0.82      0.83      5774
weighted avg       0.83      0.83      0.83      5774



## Logistic Regression

In [324]:
# Instantiate a logistic regression classifier (lbfgs solver, at most 200
# iterations) and fit it on the training split; fit returns the estimator itself
lrmodel = LogisticRegression(
    solver='lbfgs',
    max_iter=200,
    random_state=1,
).fit(X_train, y_train)
lrmodel

LogisticRegression(max_iter=200, random_state=1)

In [325]:
# Create predictions on the logistic regression model
y_pred_lr = lrmodel.predict(X_test)

In [326]:
# Test accuracy score
accuracy_score(y_test, y_pred_lr)

0.8332178732248008

In [327]:
# Generate a confusion matrix
confusion_matrix(y_test, y_pred_lr)

array([[1819,  520],
       [ 443, 2992]], dtype=int64)

In [328]:
# Print the classification report
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.80      0.78      0.79      2339
           1       0.85      0.87      0.86      3435

    accuracy                           0.83      5774
   macro avg       0.83      0.82      0.83      5774
weighted avg       0.83      0.83      0.83      5774



## Random Forest

In [329]:
# Build a random forest of 128 trees with a fixed seed and fit it on the
# training split; fit returns the estimator itself, which is then displayed
rfmodel = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
).fit(X_train, y_train)
rfmodel

RandomForestClassifier(n_estimators=128, random_state=1)

In [330]:
# Create predictions on the random forest classifier
y_pred_rf = rfmodel.predict(X_test)

In [331]:
# Test accuracy score
accuracy_score(y_test, y_pred_rf)

0.8273294076896432

In [332]:
# Generate a confusion matrix
confusion_matrix(y_test, y_pred_rf)

array([[1809,  530],
       [ 467, 2968]], dtype=int64)

In [333]:
# Print the classification report
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      2339
           1       0.85      0.86      0.86      3435

    accuracy                           0.83      5774
   macro avg       0.82      0.82      0.82      5774
weighted avg       0.83      0.83      0.83      5774



## Easy Ensemble AdaBoost Classifier

In [334]:
# Build an Easy Ensemble classifier with 100 estimators and a fixed seed, then
# fit it on the training split; fit returns the estimator itself
eamodel = EasyEnsembleClassifier(
    n_estimators=100,
    random_state=1,
).fit(X_train, y_train)
eamodel

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [335]:
# Create predictions on the Easy Ensemble AdaBoost classifier
y_pred_ea = eamodel.predict(X_test)

In [336]:
# Test accuracy score
accuracy_score(y_test, y_pred_ea)

0.826117076550052

In [337]:
# Generate a confusion matrix
confusion_matrix(y_test, y_pred_ea)

array([[1958,  381],
       [ 623, 2812]], dtype=int64)

In [338]:
# Print the classification report
print(classification_report(y_test, y_pred_ea))

              precision    recall  f1-score   support

           0       0.76      0.84      0.80      2339
           1       0.88      0.82      0.85      3435

    accuracy                           0.83      5774
   macro avg       0.82      0.83      0.82      5774
weighted avg       0.83      0.83      0.83      5774

