### Baseline Models

Simple If-Then Models

 - Team with best record wins
 - Home team always wins
 - Home team wins unless they have losing home record
 - Home team wins unless the visitor has won at least 2 of its last 3 away games
 
ML Models

 - LightGBM
 - XGBoost
    

In [1]:
from pathlib import Path  # OS-independent paths (Windows/Linux)

import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation metrics
from sklearn.metrics import accuracy_score, roc_auc_score

# Notebook configuration: show up to 500 columns when displaying wide frames,
# and keep the data directory in one place.
pd.set_option('display.max_columns', 500)
DATAPATH = Path(r'data')


In [2]:
# Read both splits from DATAPATH in one pass, then preview the training frame.
train, test = (
    pd.read_csv(DATAPATH / csv_name) for csv_name in ("train.csv", "test.csv")
)

train.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,PLAYOFF,CONFERENCE_x,G_x,W_x,L_x,W_PCT_x,HOME_W_x,HOME_L_x,HOME_W_PCT_x,ROAD_W_x,ROAD_L_x,ROAD_W_PCT_x,CONFERENCE_y,G_y,W_y,L_y,W_PCT_y,HOME_W_y,HOME_L_y,HOME_W_PCT_y,ROAD_W_y,ROAD_L_y,ROAD_W_PCT_y,TARGET,HOME_WINS_LAST_3_HOME,PTS_home_LAST_3_HOME,AWAY_WINS_LAST_3_AWAY,PTS_away_LAST_3_AWAY
0,2003-10-29,20300006,1610612740,1610612737,2003,88.0,0.324,0.7,0.16,24.0,55.0,83.0,0.398,0.737,0.214,18.0,58.0,1,0,0,1,1,0,1.0,1,0,1.0,0,0,,0,1,0,1,0.0,0,0,,0,1,0.0,1.0,2.0,91.666667,,
1,2003-10-31,20300024,1610612741,1610612737,2003,100.0,0.4,0.759,0.5,27.0,53.0,94.0,0.4,0.714,0.583,22.0,48.0,1,0,0,2,1,1,0.5,1,1,0.5,0,0,,0,2,0,2,0.0,0,0,,0,2,0.0,0.0,1.0,88.666667,,
2,2003-11-05,20300060,1610612744,1610612737,2003,99.0,0.446,0.645,0.278,23.0,52.0,72.0,0.367,0.5,0.333,19.0,43.0,1,0,1,4,2,2,0.5,2,1,0.666667,0,1,0.0,0,5,1,4,0.2,1,1,0.5,0,3,0.0,1.0,3.0,96.666667,1.0,83.0
3,2003-11-08,20300084,1610612757,1610612737,2003,90.0,0.425,0.9,0.5,28.0,41.0,83.0,0.438,0.786,0.1,21.0,45.0,1,0,1,6,3,3,0.5,3,1,0.75,0,2,0.0,0,6,1,5,0.167,1,1,0.5,0,4,0.0,1.0,3.0,88.666667,1.0,83.0
4,2003-11-09,20300089,1610612760,1610612737,2003,81.0,0.379,0.737,0.056,12.0,46.0,91.0,0.479,0.789,0.533,16.0,41.0,0,0,1,4,3,1,0.75,2,1,0.666667,1,0,1.0,0,7,2,5,0.286,1,1,0.5,1,4,0.2,0.0,1.0,96.666667,1.0,82.0


**Model - Team with best record wins**

In [3]:
# Train split: predict a home win (1) only when W_PCT_x is strictly greater
# than W_PCT_y; equal records are scored as a visitor win (0).
true = train['TARGET']
predict = train['W_PCT_x'].gt(train['W_PCT_y']).astype('int8')

accuracy_score(true, predict), roc_auc_score(true, predict)

(0.5725690979303163, 0.5752392109878647)

In [4]:
# Test split: same rule -- home win (1) only when W_PCT_x is strictly greater
# than W_PCT_y; equal records are scored as a visitor win (0).
true = test['TARGET']
predict = test['W_PCT_x'].gt(test['W_PCT_y']).astype('int8')

accuracy_score(true, predict), roc_auc_score(true, predict)

(0.5574257425742575, 0.5574603174603174)

**Model - Home team always wins**

In [5]:
# Train split: constant prediction of 1 (home win) for every game.
# A constant predictor cannot rank games, so ROC AUC is 0.5 by construction.
true = train['TARGET']
n = len(train)
predict = pd.Series(1, index=pd.RangeIndex(n))

accuracy_score(true, predict), roc_auc_score(true, predict)

(0.5928320388770772, 0.5)

In [6]:
# Test split: constant prediction of 1 (home win) for every game.
# A constant predictor cannot rank games, so ROC AUC is 0.5 by construction.
true = test['TARGET']
n = len(test)
predict = pd.Series(1, index=pd.RangeIndex(n))

accuracy_score(true, predict), roc_auc_score(true, predict)

(0.5544554455445545, 0.5)

**Model - Home team wins unless they have a losing home record**

In [7]:
# Train split: predict a visitor win (0) only when HOME_W_PCT_x is below 0.50.
# Negating the "< 0.50" mask (rather than testing ">= 0.50") keeps rows with a
# missing home record on the home-win side, since NaN < 0.50 is False.
true = train['TARGET']
predict = (~train['HOME_W_PCT_x'].lt(0.50)).astype('int64')

accuracy_score(true, predict), roc_auc_score(true, predict)

(0.6245932225452336, 0.5856594049716664)

In [8]:
# Test split: predict a visitor win (0) only when HOME_W_PCT_x is below 0.50.
# Negating the "< 0.50" mask (rather than testing ">= 0.50") keeps rows with a
# missing home record on the home-win side, since NaN < 0.50 is False.
true = test['TARGET']
predict = (~test['HOME_W_PCT_x'].lt(0.50)).astype('int64')

accuracy_score(true, predict), roc_auc_score(true, predict)

(0.5881188118811881, 0.570952380952381)

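**Model - Home team wins unless the away team has won at least 2 of its last 3 away games**

**Caution - likely target leakage:** in the `train.head()` output above, `PTS_away_LAST_3_AWAY` on rows 2, 3 and 4 (83.0, 83.0, 82.0) equals the mean of the visitor's away points over a three-game window that *includes the game on that row* (e.g. row 2: mean of 83, 94, 72). `AWAY_WINS_LAST_3_AWAY` on those rows is likewise consistent with a rolling sum of `1 - TARGET` that includes the current row. If so, this rule partly reads the outcome it is predicting, which would explain why it scores far above the other baselines. Treat the numbers below as optimistic until the feature is lagged by one game where it is built.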

In [9]:
# Train split: predict a visitor win (0) only when AWAY_WINS_LAST_3_AWAY is
# greater than 1 (i.e. 2 or 3). Negating the "> 1" mask keeps rows where the
# feature is missing on the home-win side, since NaN > 1 is False.
true = train['TARGET']
predict = (~train['AWAY_WINS_LAST_3_AWAY'].gt(1)).astype('int64')

accuracy_score(true, predict), roc_auc_score(true, predict)

(0.7604026554432247, 0.7433715958913312)

In [10]:
# Test split: predict a visitor win (0) only when AWAY_WINS_LAST_3_AWAY is
# greater than 1 (i.e. 2 or 3). Negating the "> 1" mask keeps rows where the
# feature is missing on the home-win side, since NaN > 1 is False.
true = test['TARGET']
predict = (~test['AWAY_WINS_LAST_3_AWAY'].gt(1)).astype('int64')

accuracy_score(true, predict), roc_auc_score(true, predict)

(0.7504950495049505, 0.7440079365079365)

### ML Baseline Models

**Encoding / Smaller memory footprint**

In [11]:
def encode_data(df, float_dtype='float16'):
    """Reduce a games DataFrame's memory footprint by narrowing column dtypes.

    Steps, in order:
      1. Parse ``GAME_DATE_EST`` with ``pd.to_datetime``.
      2. Cast ``GAME_ID``, ``HOME_TEAM_ID`` and ``VISITOR_TEAM_ID`` to int32.
      3. Downcast every remaining int64 column to the smallest signed integer
         dtype that can hold all of its values.
      4. Cast every float64 column to ``float_dtype``.

    Step 3 used to cast unconditionally to int8, which silently wraps values
    outside [-128, 127]; a ``SEASON`` of 2003, for example, became -45.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``GAME_DATE_EST``, ``GAME_ID``,
        ``HOME_TEAM_ID`` and ``VISITOR_TEAM_ID``. It is modified in place.
    float_dtype : str or numpy dtype, default 'float16'
        Target dtype for float64 columns. float16 keeps roughly three
        significant decimal digits; pass 'float32' when more precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.

    Raises
    ------
    ValueError
        If one of the three ID columns holds a value that does not fit in int32.

    Notes
    -----
    Because step 3 depends on the values present, the same column can end up
    with different integer widths in different frames (e.g. train vs. test).
    """
    df['GAME_DATE_EST'] = pd.to_datetime(df['GAME_DATE_EST'])

    long_integer_fields = ['GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID']

    # IDs get a fixed int32 width; refuse to cast rather than let an
    # out-of-range ID wrap around into a different, wrong ID.
    int32_limits = np.iinfo('int32')
    for field in long_integer_fields:
        if not df[field].between(int32_limits.min, int32_limits.max).all():
            raise ValueError(f"{field} has values outside the int32 range")
        df[field] = df[field].astype('int32')

    # Remaining int64 columns: smallest integer dtype that preserves the values.
    for field in df.select_dtypes(include=['int64']).columns.tolist():
        df[field] = pd.to_numeric(df[field], downcast='integer')

    # float64 columns: narrow to the requested float width.
    for field in df.select_dtypes(include=['float64']).columns.tolist():
        df[field] = df[field].astype(float_dtype)

    return df

# Narrow dtypes on both splits (train first, then test) and rebind the names.
train, test = (encode_data(split) for split in (train, test))
