In [4]:
import datetime
import pandas as pd

In [5]:
df = pd.read_csv("local/games.csv", parse_dates=["GAME_DATE_EST"], dtype={
    "HOME_TEAM_ID": str,
    "VISITOR_TEAM_ID": str,
    "GAME_ID": str,
    "SEASON": str,
    "HOME_TEAM_WINS": str
})

# Drop redundant columns and entries with null columns
df = df.drop(["GAME_STATUS_TEXT", "TEAM_ID_home", "TEAM_ID_away"], axis=1)
df = df.dropna()

# Sort by ascending date
df = df.sort_values("GAME_DATE_EST")

# Set game date as index
# df = df.set_index("GAME_DATE_EST", drop=False)

df.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
19288,2003-10-05,10300001,1610612762,1610612742,2003,90.0,0.457,0.735,0.143,23.0,41.0,85.0,0.447,0.5,0.25,20.0,38.0,1
19287,2003-10-06,10300002,1610612763,1610612749,2003,105.0,0.494,0.618,0.267,25.0,48.0,94.0,0.427,0.7,0.154,20.0,43.0,1
19280,2003-10-07,10300010,1610612764,1610612752,2003,104.0,0.506,0.677,0.455,26.0,45.0,86.0,0.38,0.852,0.188,19.0,37.0,1
19286,2003-10-07,10300009,1610612758,1610612746,2003,101.0,0.467,0.871,0.444,19.0,39.0,82.0,0.368,0.609,0.364,13.0,50.0,1
19285,2003-10-07,10300005,1610612757,1610612745,2003,104.0,0.527,0.657,0.429,22.0,33.0,80.0,0.47,0.667,0.333,10.0,37.0,1


In [6]:
# # Filter only columns that may be used during prediction
# df_training = df[["GAME_DATE_EST", "HOME_TEAM_ID", "VISITOR_TEAM_ID", "SEASON", "HOME_TEAM_WINS"]]

df_aug = df.copy()
df_aug[[
    "prev_PTS_home",
    "prev_FG_PCT_home",
    "prev_FT_PCT_home",
    "prev_FG3_PCT_home",
    "prev_AST_home",
    "prev_REB_home"
]] = df.groupby(["HOME_TEAM_ID"]).shift(2)[[
    "PTS_home",
    "FG_PCT_home",
    "FT_PCT_home",
    "FG3_PCT_home",
    "AST_home",
    "REB_home"
]]
df_aug[[
    "prev_PTS_away",
    "prev_FG_PCT_away",
    "prev_FT_PCT_away",
    "prev_FG3_PCT_away",
    "prev_AST_away",
    "prev_REB_away"
]] = df.groupby(["VISITOR_TEAM_ID"]).shift(2)[[
    "PTS_away",
    "FG_PCT_away",
    "FT_PCT_away",
    "FG3_PCT_away",
    "AST_away",
    "REB_away"
]]
df_aug = df_aug.dropna()
df_aug = df_aug[[
    "HOME_TEAM_ID",
    "VISITOR_TEAM_ID",
    "SEASON",
    "prev_PTS_home",
    "prev_FG_PCT_home",
    "prev_FT_PCT_home",
    "prev_FG3_PCT_home",
    "prev_AST_home",
    "prev_REB_home",
    "HOME_TEAM_WINS"
]]
df_aug.head()

Unnamed: 0,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,prev_PTS_home,prev_FG_PCT_home,prev_FT_PCT_home,prev_FG3_PCT_home,prev_AST_home,prev_REB_home,HOME_TEAM_WINS
19153,1610612761,1610612764,2003,76.0,0.383,0.75,0.2,17.0,39.0,1
19129,1610612742,1610612748,2003,99.0,0.488,0.667,0.167,20.0,52.0,1
19131,1610612757,1610612755,2003,104.0,0.527,0.657,0.429,22.0,33.0,0
19130,1610612741,1610612745,2003,74.0,0.317,0.613,0.231,16.0,47.0,0
19133,1610612753,1610612752,2003,98.0,0.41,0.848,0.316,18.0,36.0,0


In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [13]:
le = LabelEncoder()
df = df_aug.copy()
df[[
    "HOME_TEAM_ID",
    "VISITOR_TEAM_ID",
    "SEASON"
]] = df[[
    "HOME_TEAM_ID",
    "VISITOR_TEAM_ID",
    "SEASON"
]].apply(le.fit_transform)
df["HOME_TEAM_WINS"] = df["HOME_TEAM_WINS"].astype(dtype=int)

In [14]:
y = df["HOME_TEAM_WINS"]
X = df.drop("HOME_TEAM_WINS", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)

In [15]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
y_rf_class = rf_classifier.predict(X_test)
rf_class_accuracy = accuracy_score(y_rf_class, y_test)

In [16]:
rf_class_accuracy

0.5896128423040604

In [17]:
df_red = df[[
    "HOME_TEAM_ID",
    "VISITOR_TEAM_ID",
    "SEASON",
    "HOME_TEAM_WINS"
]]
y = df_red["HOME_TEAM_WINS"]
X = df_red.drop("HOME_TEAM_WINS", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)

In [18]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
y_rf_class = rf_classifier.predict(X_test)
rf_class_accuracy = accuracy_score(y_rf_class, y_test)

In [19]:
rf_class_accuracy

0.5913125590179414