In [179]:
import datetime
import pandas as pd

In [180]:
df = pd.read_csv("local/games.csv", parse_dates=["GAME_DATE_EST"], dtype={
    "HOME_TEAM_ID": str,
    "VISITOR_TEAM_ID": str,
    "GAME_ID": str,
    "SEASON": str,
    "HOME_TEAM_WINS": str
})

# Drop redundant columns and entries with null columns
df = df.drop(["GAME_STATUS_TEXT", "TEAM_ID_home", "TEAM_ID_away"], axis=1)
df = df.dropna()

# Sort by ascending date
df = df.sort_values("GAME_DATE_EST")

# Set game date as index
# df = df.set_index("GAME_DATE_EST", drop=False)

df.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
19288,2003-10-05,10300001,1610612762,1610612742,2003,90.0,0.457,0.735,0.143,23.0,41.0,85.0,0.447,0.5,0.25,20.0,38.0,1
19287,2003-10-06,10300002,1610612763,1610612749,2003,105.0,0.494,0.618,0.267,25.0,48.0,94.0,0.427,0.7,0.154,20.0,43.0,1
19280,2003-10-07,10300010,1610612764,1610612752,2003,104.0,0.506,0.677,0.455,26.0,45.0,86.0,0.38,0.852,0.188,19.0,37.0,1
19286,2003-10-07,10300009,1610612758,1610612746,2003,101.0,0.467,0.871,0.444,19.0,39.0,82.0,0.368,0.609,0.364,13.0,50.0,1
19285,2003-10-07,10300005,1610612757,1610612745,2003,104.0,0.527,0.657,0.429,22.0,33.0,80.0,0.47,0.667,0.333,10.0,37.0,1


In [181]:
# # Filter only columns that may be used during prediction
# df_training = df[["GAME_DATE_EST", "HOME_TEAM_ID", "VISITOR_TEAM_ID", "SEASON", "HOME_TEAM_WINS"]]

df_aug = df.copy()
for i in range(1,51):
    df_aug[[
        "prev_PTS_home_" + str(i),
        "prev_FG_PCT_home_" + str(i),
        "prev_FT_PCT_home_" + str(i),
        "prev_FG3_PCT_home_" + str(i),
        "prev_AST_home_" + str(i),
        "prev_REB_home_" + str(i)
    ]] = df.groupby(["HOME_TEAM_ID"]).shift(i)[[
        "PTS_home",
        "FG_PCT_home",
        "FT_PCT_home",
        "FG3_PCT_home",
        "AST_home",
        "REB_home"
    ]]
    df_aug[[
        "prev_PTS_away_" + str(i),
        "prev_FG_PCT_away_" + str(i),
        "prev_FT_PCT_away_" + str(i),
        "prev_FG3_PCT_away_" + str(i),
        "prev_AST_away_" + str(i),
        "prev_REB_away_" + str(i)
    ]] = df.groupby(["VISITOR_TEAM_ID"]).shift(i)[[
        "PTS_away",
        "FG_PCT_away",
        "FT_PCT_away",
        "FG3_PCT_away",
        "AST_away",
        "REB_away"
    ]]
    df_aug = df_aug.copy() # prevent fragemntation
    
df_aug = df_aug.dropna()
df_aug = df_aug.drop(["GAME_ID",
                      "GAME_DATE_EST",
                      "PTS_home",
                      "FG_PCT_home",
                      "FT_PCT_home",
                      "FG3_PCT_home",
                      "AST_home",
                      "REB_home",
                      "PTS_away",
                      "FG_PCT_away",
                      "FT_PCT_away",
                      "FG3_PCT_away",
                      "AST_away",
                      "REB_away"], axis=1)
df_aug.head()

Unnamed: 0,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,HOME_TEAM_WINS,prev_PTS_home_1,prev_FG_PCT_home_1,prev_FT_PCT_home_1,prev_FG3_PCT_home_1,prev_AST_home_1,prev_REB_home_1,...,prev_FT_PCT_home_50,prev_FG3_PCT_home_50,prev_AST_home_50,prev_REB_home_50,prev_PTS_away_50,prev_FG_PCT_away_50,prev_FT_PCT_away_50,prev_FG3_PCT_away_50,prev_AST_away_50,prev_REB_away_50
17908,1610612747,1610612765,2003,0,96.0,0.449,0.472,0.3,31.0,38.0,...,0.6,0.35,32.0,46.0,104.0,0.493,0.7,0.286,22.0,40.0
17907,1610612747,1610612765,2003,1,75.0,0.397,0.778,0.231,15.0,37.0,...,0.682,0.364,28.0,46.0,93.0,0.435,0.8,0.357,15.0,40.0
17905,1610612765,1610612747,2003,1,88.0,0.408,0.7,0.333,17.0,51.0,...,0.786,0.5,23.0,50.0,103.0,0.457,0.692,0.154,24.0,52.0
17904,1610612765,1610612747,2003,1,88.0,0.426,0.683,0.154,16.0,45.0,...,0.758,0.3,25.0,41.0,113.0,0.46,0.61,0.421,25.0,48.0
17858,1610612750,1610612765,2004,1,75.0,0.421,0.471,0.375,24.0,45.0,...,0.737,0.385,22.0,40.0,91.0,0.397,0.771,0.286,9.0,47.0


In [182]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [183]:
# renamed df to df_work so df stays original dataset from beginning
df_work = df_aug.copy()
# changed the convertion to int to just use the given values
df_work["HOME_TEAM_ID"] = df_work["HOME_TEAM_ID"].astype(dtype=int)
df_work["VISITOR_TEAM_ID"] = df_work["VISITOR_TEAM_ID"].astype(dtype=int)
df_work["SEASON"] = df_work["SEASON"].astype(dtype=int)
df_work["HOME_TEAM_WINS"] = df_work["HOME_TEAM_WINS"].astype(dtype=int)
df_work.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24949 entries, 17908 to 0
Columns: 604 entries, HOME_TEAM_ID to prev_REB_away_50
dtypes: float64(600), int64(4)
memory usage: 115.2 MB


In [184]:
y = df_work["HOME_TEAM_WINS"]
X = df_work.drop("HOME_TEAM_WINS", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)

In [185]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
y_rf_class = rf_classifier.predict(X_test)
rf_class_accuracy = accuracy_score(y_rf_class, y_test)

In [186]:
rf_class_accuracy

0.6008016032064128

In [187]:
df_red = df_work[[
    "HOME_TEAM_ID",
    "VISITOR_TEAM_ID",
    "SEASON",
    "HOME_TEAM_WINS"
]]
y = df_red["HOME_TEAM_WINS"]
X = df_red.drop("HOME_TEAM_WINS", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)

In [188]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
y_rf_class = rf_classifier.predict(X_test)
rf_class_accuracy = accuracy_score(y_rf_class, y_test)

In [189]:
rf_class_accuracy

0.5895791583166332