Total Game Score Model - GLM

In [None]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_selection import RFECV

# Widen pandas' display limits so long/wide frames are shown in full.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 999)


Load Data

In [None]:
# Load the modelling dataset (one CSV; no parsing options are overridden).
modelling_data_path = "/total-points-score-model/data/modelling-data/modelling_data_total_team_score.csv"
data = pd.read_csv(modelling_data_path)

In [None]:
data.head(1)

Clean Data

In [None]:
data.isnull().sum()

In [None]:
# Rows dated on/after the cutoff form the out-of-time (OOT) sample; earlier
# rows are kept for modelling.
# NOTE(review): 'Date' is compared against a string and read_csv was called
# without parse_dates — presumably ISO-formatted date strings; confirm.
oot_cutoff = "2019-01-01"
modelling_data = data[data['Date'].lt(oot_cutoff)]
oot_data = data[data['Date'].ge(oot_cutoff)]

In [None]:
modelling_data.shape, oot_data.shape

In [None]:
# Use the 'training_set' flag column to separate the rows used for fitting
# from the held-out test rows.
in_training_set = modelling_data['training_set']
training_data = modelling_data[in_training_set.eq(True)]
test_data = modelling_data[in_training_set.eq(False)]

In [None]:
training_data.shape, test_data.shape

Train Model

In [None]:
response = "Total_Game_Score"

In [None]:
list(training_data)

In [None]:
# Candidate model inputs: four match-level descriptors, followed by the same
# nine score statistics for every (side, window) combination.
# NOTE(review): the 'avg2' / 'avg10' suffixes presumably denote averages over
# the previous 2 / 10 games — confirm against the feature-engineering step.
_match_descriptors = ['Venue', 'Year', 'Home_Team', 'Away_Team']
_score_statistics = [
    'Total_Game_Score',
    'For_Goals', 'For_Behinds', 'For_Scores', 'For_Total_Score',
    'Against_Goals', 'Against_Behinds', 'Against_Scores', 'Against_Total_Score',
]

# Loop order (side -> window -> statistic) reproduces the original column order.
modelling_features = _match_descriptors + [
    "{}_{}_{}".format(side, statistic, window)
    for side in ('Home', 'Away')
    for window in ('avg2', 'avg10')
    for statistic in _score_statistics
]

In [None]:
# Separate each sample into its feature frame and its response series.
X = training_data[modelling_features]
y = training_data[response]

X_test = test_data[modelling_features]
y_test = test_data[response]

X_oot = oot_data[modelling_features]
y_oot = oot_data[response]

In [None]:
# One-hot encode the training features; its columns define the design matrix
# every other sample must conform to.
X_dummies = pd.get_dummies(X)

# Encode the test / OOT samples, then align them to the training columns:
#   * categories absent from a sample get an all-zero column,
#   * categories never seen in training are dropped (the model has no
#     coefficient for them),
#   * column order matches X_dummies exactly.
# The previous loops only added missing columns, so the matrices could keep
# extra columns and a different column order than the training matrix.
X_test_dummies = pd.get_dummies(X_test).reindex(columns=X_dummies.columns, fill_value=0)
X_oot_dummies = pd.get_dummies(X_oot).reindex(columns=X_dummies.columns, fill_value=0)

In [None]:
X_dummies.shape, X_test_dummies.shape, X_oot_dummies.shape

Feature Selection

In [None]:
linear_regression = linear_model.LinearRegression()

In [None]:
# Recursive feature elimination scored by 5-fold cross-validation: one feature
# is removed per step and at least 20 features are always retained.
selector = RFECV(
    estimator=linear_regression,
    step=1,
    cv=5,
    min_features_to_select=20,
)

In [None]:
selector = selector.fit(X_dummies, y)

In [None]:
selector.n_features_

In [None]:
selected_features = list(selector.get_feature_names_out())
selected_features

In [None]:
# Refit the linear regression on the training rows using only the columns
# RFECV selected.
# NOTE(review): scikit-learn's `fit` returns the estimator itself, so `model`
# and `linear_regression` should be the same fitted object — confirm before
# reusing `linear_regression` for anything else.
model = linear_regression.fit(X_dummies[selected_features], y)

In [None]:
# Tabulate each selected feature next to its fitted coefficient.
coefficients = pd.DataFrame(
    {
        "Feature": X_dummies[selected_features].columns,
        "Coefficients": np.transpose(model.coef_),
    }
)

In [None]:
coefficients.sort_values(by = "Coefficients", ascending = False)

Evaluate Model

In [None]:
X_train_selected = X_dummies[selected_features]
X_test_selected = X_test_dummies[selected_features]
X_oot_selected = X_oot_dummies[selected_features]

In [None]:
test_preds = model.predict(X_test_selected)

In [None]:
def MAE(data, model, actuals):
    """Return the mean absolute error of ``model``'s predictions on ``data``.

    Parameters
    ----------
    data : feature matrix, passed unchanged to ``model.predict``.
    model : any object exposing a ``predict(data)`` method.
    actuals : observed values, element-wise compatible with the predictions.

    Returns
    -------
    The mean of ``|prediction - actual|`` over all rows.
    """
    residuals = model.predict(data) - actuals
    return np.mean(np.abs(residuals))

In [None]:
def RMSE(data, model, actuals):
    """Return the root mean squared error of ``model``'s predictions on ``data``.

    Parameters
    ----------
    data : feature matrix, passed unchanged to ``model.predict``.
    model : any object exposing a ``predict(data)`` method.
    actuals : observed values, element-wise compatible with the predictions.

    Returns
    -------
    The square root of the mean of ``(prediction - actual) ** 2``.
    """
    residuals = model.predict(data) - actuals
    return np.mean(residuals ** 2) ** 0.5

In [None]:
def get_error_metrics(data, model, actuals):
    """Compute, print and return the MAE and RMSE of ``model`` on ``data``.

    Parameters
    ----------
    data : feature matrix, forwarded to ``MAE`` and ``RMSE``.
    model : fitted model exposing ``predict``.
    actuals : observed values to compare the predictions with.

    Returns
    -------
    A ``(mae, rmse)`` tuple; both values are also printed, one per line.
    """
    metrics = (MAE(data, model, actuals), RMSE(data, model, actuals))

    for template, value in zip(("MAE: {}", "RMSE: {}"), metrics):
        print(template.format(value))

    return metrics

In [None]:
X_dummies_selected = X_dummies[selected_features]
get_error_metrics(X_dummies_selected, model, y)

In [None]:
get_error_metrics(X_test_selected, model, y_test)

In [None]:
get_error_metrics(X_oot_selected, model, y_oot)

Save model predictions

In [None]:
# Score every row of the full dataset with the fitted model.
#
# `selected_features` holds *post-encoding* column names (RFECV was fitted on
# the one-hot encoded training matrix), so they cannot be used to index the
# raw `data` frame directly — that raises KeyError as soon as any dummy column
# was selected. Encode the raw modelling features first, then align the result
# to the selected columns: categories missing from `data` become all-zero
# columns, unselected columns are dropped, and the order matches what the
# model was fitted on.
data_features = data[modelling_features]
data_dummies = pd.get_dummies(data_features).reindex(columns=selected_features, fill_value=0)
data_response = data[response]

data['linear_preds'] = model.predict(data_dummies)

In [None]:
data['linear_preds'].mean(), data[response].mean()

In [None]:
# Store a single constant on every row: the standard deviation of the training
# response `y`.
# NOTE(review): this is the spread of the raw response, not of the model's
# residuals — confirm that is what consumers of 'std_dev' expect.
data['std_dev'] = np.std(y)

In [None]:
data[['Match_ID', 'Date', 'Total_Game_Score', 'linear_preds', 'std_dev']].head()

In [None]:
# Write the match identifiers, actuals, predictions and std_dev to CSV.
# `index` is not passed, so pandas' default applies and the row index is
# written as the first (unnamed) column.
prediction_columns = ['Match_ID', 'Date', 'Total_Game_Score', 'linear_preds', 'std_dev']
predictions_path = "/total-points-score-model/data/model-predictions/linear_model_preds.csv"
data[prediction_columns].to_csv(predictions_path)