In [35]:
# import modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score, precision_score, r2_score, mean_absolute_error


In [24]:
# Read the pre-computed difference-vector tables and keep only complete rows.
trainingCsvPath = "../data/processed/TrainingDataDifferenceVectors.csv"
predictionCsvPath = "../data/processed/PredictionDataDifferenceVectors.csv"

# dropna(axis = 0) discards every row that contains at least one missing value;
# the surviving rows keep their original index labels.
dfDiffVecTraining = pd.read_csv(trainingCsvPath).dropna(axis = 0)
dfDiffVecPrediction = pd.read_csv(predictionCsvPath).dropna(axis = 0)

# Summarise the training table: column names, dtypes and non-null counts.
dfDiffVecTraining.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45838 entries, 0 to 46341
Data columns (total 14 columns):
Season        45838 non-null int64
Team1         45838 non-null int64
Team2         45838 non-null int64
Pred          45838 non-null float64
Location      45838 non-null int64
WinRatio      45838 non-null float64
PPG           45838 non-null float64
AllowedPPG    45838 non-null float64
FGPer         45838 non-null float64
Turnovers     45838 non-null float64
Blocks        45838 non-null float64
Rebounds      45838 non-null float64
Assists       45838 non-null float64
PerFouls      45838 non-null float64
dtypes: float64(10), int64(4)
memory usage: 5.2 MB


## Model Training

In [41]:
# feature selection: positional slice from column 4 up to, but not including,
# the last column. Per the info() listing of the training frame this is
# Location .. Assists.
# NOTE(review): the -1 stop leaves out the final column (PerFouls) — confirm the
# exclusion is intended and not an off-by-one.
x = dfDiffVecTraining.iloc[:, 4:-1]
# target
y = dfDiffVecTraining["Pred"]

# 80/20 split; the fixed seed keeps the partition identical between runs.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 6)


def _scorePredictions(yTrue, yPredLabels):
    """Score hard class labels against the ground truth.

    Parameters:
        yTrue: true labels of the evaluated samples.
        yPredLabels: predicted class labels, same length as yTrue.

    Returns:
        Tuple (accuracy, precision, R^2, mean absolute error).
    """
    return (
        accuracy_score(yTrue, yPredLabels),
        precision_score(yTrue, yPredLabels),
        r2_score(yTrue, yPredLabels),
        mean_absolute_error(yTrue, yPredLabels),
    )


# Logistic regression: predict() already returns class labels, so rounding only
# converts them from float to int.
logReg = LogisticRegression(solver = "liblinear")
logReg.fit(xTrain, yTrain)
logRegYPred = np.rint(logReg.predict(xTest)).astype(int)
logRegAcc, logRegPrec, logRegR2, logRegMAE = _scorePredictions(yTest, logRegYPred)

# Gradient boosting is fitted as a regressor on the labels, so its raw output is
# continuous and not bounded. Clip before rounding so an out-of-range value can
# never become a third label (-1 or 2), which would make the binary
# precision_score raise.
# NOTE(review): the [0, 1] clip assumes Pred is encoded as 0/1 — confirm.
gradBoostReg = GradientBoostingRegressor(random_state = 6)  # seeded for reproducibility
gradBoostReg.fit(xTrain, yTrain)
gradBoostRegYPred = np.rint(np.clip(gradBoostReg.predict(xTest), 0, 1)).astype(int)
gradBoostRegAcc, gradBoostRegPrec, gradBoostRegR2, gradBoostRegMAE = _scorePredictions(
    yTest, gradBoostRegYPred
)

print("logRegMAE: {}, gradBoostRegMAE: {}".format(logRegMAE, gradBoostRegMAE))

logRegMAE: 0.21215095986038393, gradBoostRegMAE: 0.2118237347294939


## Predicting

In [None]:
# Run the fitted models on the prediction table rather than on the training
# frame they were fitted on.
# Features are picked by name from xTrain so the column set and order match what
# the models received in fit().
# NOTE(review): assumes PredictionDataDifferenceVectors.csv uses the same feature
# column names as the training file — confirm.
x = dfDiffVecPrediction[xTrain.columns]

# class labels predicted by the logistic regression
logRegYPred = logReg.predict(x)

# raw (continuous, unrounded) output of the gradient-boosting regressor
gradBoostYPred = gradBoostReg.predict(x)

## Evaluate

In [22]:
# TODO: determine which parameters affected the results the most
# TODO: describe how the models could be improved