In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
# Team statistics table: the first two CSV columns form the index, which later
# cells look up with (date, team) tuples.
data_df = pd.read_csv('18-endpoints-4-cols.csv', index_col=[0, 1])

# Game results: one row per game (Date, Home, Away, DidHomeWin). The CSV carries a
# leftover 'Unnamed: 0' column from a previously saved index, dropped here.
games_raw = pd.read_csv('games_df_2015-2020.csv')
games_df = games_raw.drop(columns='Unnamed: 0')

In [3]:
# Preview the first rows of the game results table.
games_df.head()

Unnamed: 0,Date,Home,Away,DidHomeWin
0,2014-11-01,Charlotte,Memphis,False
1,2014-11-01,New Orleans,Dallas,False
2,2014-11-01,Orlando,Toronto,False
3,2014-11-01,Philadelphia,Miami,False
4,2014-11-01,Washington,Milwaukee,True


In [23]:
# Min-max scale every column of data_df: (x - min) / (max - min), computed for
# the whole frame at once instead of a per-element .apply on each column.
# NOTE(review): the min/max are taken over every row of data_df, so games that
# later land in the test split contribute to the scaling — confirm this is acceptable.
maxs = data_df.max()
mins = data_df.min()

# A constant column has max == min; dividing by 1 there maps it to 0.0 instead of
# producing NaN from 0/0 (NaN features would make the model fit fail downstream).
spans = (maxs - mins).replace(0, 1)

# DataFrame-minus-Series aligns the Series index with the frame's columns, so each
# column is shifted and scaled by its own min and span.
data_df = (data_df - mins) / spans

In [24]:
# Build one training record per game: [home_features, away_features, label].
#   * Each feature vector is the team's data_df row for the game date, with a
#     trailing home/away flag appended (1 = home team, 0 = away team).
#   * label = int(DidHomeWin): 1 if the home team won, 0 if the away team won.
# Games for which data_df has no (date, team) row for either side are skipped;
# they are recorded in skipped_games rather than silently discarded.
training_data = []
skipped_games = []  # (Date, Home, Away) for every game without a stats row
for game in games_df.itertuples(index=False):
    home_team, away_team = str(game.Home), str(game.Away)
    date_str = game.Date
    try:
        # data_df is looked up by a (date, team) index tuple.
        home_stats = data_df.loc[(date_str, home_team)].to_numpy()
        away_stats = data_df.loc[(date_str, away_team)].to_numpy()
    except KeyError:
        skipped_games.append((date_str, home_team, away_team))
        continue

    home_stats = np.append(home_stats, [1])  # home flag
    away_stats = np.append(away_stats, [0])  # away flag
    training_data.append([home_stats, away_stats, int(game.DidHomeWin)])

print(f'{len(training_data)} games kept, {len(skipped_games)} skipped (no stats row for one or both teams)')

In [25]:
# Sanity check: number of games kept, followed by the length of a single team's
# feature vector (its data_df row plus the home/away flag).
n_games = len(training_data)
print(n_games)
len(training_data[0][0])

5270


73

In [26]:
# Flatten every game into a single feature row: the home vector followed by the
# away vector, giving 2 * (per-team vector length) values per game.
num_data_points = len(training_data[0][0]) * 2

# Stack to (n_games, 2, per_team_len), then reshape to (n_games, num_data_points).
# The row count comes from training_data itself instead of a hard-coded 5270, and
# reshape (unlike np.resize) raises if the element count does not match rather
# than silently repeating or truncating data.
x_vals = np.array([row[0:2] for row in training_data])
x_vals = x_vals.reshape(len(training_data), num_data_points)

# Labels: 1 if the home team won, 0 otherwise.
y_vals = np.array([row[2] for row in training_data], dtype='int')

print(f'x_vals[0]: {x_vals[0]} \ny_vals[0]: {y_vals[0]}')
print(f'x_vals.shape: {x_vals.shape} \ny_vals.shape: {y_vals.shape}')

x_vals[0]: [0.30077121 0.34065934 0.35185185 0.5        0.20887728 0.31446541
 0.36734694 0.24553571 0.44204852 0.4494382  0.54660348 0.42161017
 0.33933934 0.378327   0.30068337 0.77189409 0.42857143 0.38461538
 0.44573643 0.61904762 0.51612903 0.45041322 0.54782609 0.63203463
 0.22173275 0.13179643 0.17077244 0.28212291 0.78512397 0.71212121
 0.72151899 0.71612903 0.49013158 0.46138996 0.47250509 0.69863014
 0.35672515 0.35714286 0.31225296 0.70588235 0.35714286 0.41153846
 0.41071429 0.89686099 0.42019544 0.4093178  0.53781513 0.41147132
 0.41085271 0.47118156 0.47692308 0.29054054 0.30414747 0.37325905
 0.2260274  0.55172414 0.4097561  0.37951807 0.56962025 0.
 0.5902439  0.62048193 0.43037975 1.         0.69585253 0.62674095
 0.7739726  0.44827586 0.333      0.333      0.5        0.
 1.         0.35989717 0.37676609 0.11111111 0.6        0.31331593
 0.37735849 0.30612245 0.45758929 0.41509434 0.43071161 0.3206951
 0.63347458 0.1981982  0.28897338 0.01822323 0.48065173 0.46857143
 

In [27]:
# Display the first flattened feature row (home vector followed by away vector).
x_vals[0]

array([0.30077121, 0.34065934, 0.35185185, 0.5       , 0.20887728,
       0.31446541, 0.36734694, 0.24553571, 0.44204852, 0.4494382 ,
       0.54660348, 0.42161017, 0.33933934, 0.378327  , 0.30068337,
       0.77189409, 0.42857143, 0.38461538, 0.44573643, 0.61904762,
       0.51612903, 0.45041322, 0.54782609, 0.63203463, 0.22173275,
       0.13179643, 0.17077244, 0.28212291, 0.78512397, 0.71212121,
       0.72151899, 0.71612903, 0.49013158, 0.46138996, 0.47250509,
       0.69863014, 0.35672515, 0.35714286, 0.31225296, 0.70588235,
       0.35714286, 0.41153846, 0.41071429, 0.89686099, 0.42019544,
       0.4093178 , 0.53781513, 0.41147132, 0.41085271, 0.47118156,
       0.47692308, 0.29054054, 0.30414747, 0.37325905, 0.2260274 ,
       0.55172414, 0.4097561 , 0.37951807, 0.56962025, 0.        ,
       0.5902439 , 0.62048193, 0.43037975, 1.        , 0.69585253,
       0.62674095, 0.7739726 , 0.44827586, 0.333     , 0.333     ,
       0.5       , 0.        , 1.        , 0.35989717, 0.37676

In [35]:
# Hold out 20% of the games for evaluation; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_vals,
    y_vals,
    test_size=0.2,
    random_state=0,
)

In [29]:
# scaler = StandardScaler()
# x_train = scaler.fit_transform(x_train)

In [36]:
# Logistic-regression classifier (liblinear solver, fixed seed) trained on the
# training split. The bare fit() call is the cell's last expression, so the
# fitted estimator is displayed.
model = LogisticRegression(
    solver='liblinear',
    random_state=0,
)
model.fit(x_train, y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [31]:
# x_test = scaler.transform(x_test)

In [37]:
# Predicted labels for the held-out games (1 = home win, 0 = away win).
y_pred = model.predict(x_test)

In [38]:
# Mean accuracy on the training split.
model.score(x_train, y_train)

0.6776565464895635

In [39]:
# Mean accuracy on the held-out split; a value close to the training-split
# accuracy suggests the model is not overfitting.
model.score(x_test, y_test)

0.6802656546489564

In [40]:
# Confusion matrix on the held-out split: rows are true labels, columns are
# predicted labels (0 = away win, 1 = home win).
confusion_matrix(y_test, y_pred)

array([[214, 216],
       [121, 503]], dtype=int64)

In [21]:
# Per-class precision / recall / F1 on the held-out split.
# NOTE(review): this cell's execution count (21) is lower than that of the cells
# before it, and its saved output (accuracy 0.67) disagrees with the 0.68 reported
# by model.score(x_test, y_test) — the output looks stale; rerun after fitting.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.49      0.55       430
           1       0.69      0.80      0.74       624

    accuracy                           0.67      1054
   macro avg       0.66      0.64      0.65      1054
weighted avg       0.67      0.67      0.66      1054

