In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
pio.renderers.default = "svg" # to be replaced by "iframe" if working on JULIE

In [17]:
# Import dataset
print("Loading dataset...")
dataset = pd.read_csv("../working_files/scores_prep.csv", index_col=0)
print("...Done.")
print()

Loading dataset...
...Done.



In [10]:
# Basic stats
print("Number of rows : {}".format(dataset.shape[0]))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*dataset.isnull().sum()/dataset.shape[0])

Number of rows : 1289

Display of dataset: 


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,score_abs,winner_home,winner_away
0,2018,2,Ravens,Bengals,23,34,11,1,0
1,2018,2,Panthers,Falcons,24,31,7,1,0
2,2018,2,Chargers,Bills,31,20,-11,0,1
3,2018,2,Texans,Titans,17,20,3,1,0
4,2017,2,Texans,Bengals,13,9,-4,0,1



Basics statistics: 


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,score_abs,winner_home,winner_away
count,1289.0,1289.0,1289,1289,1289.0,1289.0,1289.0,1289.0,1289.0
unique,,,32,32,,,,,
top,,,Cardinals,Bengals,,,,,
freq,,,41,41,,,,,
mean,2019.027153,9.17533,,,22.494182,23.759503,1.265322,0.539178,0.460822
std,1.422984,5.062066,,,10.139945,10.328135,14.768326,0.498656,0.498656
min,2017.0,1.0,,,0.0,0.0,-49.0,0.0,0.0
25%,2018.0,5.0,,,16.0,17.0,-7.0,0.0,0.0
50%,2019.0,9.0,,,23.0,24.0,2.0,1.0,0.0
75%,2020.0,14.0,,,30.0,31.0,10.0,1.0,1.0



Percentage of missing values: 


season         0.0
week           0.0
awayteam       0.0
hometeam       0.0
awayscore      0.0
homescore      0.0
score_abs      0.0
winner_home    0.0
winner_away    0.0
dtype: float64

In [18]:
# Separate target variable Y from features X
print("Separating labels from features...")
features_list = ["season", "week", "awayteam", "hometeam", "awayscore", "homescore"]
target_variable = "score_abs"

X = dataset.loc[:,features_list]
Y = dataset.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    11
1     7
2   -11
3     3
4    -4
Name: score_abs, dtype: int64

X :
   season  week  awayteam hometeam  awayscore  homescore
0    2018     2    Ravens  Bengals         23         34
1    2018     2  Panthers  Falcons         24         31
2    2018     2  Chargers    Bills         31         20
3    2018     2    Texans   Titans         17         20
4    2017     2    Texans  Bengals         13          9


In [19]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['season', 'week', 'awayscore', 'homescore']
Found categorical features  ['awayteam', 'hometeam']


In [23]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
# WARNING : don't forget stratify=Y for classification problems
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [24]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' mean
    ('scaler', StandardScaler())
])

In [25]:
# Create pipeline for categorical features
categorical_transformer = OneHotEncoder(drop='first') # no missing values in categorical data, so we only need the OHE

In [26]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [27]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
print(Y_train.head())
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
print("...Done")
print(Y_train[0:5])

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
print(Y_test[0:5])
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[0:5])

Performing preprocessings on train set...
      season  week  awayteam   hometeam  awayscore  homescore
261     2020     5  Chargers     Saints         27         30
771     2018    11   Vikings      Bears         20         25
1016    2019    14  Steelers  Cardinals         23         17
2       2018     2  Chargers      Bills         31         20
422     2020     6    Texans     Titans         36         42
...Done.
  (0, 0)	0.7142161445316457
  (0, 1)	-0.8147680502489156
  (0, 2)	0.441313025057085
  (0, 3)	0.5700310379274741
  (0, 11)	1.0
  (0, 59)	1.0
  (1, 0)	-0.7079096002090762
  (1, 1)	0.36551157714592664
  (1, 2)	-0.2619808770656903
  (1, 3)	0.09102228423772032
  (1, 33)	1.0
  (1, 35)	1.0
  (2, 0)	0.0031532721612847637
  (2, 1)	0.9556513908433477
  (2, 2)	0.03943079527264197
  (2, 3)	-0.6753917216658858
  (2, 30)	1.0
  (2, 41)	1.0
  (3, 0)	-0.7079096002090762
  (3, 1)	-1.4049078639463366
  (3, 2)	0.843195254841528
  (3, 3)	-0.38798646945203347
  (3, 11)	1.0
  (3, 37)	1.0
  (4,

ValueError: y contains previously unseen labels: [-49, -45, -43, -40, -38, -33, 42]

In [None]:
# Test pipeline
print("Preprocessing X_test...")
print(X_test.head())
print()
X_test = preprocessor.transform(X_test)
print("...Done!")
print(X_test[0:5,:]) # X_test is now a numpy array
print() 

# Label encoding
print("Encoding labels...")
print(Y_test.head())
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[0:5])

In [None]:
# Train model
print("Train model...")
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
print("...Done.")

In [None]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = classifier.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on training set...")
Y_train_proba = classifier.predict_proba(X_train)
print("...Done.")
print(Y_train_proba)
print()

In [None]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = classifier.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on test set...")
Y_test_proba = classifier.predict_proba(X_test)
print("...Done.")
print(Y_test_proba)
print()

In [None]:
# Print scores
print("accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
print("accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))
print()

print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()