In [12]:
# import libraries

import pandas as pd
import os
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score

In [3]:
# import files

df_info = pd.read_csv("working_files/games_info_prep.csv")
df_scores = pd.read_csv("working_files/scores_prep.csv")

In [4]:
# merge files

df_scores_info = df_scores.merge(df_info, on = 'idgame')
display(df_scores_info)

Unnamed: 0,Unnamed: 0_x,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,...,capacity,people,line,over_under,attendance_info,dayofweek,dayofmonth,hour,month,minute
0,255,2018,1,Falcons,Eagles,12,18,401030710,6,1,...,69796,100,,,1,4,7,0,9,55
1,256,2018,1,Bengals,Colts,34,23,401030717,-11,0,...,63000,93,,,1,6,9,17,9,0
2,257,2018,1,Titans,Dolphins,20,27,401030716,7,1,...,65326,100,,,1,6,9,17,9,0
3,258,2018,1,49ers,Vikings,16,24,401030715,8,1,...,66468,100,,,1,6,9,17,9,0
4,259,2018,1,Texans,Patriots,20,27,401030714,7,1,...,65878,100,,,1,6,9,17,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1221,5349,2021,18,Jets,Bills,10,27,401326587,17,1,...,71621,90,Line: BUF -16.0,Over/Under: 43.0,1,6,9,21,1,25
1222,5351,2021,18,Patriots,Dolphins,24,33,401326592,9,1,...,65326,100,Line: NE -6.0,Over/Under: 41.0,1,6,9,21,1,25
1223,5352,2021,18,Seahawks,Cardinals,38,30,401326597,-8,0,...,65000,98,Line: ARI -5.5,Over/Under: 48.5,1,6,9,21,1,25
1224,5353,2021,18,Panthers,Buccaneers,17,41,401326596,24,1,...,65618,100,Line: TB -11.0,Over/Under: 43.0,1,6,9,21,1,25


In [6]:
df_scores_info.columns

Index(['Unnamed: 0_x', 'season', 'week', 'awayteam', 'hometeam', 'awayscore',
       'homescore', 'idgame', 'score_abs', 'winner_home', 'winner_away',
       'Unnamed: 0_y', 'date', 'stade', 'location', 'attendance', 'capacity',
       'people', 'line', 'over_under', 'attendance_info', 'dayofweek',
       'dayofmonth', 'hour', 'month', 'minute'],
      dtype='object')

In [15]:
df_scores_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1226 entries, 0 to 1225
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0_x     1226 non-null   int64 
 1   season           1226 non-null   int64 
 2   week             1226 non-null   int64 
 3   awayteam         1226 non-null   object
 4   hometeam         1226 non-null   object
 5   awayscore        1226 non-null   int64 
 6   homescore        1226 non-null   int64 
 7   idgame           1226 non-null   int64 
 8   score_abs        1226 non-null   int64 
 9   winner_home      1226 non-null   int64 
 10  winner_away      1226 non-null   int64 
 11  Unnamed: 0_y     1226 non-null   int64 
 12  date             1226 non-null   object
 13  stade            1226 non-null   object
 14  location         1226 non-null   object
 15  attendance       1226 non-null   int64 
 16  capacity         1226 non-null   int64 
 17  people           1226 non-null   

In [9]:
features_columns = ["season","week", "awayteam", "hometeam",'stade','attendance','capacity','attendance_info','dayofweek',
                    'dayofmonth','hour','month', 'minute']
X = df_scores_info.loc[:,features_columns]
y = df_scores_info.loc[:,"winner_home"]
print(X.head())
print(y.head())

   season  week awayteam  hometeam                    stade  attendance  \
0    2018     1  Falcons    Eagles  Lincoln Financial Field       69696   
1    2018     1  Bengals     Colts        Lucas Oil Stadium       58699   
2    2018     1   Titans  Dolphins        Hard Rock Stadium       65184   
3    2018     1    49ers   Vikings        U.S. Bank Stadium       66673   
4    2018     1   Texans  Patriots         Gillette Stadium       65878   

   capacity  attendance_info  dayofweek  dayofmonth  hour  month  minute  
0     69796                1          4           7     0      9      55  
1     63000                1          6           9    17      9       0  
2     65326                1          6           9    17      9       0  
3     66468                1          6           9    17      9       0  
4     65878                1          6           9    17      9       0  
0    1
1    0
2    1
3    1
4    1
Name: winner_home, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.2,
                                                    stratify = y , ## Statify splitting when you're training a classification model !
                                                    random_state = 42)

In [13]:
one = OneHotEncoder()
sc = StandardScaler()
classifier = LogisticRegression()

In [16]:
# Create pipeline for numeric features
numeric_features = ["season","week", 'attendance','capacity','attendance_info','dayofweek',
                    'dayofmonth','hour','month', 'minute'] # numeric columns in X_train/X_test
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_features = ["awayteam", "hometeam",'stade'] # categorical columns in X_train/X_test
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop="first")) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTranformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings on train set
X_train = preprocessor.fit_transform(X_train)

# Preprocessings on test set
X_test = preprocessor.transform(X_test) # Don't fit again !!

In [17]:
classifier.fit(X_train, y_train)

In [18]:
scores = cross_val_score(classifier, X_train, y_train)
print("Cross-validated score : {}\nStd : {}".format(scores.mean(), scores.std()))

Cross-validated score : 0.6020408163265306
Std : 0.03636434628817439


In [19]:
classifier.score(X_test, y_test)

0.5975609756097561