Milestone 4 - Model Training

Task 1: Train a simple model to obtain a baseline score

The usual workflow when you are looking for a good model is:

Train a simple model to obtain a baseline score
Perform feature selection
Train multiple models with the selected features and tune their hyperparameters
Pick the best model
Test the model on the testing set
See if there is room for improvement on the dataset (e.g. by removing features, or picking a subset that represents better the current behaviour)
In this task, we will focus on the first step.

Train a simple model, so you have a score that you know can be improved. At this stage, you should just use logistic regression or linear regression (if you are predicting a continuous value) Once you train it, save the model as a joblib file named baseline.joblib. Take a look at this link  to know how to save it

In [39]:
import re
import pandas as pd
import os
import numpy as np
from csv import reader
import plotly.express as px
import missingno as msno
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

pd.options.mode.chained_assignment = None

In [40]:
# READ IN cleaned_dataset.csv
full_pd = pd.read_csv("cleaned_dataset.csv")
full_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Home_Score,Away_Score,Result
0,championship,2021,4,Coventry City,AFC Bournemouth,46.0,62.0,3,2,4,2,0,2,1,3,-1
1,championship,2021,4,Norwich City,Derby County,62.0,60.0,2,2,0,6,0,-7,0,1,-1
2,championship,2021,4,Blackburn Rovers,Cardiff City,58.0,60.0,5,0,1,4,8,-1,0,0,0
3,championship,2021,4,Luton Town,Wycombe Wanderers,51.0,41.0,2,1,0,3,1,-8,2,0,1
4,championship,2021,4,Middlesbrough,Barnsley,61.0,46.0,1,1,0,1,-1,-3,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111647,serie_b,1997,38,Pescara,Padova,59.0,54.0,32,15,22,15,5,-3,1,2,-1
111648,serie_b,1997,38,Genoa,Palermo FC,61.0,58.0,33,12,24,24,2,-1,4,1,1
111649,serie_b,1997,38,Torino,Ravenna FC,63.0,54.0,27,23,22,18,-2,-2,0,4,-1
111650,serie_b,1997,38,Salernitana,Reggina,52.0,52.0,20,7,23,18,-2,3,1,3,-1


In [41]:
# Create functions to filter different league
def getLeagueData(data, league, season=None):
    if season is None:
        league_pd =  data[(data["League"]==league)]
    else:
        league_pd =  data[(data["League"]==league) & (data["Season"]==season)]
    return league_pd

In [42]:
model_pd = getLeagueData(full_pd, "serie_b", 2011)
model_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Home_Score,Away_Score,Result
103305,serie_b,2011,4,Padova,Reggina,53.0,58.0,1,1,2,0,-1,1,4,0,1
103306,serie_b,2011,4,Vicenza,Livorno,57.0,62.0,4,1,1,5,2,-1,0,0,0
103307,serie_b,2011,4,Siena,Cittadella,64.0,52.0,2,1,2,3,1,-3,3,1,1
103308,serie_b,2011,4,Sassuolo,Torino,54.0,59.0,1,1,2,3,3,-2,1,2,-1
103309,serie_b,2011,4,Piacenza,Ascoli,59.0,60.0,1,1,3,1,-3,2,2,4,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103729,serie_b,2011,42,Cittadella,Pescara,52.0,45.0,26,24,24,19,0,-2,3,2,1
103730,serie_b,2011,42,Varese,Piacenza,46.0,58.0,32,10,32,32,-2,-2,1,0,1
103731,serie_b,2011,42,Sassuolo,Reggina,54.0,58.0,24,20,21,11,-1,1,3,2,1
103732,serie_b,2011,42,AlbinoLeffe,Siena,55.0,64.0,31,30,45,14,-1,4,1,0,1


The usual workflow when you are looking for a good model is:

Train a simple model to obtain a baseline score
Perform feature selection
Train multiple models with the selected features and tune their hyperparameters
Pick the best model
Test the model on the testing set
See if there is room for improvement on the dataset (e.g. by removing features, or picking a subset that represents better the current behaviour)

# M4-T1 create simple model 
"""
In this task, we will focus on the first step.

Train a simple model, so you have a score that you know can be improved. 
At this stage, you should just use logistic regression or linear regression (if you are predicting a continuous value) 
Once you train it, save the model as a joblib file named baseline.joblib. 
"""

In [43]:
# delete no value column
# League	Season	Round	Home_Team	Away_Team Home_Score	Away_Score
model_pd.drop('League', inplace=True, axis=1)
model_pd.drop('Season', inplace=True, axis=1)
model_pd.drop('Round', inplace=True, axis=1)
model_pd.drop('Home_Team', inplace=True, axis=1)
model_pd.drop('Away_Team', inplace=True, axis=1)
model_pd.drop('Home_Score', inplace=True, axis=1)
model_pd.drop('Away_Score', inplace=True, axis=1)
model_pd

Unnamed: 0,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Result
103305,53.0,58.0,1,1,2,0,-1,1,1
103306,57.0,62.0,4,1,1,5,2,-1,0
103307,64.0,52.0,2,1,2,3,1,-3,1
103308,54.0,59.0,1,1,2,3,3,-2,-1
103309,59.0,60.0,1,1,3,1,-3,2,-1
...,...,...,...,...,...,...,...,...,...
103729,52.0,45.0,26,24,24,19,0,-2,1
103730,46.0,58.0,32,10,32,32,-2,-2,1
103731,54.0,58.0,24,20,21,11,-1,1,1
103732,55.0,64.0,31,30,45,14,-1,4,1


In [44]:
array = model_pd.values
array

array([[53., 58.,  1., ..., -1.,  1.,  1.],
       [57., 62.,  4., ...,  2., -1.,  0.],
       [64., 52.,  2., ...,  1., -3.,  1.],
       ...,
       [54., 58., 24., ..., -1.,  1.,  1.],
       [55., 64., 31., ..., -1.,  4.,  1.],
       [59., 54., 25., ..., -1., -2.,  1.]])

In [45]:
X = array[:,0:8].astype('int')
y = array[:,8].astype('int')

In [46]:
X

array([[53, 58,  1, ...,  0, -1,  1],
       [57, 62,  4, ...,  5,  2, -1],
       [64, 52,  2, ...,  3,  1, -3],
       ...,
       [54, 58, 24, ..., 11, -1,  1],
       [55, 64, 31, ..., 14, -1,  4],
       [59, 54, 25, ..., 21, -1, -2]])

In [47]:
y

array([ 1,  0,  1, -1, -1, -1,  1,  1,  0,  1,  1,  0,  0,  1,  1,  1,  0,
        1,  0,  0,  0,  1, -1,  0,  1,  1,  1,  1,  1,  1,  1,  1,  0, -1,
       -1, -1,  1,  0,  1, -1,  1,  1,  1,  1,  0,  1, -1,  1,  0,  1,  0,
        1, -1,  1,  1,  1,  0,  0,  1,  0,  1,  0,  1,  1,  1,  0,  1, -1,
        0,  1,  1,  0, -1, -1,  1,  1,  1,  1,  1, -1,  0,  1,  1,  0,  0,
       -1,  1,  0,  1,  1,  0,  1,  1,  1, -1,  0, -1,  1,  1,  1, -1,  1,
        0, -1,  0,  0,  0,  1,  1,  0,  1, -1, -1,  0, -1,  0,  0,  0, -1,
        1,  0, -1,  1,  1,  1,  1, -1,  0,  0,  1, -1,  1,  1,  0,  1,  1,
        0,  1, -1,  0,  1,  1,  0, -1, -1, -1,  0, -1,  1,  0,  0, -1,  0,
       -1,  1,  1,  1,  1,  0,  1, -1,  0,  1,  0,  1,  1,  1,  0, -1,  0,
       -1,  1, -1,  1,  1, -1,  0,  1, -1,  0,  0,  1,  1,  1,  1,  0,  0,
       -1,  1,  0,  1,  1,  1,  0,  1,  1,  0,  1,  1,  0,  1, -1,  1, -1,
        0,  0,  1,  0,  0, -1,  0,  0,  1,  0,  1,  0,  0,  1,  0,  1,  0,
        0,  0, -1,  1,  0

In [48]:
# Scaler
from sklearn.preprocessing import MinMaxScaler
from numpy import set_printoptions

scaler = MinMaxScaler(feature_range=(0, 8))
rescaledX = scaler.fit_transform(X)

# summarize transformed data
set_printoptions(precision=3)

In [49]:
# Or Standardize
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X)
standardScaledX = scaler.transform(X)

In [50]:
test_size = 0.3
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(rescaledX, y, test_size=test_size,
random_state=seed)

model = LogisticRegression() 
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
result = model.score(X_train, Y_train) 
print("Accuracy for train: %.3f%%" % (result*100.0))

Accuracy for train: 47.333%


In [52]:
result = model.score(X_test, Y_test) 
print("Accuracy for test: %.3f%%" % (result*100.0))

Accuracy for test: 42.636%


In [53]:
# Save the model
from joblib import dump, load
dump(model, 'baseline.joblib')

['baseline.joblib']

In [31]:
# Load the model
loaded_model = load('baseline.joblib') 

Some findings:

KNN:
- Very simple 
- Save model need to save with all sample data
- Train is fast, predict is slow