Milestone 4 - Model Training

Task 1: Train a simple model to obtain a baseline score

The usual workflow when you are looking for a good model is:

Train a simple model to obtain a baseline score
Perform feature selection
Train multiple models with the selected features and tune their hyperparameters
Pick the best model
Test the model on the testing set
See if there is room for improvement on the dataset (e.g. by removing features, or picking a subset that better represents the current behaviour)
In this task, we will focus on the first step.

Train a simple model, so you have a score that you know can be improved. At this stage, you should just use logistic regression (or linear regression, if you are predicting a continuous value). Once you have trained it, save the model as a joblib file named baseline.joblib. Take a look at this link to learn how to save it.

In [17]:
import os
import pickle
import re
from csv import reader

import missingno as msno
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [18]:
# Re-read the cleaned match data from disk so the cells below start from a fresh frame
cleaned_csv_path = 'cleaned_dataset.csv'
result_with_goal_sofar_pd = pd.read_csv(cleaned_csv_path)
result_with_goal_sofar_pd

Unnamed: 0,Home_Team,Away_Team,Result,Home_Score,Away_Score,HOME_TOTAL_GOAL_SO_FAR,AWAY_TOTAL_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Link,ELO_HOME,ELO_AWAY,Season,Round,League
0,Watford,Middlesbrough,1-0,1.0,0.0,0,0,,,https://www.besoccer.com/match/watford-fc/midd...,65.0,60.0,2021,1,championship
1,Birmingham City,Brentford,1-0,1.0,0.0,0,0,,,https://www.besoccer.com/match/birmingham-city...,52.0,59.0,2021,1,championship
2,Wycombe Wanderers,Rotherham United,0-1,0.0,1.0,0,0,,,https://www.besoccer.com/match/wycombe-wandere...,41.0,48.0,2021,1,championship
3,AFC Bournemouth,Blackburn Rovers,3-2,3.0,2.0,0,0,,,https://www.besoccer.com/match/afc-bournemouth...,63.0,57.0,2021,1,championship
4,Barnsley,Luton Town,0-1,0.0,1.0,0,0,,,https://www.besoccer.com/match/barnsley-fc/lut...,47.0,50.0,2021,1,championship
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146636,Pescara,Padova,1-2,1.0,2.0,49,39,5.0,-3.0,https://www.besoccer.com/match/pescara-calcio/...,59.0,54.0,1997,38,serie_b
146637,Genoa,Palermo FC,4-1,4.0,1.0,54,39,2.0,-1.0,https://www.besoccer.com/match/genoa/palermo/1...,61.0,58.0,1997,38,serie_b
146638,Torino,Ravenna FC,0-4,0.0,4.0,45,39,-2.0,-2.0,https://www.besoccer.com/match/torino-fc/raven...,63.0,54.0,1997,38,serie_b
146639,Salernitana,Reggina,1-3,1.0,3.0,30,37,-2.0,3.0,https://www.besoccer.com/match/salernitana-cal...,52.0,52.0,1997,38,serie_b


In [19]:
def get_ELO_diff(record):
    """Return the home-minus-away ELO gap.

    `record` is anything indexable by the keys 'ELO_HOME' and 'ELO_AWAY'
    (for example one row of the match frame).
    """
    return record['ELO_HOME'] - record['ELO_AWAY']

# get_ELO_diff only looks up two keys and subtracts them, so handing it the
# whole frame performs one vectorised column subtraction instead of a Python
# call per row through apply(axis=1). Values and index are the same.
elo_diff_pd = get_ELO_diff(result_with_goal_sofar_pd)

# The two source columns are replaced by their difference.
result_with_goal_sofar_pd.drop(columns=['ELO_HOME', 'ELO_AWAY'], inplace=True)

# Insert the new feature as the first column, stored as pandas' nullable Int64.
result_with_goal_sofar_pd.insert(loc=0, column="ELO_DIFF", value=elo_diff_pd.astype('Int64'))

result_with_goal_sofar_pd

Unnamed: 0,ELO_DIFF,Home_Team,Away_Team,Result,Home_Score,Away_Score,HOME_TOTAL_GOAL_SO_FAR,AWAY_TOTAL_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Link,Season,Round,League
0,5,Watford,Middlesbrough,1-0,1.0,0.0,0,0,,,https://www.besoccer.com/match/watford-fc/midd...,2021,1,championship
1,-7,Birmingham City,Brentford,1-0,1.0,0.0,0,0,,,https://www.besoccer.com/match/birmingham-city...,2021,1,championship
2,-7,Wycombe Wanderers,Rotherham United,0-1,0.0,1.0,0,0,,,https://www.besoccer.com/match/wycombe-wandere...,2021,1,championship
3,6,AFC Bournemouth,Blackburn Rovers,3-2,3.0,2.0,0,0,,,https://www.besoccer.com/match/afc-bournemouth...,2021,1,championship
4,-3,Barnsley,Luton Town,0-1,0.0,1.0,0,0,,,https://www.besoccer.com/match/barnsley-fc/lut...,2021,1,championship
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146636,5,Pescara,Padova,1-2,1.0,2.0,49,39,5.0,-3.0,https://www.besoccer.com/match/pescara-calcio/...,1997,38,serie_b
146637,3,Genoa,Palermo FC,4-1,4.0,1.0,54,39,2.0,-1.0,https://www.besoccer.com/match/genoa/palermo/1...,1997,38,serie_b
146638,9,Torino,Ravenna FC,0-4,0.0,4.0,45,39,-2.0,-2.0,https://www.besoccer.com/match/torino-fc/raven...,1997,38,serie_b
146639,0,Salernitana,Reggina,1-3,1.0,3.0,30,37,-2.0,3.0,https://www.besoccer.com/match/salernitana-cal...,1997,38,serie_b


In [20]:
def get_goal_so_far_diff(record):
    """Return 'HOME_TOTAL_GOAL_SO_FAR' minus 'AWAY_TOTAL_GOAL_SO_FAR' for `record`."""
    home_total, away_total = record['HOME_TOTAL_GOAL_SO_FAR'], record['AWAY_TOTAL_GOAL_SO_FAR']
    return home_total - away_total

# get_goal_so_far_diff only looks up two keys and subtracts them, so calling it
# on the whole frame does the subtraction column-wise in one step rather than
# once per row through apply(axis=1). Values and index are the same.
goal_so_far_diff_pd = get_goal_so_far_diff(result_with_goal_sofar_pd)

# The two running-total columns are replaced by their difference.
result_with_goal_sofar_pd.drop(columns=['HOME_TOTAL_GOAL_SO_FAR', 'AWAY_TOTAL_GOAL_SO_FAR'], inplace=True)

# Insert the new feature as the second column, stored as pandas' nullable Int64.
result_with_goal_sofar_pd.insert(loc=1, column="GOAL_SO_FAR_DIFF", value=goal_so_far_diff_pd.astype('Int64'))

result_with_goal_sofar_pd

Unnamed: 0,ELO_DIFF,GOAL_SO_FAR_DIFF,Home_Team,Away_Team,Result,Home_Score,Away_Score,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Link,Season,Round,League
0,5,0,Watford,Middlesbrough,1-0,1.0,0.0,,,https://www.besoccer.com/match/watford-fc/midd...,2021,1,championship
1,-7,0,Birmingham City,Brentford,1-0,1.0,0.0,,,https://www.besoccer.com/match/birmingham-city...,2021,1,championship
2,-7,0,Wycombe Wanderers,Rotherham United,0-1,0.0,1.0,,,https://www.besoccer.com/match/wycombe-wandere...,2021,1,championship
3,6,0,AFC Bournemouth,Blackburn Rovers,3-2,3.0,2.0,,,https://www.besoccer.com/match/afc-bournemouth...,2021,1,championship
4,-3,0,Barnsley,Luton Town,0-1,0.0,1.0,,,https://www.besoccer.com/match/barnsley-fc/lut...,2021,1,championship
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146636,5,10,Pescara,Padova,1-2,1.0,2.0,5.0,-3.0,https://www.besoccer.com/match/pescara-calcio/...,1997,38,serie_b
146637,3,15,Genoa,Palermo FC,4-1,4.0,1.0,2.0,-1.0,https://www.besoccer.com/match/genoa/palermo/1...,1997,38,serie_b
146638,9,6,Torino,Ravenna FC,0-4,0.0,4.0,-2.0,-2.0,https://www.besoccer.com/match/torino-fc/raven...,1997,38,serie_b
146639,0,-7,Salernitana,Reggina,1-3,1.0,3.0,-2.0,3.0,https://www.besoccer.com/match/salernitana-cal...,1997,38,serie_b


In [21]:
def get_recent_goal_diff_diff(record):
    """Return 'HOME_LASTEST_GOAL_DIFF' minus 'AWAY_LASTEST_GOAL_DIFF' for `record`."""
    home_recent = record['HOME_LASTEST_GOAL_DIFF']
    away_recent = record['AWAY_LASTEST_GOAL_DIFF']
    gap = home_recent - away_recent
    return gap

# get_recent_goal_diff_diff only looks up two keys and subtracts them, so
# calling it on the whole frame does the subtraction column-wise in one step
# rather than once per row through apply(axis=1). Missing values stay missing.
recent_perf_diff_pd = get_recent_goal_diff_diff(result_with_goal_sofar_pd)

# The two recent-form columns are replaced by their difference.
result_with_goal_sofar_pd.drop(columns=['HOME_LASTEST_GOAL_DIFF', 'AWAY_LASTEST_GOAL_DIFF'], inplace=True)

# Insert the new feature as the third column; nullable Int64 keeps the gaps as <NA>.
result_with_goal_sofar_pd.insert(loc=2, column="RECENT_PERF_DIFF", value=recent_perf_diff_pd.astype('Int64'))

result_with_goal_sofar_pd

Unnamed: 0,ELO_DIFF,GOAL_SO_FAR_DIFF,RECENT_PERF_DIFF,Home_Team,Away_Team,Result,Home_Score,Away_Score,Link,Season,Round,League
0,5,0,,Watford,Middlesbrough,1-0,1.0,0.0,https://www.besoccer.com/match/watford-fc/midd...,2021,1,championship
1,-7,0,,Birmingham City,Brentford,1-0,1.0,0.0,https://www.besoccer.com/match/birmingham-city...,2021,1,championship
2,-7,0,,Wycombe Wanderers,Rotherham United,0-1,0.0,1.0,https://www.besoccer.com/match/wycombe-wandere...,2021,1,championship
3,6,0,,AFC Bournemouth,Blackburn Rovers,3-2,3.0,2.0,https://www.besoccer.com/match/afc-bournemouth...,2021,1,championship
4,-3,0,,Barnsley,Luton Town,0-1,0.0,1.0,https://www.besoccer.com/match/barnsley-fc/lut...,2021,1,championship
...,...,...,...,...,...,...,...,...,...,...,...,...
146636,5,10,8,Pescara,Padova,1-2,1.0,2.0,https://www.besoccer.com/match/pescara-calcio/...,1997,38,serie_b
146637,3,15,3,Genoa,Palermo FC,4-1,4.0,1.0,https://www.besoccer.com/match/genoa/palermo/1...,1997,38,serie_b
146638,9,6,0,Torino,Ravenna FC,0-4,0.0,4.0,https://www.besoccer.com/match/torino-fc/raven...,1997,38,serie_b
146639,0,-7,-5,Salernitana,Reggina,1-3,1.0,3.0,https://www.besoccer.com/match/salernitana-cal...,1997,38,serie_b


In [22]:
# Discard every row that has a missing value in ANY column.
# NOTE(review): this also considers the metadata columns that are removed in a
# later cell, so a row lacking only e.g. a Link would be dropped too.
result_with_goal_sofar_pd = result_with_goal_sofar_pd.dropna(axis=0, how='any')
result_with_goal_sofar_pd

Unnamed: 0,ELO_DIFF,GOAL_SO_FAR_DIFF,RECENT_PERF_DIFF,Home_Team,Away_Team,Result,Home_Score,Away_Score,Link,Season,Round,League
36,-16,-1,-2,Coventry City,AFC Bournemouth,1-3,1.0,3.0,https://www.besoccer.com/match/coventry-city/a...,2021,4,championship
37,2,2,7,Norwich City,Derby County,0-1,0.0,1.0,https://www.besoccer.com/match/norwich-city-fc...,2021,4,championship
38,-2,8,9,Blackburn Rovers,Cardiff City,0-0,0.0,0.0,https://www.besoccer.com/match/blackburn-rover...,2021,4,championship
39,10,3,9,Luton Town,Wycombe Wanderers,2-0,2.0,0.0,https://www.besoccer.com/match/luton-town-fc/w...,2021,4,championship
40,15,2,2,Middlesbrough,Barnsley,2-1,2.0,1.0,https://www.besoccer.com/match/middlesbrough-f...,2021,4,championship
...,...,...,...,...,...,...,...,...,...,...,...,...
146636,5,10,8,Pescara,Padova,1-2,1.0,2.0,https://www.besoccer.com/match/pescara-calcio/...,1997,38,serie_b
146637,3,15,3,Genoa,Palermo FC,4-1,4.0,1.0,https://www.besoccer.com/match/genoa/palermo/1...,1997,38,serie_b
146638,9,6,0,Torino,Ravenna FC,0-4,0.0,4.0,https://www.besoccer.com/match/torino-fc/raven...,1997,38,serie_b
146639,0,-7,-5,Salernitana,Reggina,1-3,1.0,3.0,https://www.besoccer.com/match/salernitana-cal...,1997,38,serie_b


In [23]:
# Remove the columns that are not used as model inputs. 'Result' here is the
# raw score string (e.g. "1-0"), not a numeric label.
unused_columns = ['Result', 'Link', 'League', 'Season', 'Round', 'Home_Team', 'Away_Team']
result_with_goal_sofar_pd.drop(columns=unused_columns, inplace=True)
result_with_goal_sofar_pd

Unnamed: 0,ELO_DIFF,GOAL_SO_FAR_DIFF,RECENT_PERF_DIFF,Home_Score,Away_Score
36,-16,-1,-2,1.0,3.0
37,2,2,7,0.0,1.0
38,-2,8,9,0.0,0.0
39,10,3,9,2.0,0.0
40,15,2,2,2.0,1.0
...,...,...,...,...,...
146636,5,10,8,1.0,2.0
146637,3,15,3,4.0,1.0
146638,9,6,0,0.0,4.0
146639,0,-7,-5,1.0,3.0


In [24]:
# Encode the match outcome as an integer: 1 = home win, 0 = draw, -1 = away win
def get_result(record):
    """Return the outcome code for one match.

    Parameters
    ----------
    record : mapping-like
        Must provide 'Home_Score' and 'Away_Score'.

    Returns
    -------
    int or pd.NA
        1 when the home score is higher, -1 when the away score is higher,
        0 when they are equal, and pd.NA when either score is missing.
    """
    hscore = record['Home_Score']
    ascore = record['Away_Score']
    # pd.isna also recognises float NaN and None. An identity test against
    # pd.NA misses NaN, which then fails both comparisons and would be
    # labelled -1 (away win).
    if pd.isna(hscore) or pd.isna(ascore):
        return pd.NA
    if hscore > ascore:
        return 1
    if hscore < ascore:
        return -1
    return 0

# Compute the outcome code for every remaining match.
result_pd = result_with_goal_sofar_pd.apply(get_result, axis=1)

# Remove the raw scores (the label is derived from them).
result_with_goal_sofar_pd.drop(columns=['Home_Score', 'Away_Score'], inplace=True)

# Append the label as the last column, stored as nullable Int64.
last_position = result_with_goal_sofar_pd.shape[1]
result_with_goal_sofar_pd.insert(loc=last_position, column="Result", value=result_pd.astype('Int64'))
result_with_goal_sofar_pd

Unnamed: 0,ELO_DIFF,GOAL_SO_FAR_DIFF,RECENT_PERF_DIFF,Result
36,-16,-1,-2,-1
37,2,2,7,-1
38,-2,8,9,0
39,10,3,9,1
40,15,2,2,1
...,...,...,...,...
146636,5,10,8,-1
146637,3,15,3,1
146638,9,6,0,-1
146639,0,-7,-5,-1


In [25]:
# Convert the frame to a NumPy array: the three *_DIFF feature columns followed by Result.
array = result_with_goal_sofar_pd.values
array

array([[-16, -1, -2, -1],
       [2, 2, 7, -1],
       [-2, 8, 9, 0],
       ...,
       [9, 6, 0, -1],
       [0, -7, -5, -1],
       [10, 0, 2, 1]], dtype=object)

In [26]:
# The first three columns are the features; the column right after them is the label.
feature_count = 3
X = array[:, :feature_count].astype('int')
y = array[:, feature_count].astype('int')

In [27]:
# Min-max scaling: map every feature onto the range [0, 3]
from sklearn.preprocessing import MinMaxScaler
from numpy import set_printoptions

scaler = MinMaxScaler(feature_range=(0, 3))
scaler.fit(X)
rescaledX = scaler.transform(X)

# Show three decimals whenever NumPy arrays are printed from here on
set_printoptions(precision=3)

In [28]:
# Alternative preprocessing: standardise the features with StandardScaler.
# Note that this rebinds `scaler`, replacing the MinMaxScaler held under that name.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
standardScaledX = scaler.fit_transform(X)

In [29]:
test_size = 0.3
seed = 7

# Split the RAW features first, so the held-out rows play no part in fitting
# the scaler. Fitting the scaler on all rows before splitting leaks test-set
# statistics into the preprocessing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed)

# Scaler and classifier live in one pipeline: the scaler is fit on the training
# rows only, and the object saved as the baseline carries its own preprocessing,
# so it can be applied directly to unscaled feature rows after loading.
model = make_pipeline(MinMaxScaler(feature_range=(0, 3)), LogisticRegression())
model.fit(X_train, Y_train)

# Mean accuracy on the held-out 30 %
result = model.score(X_test, Y_test)

print("Accuracy: %.3f%%" % (result*100.0))

Accuracy: 48.797%


In [30]:
# Persist the fitted baseline to disk (`load` is imported here for the next cell)
from joblib import dump, load

baseline_file = 'baseline.joblib'
dump(model, baseline_file)

['baseline.joblib']

In [31]:
# Read the persisted baseline back from disk
saved_model_path = 'baseline.joblib'
loaded_model = load(saved_model_path)

Some findings:

KNN:
- Very simple 
- Saving the model means saving all of the sample data along with it
- Training is fast, prediction is slow