Milestone 4 - Model Training

Task 3: Train and tune other models

This task focuses on steps 3 to 5 of the workflow described in the first task of this milestone. There are many different algorithms you can use to train your model, such as KNN, decision trees and random forests. Tune each candidate before making a decision, not the other way around: first tune all of them, then check which one performs best on the testing set. Remember not to overfit! Some models, such as decision trees, are prone to overfitting, so even if they perform very well on the training set, make sure they also perform well on the testing set. If a model does overfit, you can apply different forms of regularisation. Once you have picked the model, save it as model.joblib.

In [1]:
import re
import pandas as pd
import os
import numpy as np
from csv import reader
import plotly.express as px
import missingno as msno
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

pd.options.mode.chained_assignment = None

In [2]:
# Number of most recent matches that count as a team's "recent performance".
# NOTE(review): this constant is not referenced in any of the cells visible here;
# presumably it documents how the *_LASTEST_GOAL_DIFF features were built upstream -
# confirm before changing it. The identifier's spelling ("PREFORMANCE") is kept as-is.
RECENT_PREFORMANCE_MATCH_COUNT = 3

In [8]:
# Load the cleaned match dataset from the working directory.
# Later cells modify this frame in place, so re-run this cell to reset the data.
result_with_goal_sofar_pd = pd.read_csv('cleaned_dataset.csv')
result_with_goal_sofar_pd

Unnamed: 0,Home_Team,Away_Team,Result,Home_Score,Away_Score,HOME_TOTAL_GOAL_SO_FAR,AWAY_TOTAL_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Link,ELO_HOME,ELO_AWAY,Season,Round,League
0,Watford,Middlesbrough,1-0,1.0,0.0,0,0,,,https://www.besoccer.com/match/watford-fc/midd...,65.0,60.0,2021,1,championship
1,Birmingham City,Brentford,1-0,1.0,0.0,0,0,,,https://www.besoccer.com/match/birmingham-city...,52.0,59.0,2021,1,championship
2,Wycombe Wanderers,Rotherham United,0-1,0.0,1.0,0,0,,,https://www.besoccer.com/match/wycombe-wandere...,41.0,48.0,2021,1,championship
3,AFC Bournemouth,Blackburn Rovers,3-2,3.0,2.0,0,0,,,https://www.besoccer.com/match/afc-bournemouth...,63.0,57.0,2021,1,championship
4,Barnsley,Luton Town,0-1,0.0,1.0,0,0,,,https://www.besoccer.com/match/barnsley-fc/lut...,47.0,50.0,2021,1,championship
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146636,Pescara,Padova,1-2,1.0,2.0,49,39,5.0,-3.0,https://www.besoccer.com/match/pescara-calcio/...,59.0,54.0,1997,38,serie_b
146637,Genoa,Palermo FC,4-1,4.0,1.0,54,39,2.0,-1.0,https://www.besoccer.com/match/genoa/palermo/1...,61.0,58.0,1997,38,serie_b
146638,Torino,Ravenna FC,0-4,0.0,4.0,45,39,-2.0,-2.0,https://www.besoccer.com/match/torino-fc/raven...,63.0,54.0,1997,38,serie_b
146639,Salernitana,Reggina,1-3,1.0,3.0,30,37,-2.0,3.0,https://www.besoccer.com/match/salernitana-cal...,52.0,52.0,1997,38,serie_b


In [9]:
# Keep only complete records: a row is discarded as soon as any one of its
# fields is missing (e.g. early-round matches with no *_LASTEST_GOAL_DIFF yet).
result_with_goal_sofar_pd = result_with_goal_sofar_pd.dropna(axis=0, how='any')
result_with_goal_sofar_pd

Unnamed: 0,Home_Team,Away_Team,Result,Home_Score,Away_Score,HOME_TOTAL_GOAL_SO_FAR,AWAY_TOTAL_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Link,ELO_HOME,ELO_AWAY,Season,Round,League
36,Coventry City,AFC Bournemouth,1-3,1.0,3.0,4,5,0.0,2.0,https://www.besoccer.com/match/coventry-city/a...,46.0,62.0,2021,4,championship
37,Norwich City,Derby County,0-1,0.0,1.0,3,1,0.0,-7.0,https://www.besoccer.com/match/norwich-city-fc...,62.0,60.0,2021,4,championship
38,Blackburn Rovers,Cardiff City,0-0,0.0,0.0,11,3,8.0,-1.0,https://www.besoccer.com/match/blackburn-rover...,58.0,60.0,2021,4,championship
39,Luton Town,Wycombe Wanderers,2-0,2.0,0.0,3,0,1.0,-8.0,https://www.besoccer.com/match/luton-town-fc/w...,51.0,41.0,2021,4,championship
40,Middlesbrough,Barnsley,2-1,2.0,1.0,2,0,-1.0,-3.0,https://www.besoccer.com/match/middlesbrough-f...,61.0,46.0,2021,4,championship
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146636,Pescara,Padova,1-2,1.0,2.0,49,39,5.0,-3.0,https://www.besoccer.com/match/pescara-calcio/...,59.0,54.0,1997,38,serie_b
146637,Genoa,Palermo FC,4-1,4.0,1.0,54,39,2.0,-1.0,https://www.besoccer.com/match/genoa/palermo/1...,61.0,58.0,1997,38,serie_b
146638,Torino,Ravenna FC,0-4,0.0,4.0,45,39,-2.0,-2.0,https://www.besoccer.com/match/torino-fc/raven...,63.0,54.0,1997,38,serie_b
146639,Salernitana,Reggina,1-3,1.0,3.0,30,37,-2.0,3.0,https://www.besoccer.com/match/salernitana-cal...,52.0,52.0,1997,38,serie_b


In [10]:

# Remove the columns that carry no modelling value: the raw 'Result' score string
# and the match 'Link' URL. League, Season, Round and the team names are kept for now.
result_with_goal_sofar_pd.drop(columns='Result', inplace=True)
result_with_goal_sofar_pd.drop(columns='Link', inplace=True)
result_with_goal_sofar_pd

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,HOME_TOTAL_GOAL_SO_FAR,AWAY_TOTAL_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,ELO_HOME,ELO_AWAY,Season,Round,League
36,Coventry City,AFC Bournemouth,1.0,3.0,4,5,0.0,2.0,46.0,62.0,2021,4,championship
37,Norwich City,Derby County,0.0,1.0,3,1,0.0,-7.0,62.0,60.0,2021,4,championship
38,Blackburn Rovers,Cardiff City,0.0,0.0,11,3,8.0,-1.0,58.0,60.0,2021,4,championship
39,Luton Town,Wycombe Wanderers,2.0,0.0,3,0,1.0,-8.0,51.0,41.0,2021,4,championship
40,Middlesbrough,Barnsley,2.0,1.0,2,0,-1.0,-3.0,61.0,46.0,2021,4,championship
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146636,Pescara,Padova,1.0,2.0,49,39,5.0,-3.0,59.0,54.0,1997,38,serie_b
146637,Genoa,Palermo FC,4.0,1.0,54,39,2.0,-1.0,61.0,58.0,1997,38,serie_b
146638,Torino,Ravenna FC,0.0,4.0,45,39,-2.0,-2.0,63.0,54.0,1997,38,serie_b
146639,Salernitana,Reggina,1.0,3.0,30,37,-2.0,3.0,52.0,52.0,1997,38,serie_b


In [11]:

# Reorder the columns in place: each column is popped and re-inserted at a fixed
# position. The inserts run one after another, so every position refers to the frame
# as it looks after the previous insert - do not reorder these statements.
result_with_goal_sofar_pd.insert(0, 'League', result_with_goal_sofar_pd.pop('League'))
result_with_goal_sofar_pd.insert(1, 'Season', result_with_goal_sofar_pd.pop('Season'))
result_with_goal_sofar_pd.insert(2, 'Round', result_with_goal_sofar_pd.pop('Round'))
result_with_goal_sofar_pd.insert(5, 'ELO_HOME', result_with_goal_sofar_pd.pop('ELO_HOME'))
result_with_goal_sofar_pd.insert(6, 'ELO_AWAY', result_with_goal_sofar_pd.pop('ELO_AWAY'))
# Disabled alternative: place the two ELO columns first instead.
#result_with_goal_sofar_pd.insert(0, 'ELO_HOME', result_with_goal_sofar_pd.pop('ELO_HOME'))
#result_with_goal_sofar_pd.insert(1, 'ELO_AWAY', result_with_goal_sofar_pd.pop('ELO_AWAY'))
result_with_goal_sofar_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,ELO_HOME,ELO_AWAY,Home_Score,Away_Score,HOME_TOTAL_GOAL_SO_FAR,AWAY_TOTAL_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF
36,championship,2021,4,Coventry City,AFC Bournemouth,46.0,62.0,1.0,3.0,4,5,0.0,2.0
37,championship,2021,4,Norwich City,Derby County,62.0,60.0,0.0,1.0,3,1,0.0,-7.0
38,championship,2021,4,Blackburn Rovers,Cardiff City,58.0,60.0,0.0,0.0,11,3,8.0,-1.0
39,championship,2021,4,Luton Town,Wycombe Wanderers,51.0,41.0,2.0,0.0,3,0,1.0,-8.0
40,championship,2021,4,Middlesbrough,Barnsley,61.0,46.0,2.0,1.0,2,0,-1.0,-3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146636,serie_b,1997,38,Pescara,Padova,59.0,54.0,1.0,2.0,49,39,5.0,-3.0
146637,serie_b,1997,38,Genoa,Palermo FC,61.0,58.0,4.0,1.0,54,39,2.0,-1.0
146638,serie_b,1997,38,Torino,Ravenna FC,63.0,54.0,0.0,4.0,45,39,-2.0,-2.0
146639,serie_b,1997,38,Salernitana,Reggina,52.0,52.0,1.0,3.0,30,37,-2.0,3.0


In [12]:
# Encode who won, from the home side's point of view:
# 1 = home win, 0 = draw, -1 = away win.
def get_result(record):
    """Return the outcome code of a single match.

    Parameters
    ----------
    record : mapping-like row (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide 'Home_Score' and 'Away_Score'.

    Returns
    -------
    int or pd.NA
        1 if the home team scored more, -1 if the away team scored more,
        0 for a draw, and ``pd.NA`` when either score is missing.
    """
    home_goals = record['Home_Score']
    away_goals = record['Away_Score']

    # pd.isna() recognises pd.NA, float NaN and None alike. An identity check
    # against pd.NA misses NaN, and every comparison with NaN is False, so a
    # missing score used to fall through to the "away win" branch.
    if pd.isna(home_goals) or pd.isna(away_goals):
        return pd.NA

    if home_goals == away_goals:
        return 0
    return 1 if home_goals > away_goals else -1

# Compute the outcome code of every match, one row at a time.
result_pd = result_with_goal_sofar_pd.apply(get_result, axis=1)

# Home_Score / Away_Score stay in the frame at this stage.
# The outcome is appended as the last column, stored as nullable Int64.
result_with_goal_sofar_pd.insert(
    loc=result_with_goal_sofar_pd.shape[1],
    column="Result",
    value=result_pd.astype('Int64'),
)
result_with_goal_sofar_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,ELO_HOME,ELO_AWAY,Home_Score,Away_Score,HOME_TOTAL_GOAL_SO_FAR,AWAY_TOTAL_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Result
36,championship,2021,4,Coventry City,AFC Bournemouth,46.0,62.0,1.0,3.0,4,5,0.0,2.0,-1
37,championship,2021,4,Norwich City,Derby County,62.0,60.0,0.0,1.0,3,1,0.0,-7.0,-1
38,championship,2021,4,Blackburn Rovers,Cardiff City,58.0,60.0,0.0,0.0,11,3,8.0,-1.0,0
39,championship,2021,4,Luton Town,Wycombe Wanderers,51.0,41.0,2.0,0.0,3,0,1.0,-8.0,1
40,championship,2021,4,Middlesbrough,Barnsley,61.0,46.0,2.0,1.0,2,0,-1.0,-3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146636,serie_b,1997,38,Pescara,Padova,59.0,54.0,1.0,2.0,49,39,5.0,-3.0,-1
146637,serie_b,1997,38,Genoa,Palermo FC,61.0,58.0,4.0,1.0,54,39,2.0,-1.0,1
146638,serie_b,1997,38,Torino,Ravenna FC,63.0,54.0,0.0,4.0,45,39,-2.0,-2.0,-1
146639,serie_b,1997,38,Salernitana,Reggina,52.0,52.0,1.0,3.0,30,37,-2.0,3.0,-1


In [13]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

def get_ELO_diff(record):
    """Return the home team's ELO minus the away team's ELO for one match row."""
    return record['ELO_HOME'] - record['ELO_AWAY']

def get_goal_so_far_diff(record):
    """Return home minus away cumulative goals ('*_TOTAL_GOAL_SO_FAR') for one match row."""
    home_total = record['HOME_TOTAL_GOAL_SO_FAR']
    return home_total - record['AWAY_TOTAL_GOAL_SO_FAR']

def get_recent_goal_diff_diff(record):
    """Return home minus away recent goal difference ('*_LASTEST_GOAL_DIFF') for one match row."""
    home_recent, away_recent = record['HOME_LASTEST_GOAL_DIFF'], record['AWAY_LASTEST_GOAL_DIFF']
    return home_recent - away_recent

def preprocess_data(result_pd):
    """Collapse each home/away feature pair of a match table into one difference column.

    The frame is modified IN PLACE and that same object is returned, so pass a
    copy if the original must be preserved.

    Parameters
    ----------
    result_pd : DataFrame
        Match table containing the ELO, cumulative-goal and recent-goal-difference
        columns for both sides, plus the score, season, round and team columns.

    Returns
    -------
    DataFrame
        ``result_pd`` itself. A frame with no rows is returned untouched; otherwise
        'ELO_DIFF', 'GOAL_SO_FAR_DIFF' and 'RECENT_PERF_DIFF' (nullable Int64) are
        inserted at positions 1, 2 and 3 and the source / identifier columns are removed.
    """
    if len(result_pd) == 0:
        return result_pd

    # (row-wise difference function, home column, away column, insert position, new column)
    diff_specs = (
        (get_ELO_diff, 'ELO_HOME', 'ELO_AWAY', 1, "ELO_DIFF"),
        (get_goal_so_far_diff, 'HOME_TOTAL_GOAL_SO_FAR', 'AWAY_TOTAL_GOAL_SO_FAR', 2, "GOAL_SO_FAR_DIFF"),
        (get_recent_goal_diff_diff, 'HOME_LASTEST_GOAL_DIFF', 'AWAY_LASTEST_GOAL_DIFF', 3, "RECENT_PERF_DIFF"),
    )
    for diff_fn, home_col, away_col, position, new_col in diff_specs:
        # The difference is computed before its two source columns are dropped.
        diff_values = result_pd.apply(diff_fn, axis=1)
        result_pd.drop(home_col, axis=1, inplace=True)
        result_pd.drop(away_col, axis=1, inplace=True)
        result_pd.insert(loc=position, column=new_col, value=diff_values.astype('Int64'))

    # Columns that are not used as model inputs.
    for leftover in ('Home_Score', 'Away_Score', 'Season', 'Round', 'Home_Team', 'Away_Team'):
        result_pd.drop(leftover, axis=1, inplace=True)

    return result_pd


def getKNNModelforLeague(result_league_pd, league, n_neighbors=15, test_size=0.3, seed=7):
    """Train a k-nearest-neighbours classifier for one league and print its hold-out accuracy.

    Parameters
    ----------
    result_league_pd : DataFrame
        Table laid out as returned by ``preprocess_data``: a 'League' column at
        position 0, the three integer feature columns at positions 1-3 and the
        outcome label at position 4. Columns are read by position.
    league : str
        League name to select from the 'League' column, or "*" to use every row.
    n_neighbors : int, default 15
        Number of neighbours; capped at the number of training rows.
    test_size : float, default 0.3
        Share of the rows held out for scoring.
    seed : int, default 7
        ``random_state`` of the train/test split.

    Returns
    -------
    KNeighborsClassifier or None
        The fitted classifier, so the caller can keep or persist it, or None
        when there are no rows for ``league``.
    """
    if league == "*":
        league_rows = result_league_pd.copy()
    else:
        league_rows = result_league_pd[result_league_pd["League"] == league]

    if league_rows.shape[0] == 0:
        # skip empty league data
        return None

    values = league_rows.values
    X = values[:, 1:4].astype('int')
    y = values[:, 4].astype('int')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )

    # A neighbour query needs n_neighbors <= number of fitted samples, so cap k
    # for leagues with only a handful of matches instead of failing when scoring.
    # NOTE(review): accuracy measured on such tiny leagues is very noisy.
    knn = KNeighborsClassifier(n_neighbors=min(n_neighbors, len(X_train)))
    knn.fit(X_train, y_train)

    accuracy = knn.score(X_test, y_test)

    if league == "*":
        print("KNN Accuracy is %.3f%%" % (accuracy*100.0))
    else:
        print("KNN Accuracy for " + league + " is %.3f%%" % (accuracy*100.0))

    return knn

In [14]:
# Build the modelling table from a copy: preprocess_data modifies its argument in
# place, and the fully detailed frame should stay available for re-runs.
result_pd = preprocess_data(result_with_goal_sofar_pd.copy())
result_pd

Unnamed: 0,League,ELO_DIFF,GOAL_SO_FAR_DIFF,RECENT_PERF_DIFF,Result
36,championship,-16,-1,-2,-1
37,championship,2,2,7,-1
38,championship,-2,8,9,0
39,championship,10,3,9,1
40,championship,15,2,2,1
...,...,...,...,...,...
146636,serie_b,5,10,8,-1
146637,serie_b,3,15,3,1
146638,serie_b,9,6,0,-1
146639,serie_b,0,-7,-5,-1


In [15]:
# Baseline: one model trained on every league together.
getKNNModelforLeague(result_pd, "*")

# Then one model per league. League names are the sub-directory names of ./Results;
# names without matching rows in the data are skipped by getKNNModelforLeague.
# The path is called results_dir (not dir) so the built-in dir() is not shadowed
# for the rest of the kernel session.
results_dir = "./Results"
leagues = [name for name in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, name))]
for league in leagues:
    getKNNModelforLeague(result_pd, league)

KNN Accuracy is 44.871%
KNN Accuracy for championship is 43.871%
KNN Accuracy for primeira_liga is 50.102%
KNN Accuracy for ligue_1 is 45.050%
KNN Accuracy for segunda_division is 41.620%
KNN Accuracy for 2_liga is 42.246%
KNN Accuracy for serie_a is 46.623%
KNN Accuracy for bundesliga is 46.349%
KNN Accuracy for primera_division is 47.193%
KNN Accuracy for ligue_2 is 42.021%
KNN Accuracy for premier_league is 46.643%
KNN Accuracy for eredivisie is 50.801%
KNN Accuracy for segunda_liga is 93.846%
KNN Accuracy for serie_b is 41.193%


In [None]:
# Linear-model baseline on the same features and the same hold-out split settings
# (30% test, seed 7) as the KNN run above.
# The split is built here because no X_train / Y_train exist at notebook level;
# the ones inside getKNNModelforLeague are local to that function.
feature_columns = ["ELO_DIFF", "GOAL_SO_FAR_DIFF", "RECENT_PERF_DIFF"]
X = result_pd[feature_columns].astype('int').values
Y = result_pd["Result"].astype('int').values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

# The target is a class label (-1 / 0 / 1), so a classifier is used: its score()
# is the mean accuracy that the message below reports. LinearRegression.score()
# would return R^2, which is not an accuracy.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

result = model.score(X_test, Y_test)

print("Accuracy: %.3f%%" % (result*100.0))