Milestone 5 - Model Training

Task 4: Iteratively train the model with different subsets of the data

Take a look at the dataset you have. It comprises data from 1990 to 2020. Do you think getting data from 1990 is representative of the current football data? However, removing the data from 1990 will also leave us with fewer data points. Try to find a balance between having a representative dataset and having a large dataset. You can do it by removing some data points from the dataset, and retrain the model with the remaining data. On many data science projects, you will use Notebooks to, apart from getting the clean data and training the model, explain the model and explain the results as if you were delivering the product to a customer. This is a good time to do it, create a notebook that explains the model you trained and the results you obtained and call it model_explained.ipynb.

In [31]:
import re
import pandas as pd
import os
import numpy as np
from csv import reader
import plotly.express as px
import missingno as msno
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler
from numpy import set_printoptions

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

pd.options.mode.chained_assignment = None

In [32]:
# READ IN cleaned_dataset.csv
full_pd = pd.read_csv("cleaned_dataset_b.csv")
full_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Result
0,championship,2021,4,Coventry City,AFC Bournemouth,46.0,62.0,3,2,4,2,0,2,0
1,championship,2021,4,Norwich City,Derby County,62.0,60.0,2,2,0,6,0,-7,0
2,championship,2021,4,Blackburn Rovers,Cardiff City,58.0,60.0,5,0,1,4,8,-1,0
3,championship,2021,4,Luton Town,Wycombe Wanderers,51.0,41.0,2,1,0,3,1,-8,1
4,championship,2021,4,Middlesbrough,Barnsley,61.0,46.0,1,1,0,1,-1,-3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111647,serie_b,1997,38,Pescara,Padova,59.0,54.0,32,15,22,15,5,-3,0
111648,serie_b,1997,38,Genoa,Palermo FC,61.0,58.0,33,12,24,24,2,-1,1
111649,serie_b,1997,38,Torino,Ravenna FC,63.0,54.0,27,23,22,18,-2,-2,0
111650,serie_b,1997,38,Salernitana,Reggina,52.0,52.0,20,7,23,18,-2,3,0


In [33]:
full_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111652 entries, 0 to 111651
Data columns (total 14 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   League                     111652 non-null  object 
 1   Season                     111652 non-null  int64  
 2   Round                      111652 non-null  int64  
 3   Home_Team                  111652 non-null  object 
 4   Away_Team                  111652 non-null  object 
 5   Elo_home                   111652 non-null  float64
 6   Elo_away                   111652 non-null  float64
 7   HOMETEAM_HOME_GOAL_SO_FAR  111652 non-null  int64  
 8   HOMETEAM_AWAY_GOAL_SO_FAR  111652 non-null  int64  
 9   AWAYTEAM_HOME_GOAL_SO_FAR  111652 non-null  int64  
 10  AWAYTEAM_AWAY_GOAL_SO_FAR  111652 non-null  int64  
 11  HOME_LASTEST_GOAL_DIFF     111652 non-null  int64  
 12  AWAY_LASTEST_GOAL_DIFF     111652 non-null  int64  
 13  Result                     11

In [34]:
# Create functions to filter different league
def getLeagueData(data, league, seasonFrom=None):
    if seasonFrom is None:
        league_pd =  data[(data["League"]==league)]
    else:
        league_pd =  data[(data["League"]==league) & (data["Season"]>=seasonFrom)]
    return league_pd

In [35]:
def get_ELO_diff(record):
    hscore = record['Elo_home']
    ascore = record['Elo_away']
    return (hscore - ascore)

In [36]:
def get_recent_goal_diff_diff(record):
    hscore = record['HOME_LASTEST_GOAL_DIFF']
    ascore = record['AWAY_LASTEST_GOAL_DIFF']
    return hscore - ascore

In [37]:
def get_home_away_total_goal_diff(record):
    hgoal = record['HOMETEAM_HOME_GOAL_SO_FAR']
    agoal = record['AWAYTEAM_AWAY_GOAL_SO_FAR']
    return hgoal - agoal

In [22]:
def tryModels(X, y):
    test_size = 0.3
    seed = 42
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

    # random forests
    model = RandomForestClassifier(n_estimators=10, max_depth = 4)
    model.fit(X_train, Y_train)
    result = model.score(X_train, Y_train) 
    print("Accuracy for train: %.3f%%" % (result*100.0))
    result = model.score(X_test, Y_test) 
    print("Accuracy for test: %.3f%%" % (result*100.0))
    print()

In [30]:
# load all directory as league name list
dir = "./Results"
leagues = [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))]

# loop to open csv
result_with_goal_sofar_pd = pd.DataFrame()
for league in leagues:
    for fromYear in [1990, 1995, 2000, 2005, 2010, 2015]:
        model_pd = getLeagueData(full_pd, league, fromYear)
        model_pd = model_pd.dropna()

        if (model_pd.shape[0]==0):
            continue

        elo_diff_pd = model_pd.apply(get_ELO_diff, axis=1)
        model_pd.drop('Elo_home', inplace=True, axis=1)
        model_pd.drop('Elo_away', inplace=True, axis=1)
        model_pd.insert(loc=5, column="ELO_DIFF", value=elo_diff_pd.astype('Int64')) 
        
        recent_perf_diff_pd = model_pd.apply(get_recent_goal_diff_diff, axis=1)
        model_pd.drop('HOME_LASTEST_GOAL_DIFF', inplace=True, axis=1)
        model_pd.drop('AWAY_LASTEST_GOAL_DIFF', inplace=True, axis=1)
        model_pd.insert(loc=6, column="RECENT_PERF_DIFF", value=recent_perf_diff_pd.astype('Int64')) 

        goal_diff_pd = model_pd.apply(get_home_away_total_goal_diff, axis=1)
        model_pd.drop('HOMETEAM_HOME_GOAL_SO_FAR', inplace=True, axis=1)
        model_pd.drop('HOMETEAM_AWAY_GOAL_SO_FAR', inplace=True, axis=1)
        model_pd.drop('AWAYTEAM_HOME_GOAL_SO_FAR', inplace=True, axis=1)
        model_pd.drop('AWAYTEAM_AWAY_GOAL_SO_FAR', inplace=True, axis=1)
        model_pd.insert(loc=7, column="HOME_AWAY_GOAL_DIFF", value=recent_perf_diff_pd.astype('Int64')) 

        # delete no value column
        model_pd.drop('League', inplace=True, axis=1)
        model_pd.drop('Season', inplace=True, axis=1)
        model_pd.drop('Round', inplace=True, axis=1)
        model_pd.drop('Home_Team', inplace=True, axis=1)
        model_pd.drop('Away_Team', inplace=True, axis=1)

        array = model_pd.values
        X = array[:,0:(array.shape[1]-1)].astype('int')
        y = array[:,(array.shape[1]-1)].astype('int')

        # Scaler
        scaler = MinMaxScaler(feature_range=(0, 8))
        rescaledX = scaler.fit_transform(X)

        # summarize transformed data
        set_printoptions(precision=3)

        # Or Standardize
        #scaler = StandardScaler().fit(X)
        #rescaledX = scaler.transform(X)

        print(f"{league} (from {fromYear})")
        print("-------------------------------------")
        tryModels(rescaledX, y)
    

championship (from 1990)
-------------------------------------
Accuracy for train: 57.949%
Accuracy for test: 56.129%

championship (from 1995)
-------------------------------------
Accuracy for train: 59.398%
Accuracy for test: 56.746%

championship (from 2000)
-------------------------------------
Accuracy for train: 59.057%
Accuracy for test: 55.026%

championship (from 2005)
-------------------------------------
Accuracy for train: 67.192%
Accuracy for test: 58.088%

championship (from 2010)
-------------------------------------
Accuracy for train: 69.716%
Accuracy for test: 55.882%

championship (from 2015)
-------------------------------------
Accuracy for train: 66.877%
Accuracy for test: 58.088%

primeira_liga (from 1990)
-------------------------------------
Accuracy for train: 63.853%
Accuracy for test: 63.897%

primeira_liga (from 1995)
-------------------------------------
Accuracy for train: 64.905%
Accuracy for test: 63.255%

primeira_liga (from 2000)
--------------------

In [44]:
# try to add regularisation
model_pd = getLeagueData(full_pd, "serie_b", 2011)
model_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Result
103305,serie_b,2011,4,Padova,Reggina,53.0,58.0,1,1,2,0,-1,1,1
103306,serie_b,2011,4,Vicenza,Livorno,57.0,62.0,4,1,1,5,2,-1,0
103307,serie_b,2011,4,Siena,Cittadella,64.0,52.0,2,1,2,3,1,-3,1
103308,serie_b,2011,4,Sassuolo,Torino,54.0,59.0,1,1,2,3,3,-2,0
103309,serie_b,2011,4,Piacenza,Ascoli,59.0,60.0,1,1,3,1,-3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103729,serie_b,2011,42,Cittadella,Pescara,52.0,45.0,26,24,24,19,0,-2,1
103730,serie_b,2011,42,Varese,Piacenza,46.0,58.0,32,10,32,32,-2,-2,1
103731,serie_b,2011,42,Sassuolo,Reggina,54.0,58.0,24,20,21,11,-1,1,1
103732,serie_b,2011,42,AlbinoLeffe,Siena,55.0,64.0,31,30,45,14,-1,4,1


In [45]:
model_pd = model_pd.dropna()

elo_diff_pd = model_pd.apply(get_ELO_diff, axis=1)
model_pd.drop('Elo_home', inplace=True, axis=1)
model_pd.drop('Elo_away', inplace=True, axis=1)
model_pd.insert(loc=5, column="ELO_DIFF", value=elo_diff_pd.astype('Int64')) 

recent_perf_diff_pd = model_pd.apply(get_recent_goal_diff_diff, axis=1)
model_pd.drop('HOME_LASTEST_GOAL_DIFF', inplace=True, axis=1)
model_pd.drop('AWAY_LASTEST_GOAL_DIFF', inplace=True, axis=1)
model_pd.insert(loc=6, column="RECENT_PERF_DIFF", value=recent_perf_diff_pd.astype('Int64')) 

goal_diff_pd = model_pd.apply(get_home_away_total_goal_diff, axis=1)
model_pd.drop('HOMETEAM_HOME_GOAL_SO_FAR', inplace=True, axis=1)
model_pd.drop('HOMETEAM_AWAY_GOAL_SO_FAR', inplace=True, axis=1)
model_pd.drop('AWAYTEAM_HOME_GOAL_SO_FAR', inplace=True, axis=1)
model_pd.drop('AWAYTEAM_AWAY_GOAL_SO_FAR', inplace=True, axis=1)
model_pd.insert(loc=7, column="HOME_AWAY_GOAL_DIFF", value=recent_perf_diff_pd.astype('Int64')) 

# delete no value column
model_pd.drop('League', inplace=True, axis=1)
model_pd.drop('Season', inplace=True, axis=1)
model_pd.drop('Round', inplace=True, axis=1)
model_pd.drop('Home_Team', inplace=True, axis=1)
model_pd.drop('Away_Team', inplace=True, axis=1)

array = model_pd.values
X = array[:,0:(array.shape[1]-1)].astype('int')
y = array[:,(array.shape[1]-1)].astype('int')

# Scaler
scaler = MinMaxScaler(feature_range=(0, 8))
rescaledX = scaler.fit_transform(X)

# summarize transformed data
set_printoptions(precision=3)

test_size = 0.3
seed = 42
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

model = RandomForestClassifier()
model.fit(X_train, Y_train)
result = model.score(X_train, Y_train) 
print("Accuracy for train: %.3f%%" % (result*100.0))
result = model.score(X_test, Y_test) 
print("Accuracy for test: %.3f%%" % (result*100.0))
print()

Accuracy for train: 89.000%
Accuracy for test: 54.264%



In [46]:
from sklearn.tree import export_graphviz
import pydot

# Extract the small tree
tree_small = model.estimators_[3]
    
# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = ["ELO_DIFF","RECENT_PERF_DIFF","HOME_AWAY_GOAL_DIFF"], rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');

In [49]:
model = RandomForestClassifier(n_estimators=10, max_depth = 4)
model.fit(X_train, Y_train)
result = model.score(X_train, Y_train) 
print("Accuracy for train: %.3f%%" % (result*100.0))
result = model.score(X_test, Y_test) 
print("Accuracy for test: %.3f%%" % (result*100.0))
print()

Accuracy for train: 65.667%
Accuracy for test: 52.713%



In [50]:
from sklearn.tree import export_graphviz
import pydot

# Extract the small tree
tree_small = model.estimators_[3]
    
# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree_re.dot', feature_names = ["ELO_DIFF","RECENT_PERF_DIFF","HOME_AWAY_GOAL_DIFF"], rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree_re.dot')
graph.write_png('small_tree_re.png');

In [51]:
# Save the model
from joblib import dump, load
dump(model, 'baseline_t3.joblib')

['baseline_t3.joblib']

In [None]:
championship
-------------------------------------
Accuracy for train: 59.398%
Accuracy for test: 56.746%

primeira_liga
-------------------------------------
Accuracy for train: 65.611%
Accuracy for test: 63.407%

ligue_1
-------------------------------------
Accuracy for train: 59.887%
Accuracy for test: 59.827%

segunda_division
-------------------------------------
Accuracy for train: 57.512%
Accuracy for test: 57.433%

2_liga
-------------------------------------
Accuracy for train: 59.729%
Accuracy for test: 58.218%

serie_a
-------------------------------------
Accuracy for train: 65.601%
Accuracy for test: 63.596%

bundesliga
-------------------------------------
Accuracy for train: 61.625%
Accuracy for test: 61.172%

primera_division
-------------------------------------
Accuracy for train: 61.954%
Accuracy for test: 61.286%

ligue_2
-------------------------------------
Accuracy for train: 57.733%
Accuracy for test: 57.471%

premier_league
-------------------------------------
Accuracy for train: 64.738%
Accuracy for test: 64.636%

eredivisie
-------------------------------------
Accuracy for train: 66.816%
Accuracy for test: 67.131%

segunda_liga
-------------------------------------
Accuracy for train: 97.351%
Accuracy for test: 92.308%

serie_b
-------------------------------------
Accuracy for train: 58.189%
Accuracy for test: 57.524%
