Milestone 4 - Model Training

Task 3: Train and tune other models

This task focuses on steps 3 to 5 of the workflow described in the first task of this milestone. There are many different models you can train, such as KNN, decision trees and random forests. Tune each candidate before making a decision, not the other way around: first tune all of them, and then check which one performs best on the testing set. Remember not to overfit! Some models, such as decision trees, are prone to overfitting, so even if a model performs very well on the training set, make sure that it also performs well on the testing set. If it does not, you can add some form of regularisation. Once you have picked a model, save it as model.joblib.

In [None]:
import re
import pandas as pd
import os
import numpy as np
from csv import reader
import plotly.express as px
import missingno as msno
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

pd.options.mode.chained_assignment = None

In [None]:
# Number of most recent matches that count as a team's "recent" form.
# NOTE(review): not referenced in the visible cells; presumably it was used when
# the *_LASTEST_GOAL_DIFF features were built — confirm. The "PREFORMANCE"
# spelling is kept as-is because other code may reference this exact name.
RECENT_PREFORMANCE_MATCH_COUNT = 3

In [None]:
# Load the cleaned match dataset from CSV. Re-running this cell resets the
# working DataFrame to whatever is stored on disk.
result_with_goal_sofar_pd = pd.read_csv('cleaned_dataset.csv')
result_with_goal_sofar_pd

In [None]:
# Remove every row that has at least one missing value (dropna defaults),
# rebinding the name to the filtered DataFrame.
result_with_goal_sofar_pd = result_with_goal_sofar_pd.dropna()
result_with_goal_sofar_pd

In [None]:

# Remove the columns that are not wanted at this stage: the stored 'Result'
# (it is recomputed from the scores further down) and the 'Link' column.
# League, Season, Round, Home_Team and Away_Team are deliberately kept here.
result_with_goal_sofar_pd.drop(columns='Result', inplace=True)
result_with_goal_sofar_pd.drop(columns='Link', inplace=True)
result_with_goal_sofar_pd

In [None]:

# Move the identifying columns to the front and the two ELO columns to
# positions 5 and 6. Each column is popped and re-inserted, one at a time and
# in this exact order, so the target positions are interpreted against the
# frame as it looks after the previous move.
for _target_pos, _col_name in (
    (0, 'League'),
    (1, 'Season'),
    (2, 'Round'),
    (5, 'ELO_HOME'),
    (6, 'ELO_AWAY'),
):
    _moved_values = result_with_goal_sofar_pd.pop(_col_name)
    result_with_goal_sofar_pd.insert(_target_pos, _col_name, _moved_values)
result_with_goal_sofar_pd

In [None]:
# find who win H:Home A:Away D:Draw
def get_result(record):
    """Encode the outcome of one match as an integer class label.

    Args:
        record: Mapping-like row (e.g. a DataFrame row) providing
            'Home_Score' and 'Away_Score'.

    Returns:
        0 for a draw, 1 for a home win, 2 for an away win, or ``pd.NA`` when
        either score is missing.
    """
    hscore = record['Home_Score']
    ascore = record['Away_Score']
    # pd.isna recognises None, NaN and pd.NA alike. An identity check against
    # pd.NA misses NaN, and every comparison with NaN is False, so a missing
    # score would otherwise fall through and be labelled as an away win.
    if pd.isna(hscore) or pd.isna(ascore):
        return pd.NA
    if hscore == ascore:
        return 0
    if hscore > ascore:
        return 1
    return 2

# Label every match row with its outcome code via get_result.
result_pd = result_with_goal_sofar_pd.apply(get_result, axis=1)

# Home_Score / Away_Score are intentionally left in place at this point.
# Append the labels as the last column, stored as nullable integers.
result_with_goal_sofar_pd.insert(
    loc=result_with_goal_sofar_pd.shape[1],
    column="Result",
    value=result_pd.astype('Int64'),
)
result_with_goal_sofar_pd

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

def get_ELO_diff(record):
    """Return the home ELO minus the away ELO for one match row."""
    return record['ELO_HOME'] - record['ELO_AWAY']

def get_goal_so_far_diff(record):
    """Return HOME_TOTAL_GOAL_SO_FAR minus AWAY_TOTAL_GOAL_SO_FAR for one row."""
    home_goals = record['HOME_TOTAL_GOAL_SO_FAR']
    away_goals = record['AWAY_TOTAL_GOAL_SO_FAR']
    return home_goals - away_goals

def get_recent_goal_diff_diff(record):
    """Return HOME_LASTEST_GOAL_DIFF minus AWAY_LASTEST_GOAL_DIFF for one row."""
    home_recent = record['HOME_LASTEST_GOAL_DIFF']
    away_recent = record['AWAY_LASTEST_GOAL_DIFF']
    return home_recent - away_recent

def preprocess_data(result_pd):
    """Collapse paired home/away columns into difference features, in place.

    Each home/away pair is replaced by a single nullable-integer
    ``home - away`` column: ELO_DIFF at position 1, GOAL_SO_FAR_DIFF at
    position 2 and RECENT_PERF_DIFF at position 3. The score, season, round
    and team columns are then removed.

    Args:
        result_pd: DataFrame holding the paired columns and the columns to
            drop. It is modified in place, so pass a copy to keep the original.

    Returns:
        The same DataFrame object that was passed in. An empty frame is
        returned untouched.

    Raises:
        KeyError: If an expected column is missing.
    """
    if result_pd.shape[0] == 0:
        return result_pd

    # (new feature, home column, away column), listed in target column order.
    diff_features = (
        ("ELO_DIFF", "ELO_HOME", "ELO_AWAY"),
        ("GOAL_SO_FAR_DIFF", "HOME_TOTAL_GOAL_SO_FAR", "AWAY_TOTAL_GOAL_SO_FAR"),
        ("RECENT_PERF_DIFF", "HOME_LASTEST_GOAL_DIFF", "AWAY_LASTEST_GOAL_DIFF"),
    )
    for position, (feature, home_col, away_col) in enumerate(diff_features, start=1):
        # Column-wise subtraction avoids a Python-level pass over every row.
        diff = result_pd[home_col] - result_pd[away_col]
        result_pd.drop(columns=[home_col, away_col], inplace=True)
        result_pd.insert(loc=position, column=feature, value=diff.astype('Int64'))

    # These columns are no longer needed once the difference features exist.
    result_pd.drop(
        columns=[
            'Home_Score',
            'Away_Score',
            'Season',
            'Round',
            'Home_Team',
            'Away_Team',
        ],
        inplace=True,
    )

    return result_pd


def getKNNModelforLeague(result_league_pd, league):
    """Train a KNN classifier for one league (or all leagues) and print its accuracy.

    Args:
        result_league_pd: Feature table with a "League" column. Columns 1-3
            are read positionally as the features and column 4 as the label.
            NOTE(review): this assumes the layout produced by preprocess_data
            (League, three diff features, Result) — confirm no other columns
            sit in between.
        league: League name to filter on, or "*" to use every row.

    Returns:
        The fitted KNeighborsClassifier, or None when the selection has too
        few rows (fewer than two) to be split into train and test sets.
    """
    if league != "*":
        filter_result_league_pd = result_league_pd[result_league_pd["League"] == league]
    else:
        filter_result_league_pd = result_league_pd

    # Skip leagues with no data; a single row is skipped as well because it
    # cannot be divided into a non-empty train set and a non-empty test set.
    if filter_result_league_pd.shape[0] < 2:
        return None

    array = filter_result_league_pd.values
    X = array[:, 1:4].astype('int')
    y = array[:, 4].astype('int')

    test_size = 0.3
    seed = 7
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )

    # scikit-learn rejects n_neighbors larger than the number of fitted
    # samples, so cap it for small leagues instead of crashing.
    n_neighbors = min(15, len(X_train))
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    result = knn.score(X_test, y_test)

    scope = "" if league == "*" else " for " + league
    print("KNN Accuracy%s is %.3f%%" % (scope, result * 100.0))

    return knn

In [None]:
# Build the model-ready feature table. A copy is passed because
# preprocess_data drops and inserts columns on its argument in place.
result_pd = preprocess_data(result_with_goal_sofar_pd.copy())
result_pd

In [None]:
# Evaluate KNN on the whole dataset first, then on each league separately.
getKNNModelforLeague(result_pd, "*")

# League names are taken from the sub-directory names under ./Results.
# The path lives in `results_dir` so that the builtin dir() is not shadowed,
# and the names are sorted so the printed report has a stable order.
results_dir = "./Results"
leagues = sorted(
    name for name in os.listdir(results_dir)
    if os.path.isdir(os.path.join(results_dir, name))
)
for league in leagues:
    getKNNModelforLeague(result_pd, league)

In [None]:
# Logistic-regression baseline on the full feature table.
# The target is a class label (0/1/2), so a classifier is used: its score() is
# mean accuracy, whereas LinearRegression.score() would report R^2.
from sklearn.linear_model import LinearRegression  # kept in case other cells rely on it
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# The train/test arrays must be built here: the ones created inside
# getKNNModelforLeague are local to that function and never reach this scope.
# NOTE(review): assumes result_pd has the layout produced by preprocess_data
# (features in columns 1-3, label in column 4) — confirm.
_baseline_values = result_pd.values
X_train, X_test, Y_train, Y_test = train_test_split(
    _baseline_values[:, 1:4].astype('int'),
    _baseline_values[:, 4].astype('int'),
    test_size=0.3,
    random_state=7,
)

# A higher iteration cap gives the solver room to converge on unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

result = model.score(X_test, Y_test)

print("Accuracy: %.3f%%" % (result*100.0))