Update your README file with what you have done for this milestone. Don't forget to include the conclusions you have drawn from the models performance and how you would improve the model/data.

In [45]:
import re
import pandas as pd
import os
import numpy as np
from csv import reader
import plotly.express as px
import missingno as msno
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

from joblib import dump, load
import joblib

pd.options.mode.chained_assignment = None

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler
from numpy import set_printoptions

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

In [155]:
import plotly.graph_objects as go

In [3]:
# define recent matches meaning
RECENT_PREFORMANCE_MATCH_COUNT = 3

In [4]:
# Prepare current season league data
current_season_pd = pd.read_csv("results_for_prediction.csv")
current_season_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Home_Score,Away_Score,Elo_home,Elo_away,Result
0,championship,2022,1,AFC Bournemouth,West Bromwich Albion,2,2,69,61,0
1,championship,2022,1,Blackburn Rovers,Swansea City,2,1,58,67,1
2,championship,2022,1,Bristol City,Blackpool,1,1,56,45,0
3,championship,2022,1,Cardiff City,Barnsley,1,1,60,52,0
4,championship,2022,1,Derby County,Huddersfield Town,1,1,57,55,0
...,...,...,...,...,...,...,...,...,...,...
4238,serie_b,2022,32,Nuova Cosenza,Parma,1,3,44,39,0
4239,serie_b,2022,32,Benevento,Pisa SC,5,1,52,50,1
4240,serie_b,2022,32,Brescia,Vicenza,2,0,58,56,1
4241,serie_b,2022,32,Como,AC Monza,2,0,42,32,1


In [5]:
current_season_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4243 entries, 0 to 4242
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   League      4243 non-null   object
 1   Season      4243 non-null   int64 
 2   Round       4243 non-null   int64 
 3   Home_Team   4243 non-null   object
 4   Away_Team   4243 non-null   object
 5   Home_Score  4243 non-null   int64 
 6   Away_Score  4243 non-null   int64 
 7   Elo_home    4243 non-null   int64 
 8   Elo_away    4243 non-null   int64 
 9   Result      4243 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 331.6+ KB


In [6]:
def getLeagueSeasonTeamBeforeRoundTotalGoal(data, league, season, team, round):
    # determine home or away and get the score 
    # get home game of the team
    home_pd = data[(data["League"]==league) & (data["Home_Team"]==team) & (data["Season"]==season) & (data["Round"]<round)]
    home_total_score = home_pd['Home_Score'].astype('Int64').sum()

    # get away game of the team
    away_pd = data[(data["League"]==league) & (data["Away_Team"]==team) & (data["Season"]==season) & (data["Round"]<round)]
    away_total_score = home_pd['Away_Score'].astype('Int64').sum()

    # calculate total goals
    return home_total_score, away_total_score


def fillWithTotalGoalSoFar(record, data):
    # get home team and away team and round
    league = record['League']
    season = record['Season']
    round = record['Round']
    hteam = record['Home_Team']
    ateam = record['Away_Team']
    
    hometeam_home_goal_so_far, hometeam_away_goal_so_far = getLeagueSeasonTeamBeforeRoundTotalGoal(data, league, season, hteam, round)
    awayteam_home_goal_so_far, awayteam_away_goal_so_far = getLeagueSeasonTeamBeforeRoundTotalGoal(data, league, season, ateam, round)

    return [hometeam_home_goal_so_far, hometeam_away_goal_so_far, awayteam_home_goal_so_far, awayteam_away_goal_so_far]

In [8]:
def findRecentPreviousRounds(currentRound, limit):
    if currentRound<=limit:
        return None
    else:
        r = []
        for l in range(limit):
            r.append(currentRound - (limit-l))
        return r


def findLeagueSeasonTeamRecentPreviousRounds(data, league, season, team, round):
    rounds = findRecentPreviousRounds(round, RECENT_PREFORMANCE_MATCH_COUNT)         # can change for optimization
    if rounds is None:
        return None

    previous_matches_pd =  data[(data["League"]==league) & ((data["Home_Team"]==team) | (data["Away_Team"]==team)) & (data["Season"]==season) & (data["Round"].isin(rounds))]
    recent_perf = 0
    for index, row in previous_matches_pd.iterrows():
        hteam = row['Home_Team']
        ateam = row['Away_Team']
        if hteam==team:
            recent_perf = recent_perf + (row['Home_Score']-row['Away_Score'])
        else:
            recent_perf = recent_perf + (row['Away_Score']-row['Home_Score'])

    return recent_perf


def fillWithRecentPerformance(record, data):
    # get home team and away team and round
    league = record['League']
    season = record['Season']
    round = record['Round']
    hteam = record['Home_Team']
    ateam = record['Away_Team']
    
    home_team_goal_diff = findLeagueSeasonTeamRecentPreviousRounds(data, league, season, hteam, round)
    away_team_goal_diff = findLeagueSeasonTeamRecentPreviousRounds(data, league, season, ateam, round)

    #print(round, hteam, ateam, home_team_goal_diff, away_team_goal_diff)

    return [home_team_goal_diff, away_team_goal_diff]

In [9]:
def getLeagueData(data, league, season=None):
    if season is None:
        league_pd =  data[(data["League"]==league)]
    else:
        league_pd =  data[(data["League"]==league) & (data["Season"]==season)]
    return league_pd

In [10]:
# load all directory as league name list
dir = "./Results"
leagues = [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))]

# loop to open csv
result_with_goal_sofar_pd = pd.DataFrame()
for league in leagues:
    current_league_season_pd = getLeagueData(current_season_pd, league, season=None)
    current_league_season_pd = current_league_season_pd.reset_index(drop=True)

    if len(current_league_season_pd)>0:
        # get home team and away team total goal so far
        home_away_total_goal_sofar = current_league_season_pd.apply(fillWithTotalGoalSoFar, data=current_season_pd, axis=1)
        goal_so_far_list = np.array(home_away_total_goal_sofar.values.tolist())         # convert to list
        home_away_total_goal_sofar_pd = pd.DataFrame(goal_so_far_list, columns=["HOMETEAM_HOME_GOAL_SO_FAR", "HOMETEAM_AWAY_GOAL_SO_FAR", "AWAYTEAM_HOME_GOAL_SO_FAR", "AWAYTEAM_AWAY_GOAL_SO_FAR"])    # convert to dataframe
        current_league_season_pd.insert(loc=5, column="HOMETEAM_HOME_GOAL_SO_FAR", value=home_away_total_goal_sofar_pd["HOMETEAM_HOME_GOAL_SO_FAR"].astype('Int64')) 
        current_league_season_pd.insert(loc=6, column="HOMETEAM_AWAY_GOAL_SO_FAR", value=home_away_total_goal_sofar_pd["HOMETEAM_AWAY_GOAL_SO_FAR"].astype('Int64')) 
        current_league_season_pd.insert(loc=7, column="AWAYTEAM_HOME_GOAL_SO_FAR", value=home_away_total_goal_sofar_pd["AWAYTEAM_HOME_GOAL_SO_FAR"].astype('Int64'))     
        current_league_season_pd.insert(loc=8, column="AWAYTEAM_AWAY_GOAL_SO_FAR", value=home_away_total_goal_sofar_pd["AWAYTEAM_AWAY_GOAL_SO_FAR"].astype('Int64')) 

        # get recent performance
        home_away_recent_perf = current_league_season_pd.apply(fillWithRecentPerformance, data=current_season_pd, axis=1)
        perf_list = np.array(home_away_recent_perf.values.tolist())
        home_away_perf_pd = pd.DataFrame(perf_list, columns=["HOME_LASTEST_GOAL_DIFF", "AWAY_LASTEST_GOAL_DIFF"])
        current_league_season_pd.insert(loc=9, column="HOME_LASTEST_GOAL_DIFF", value=home_away_perf_pd["HOME_LASTEST_GOAL_DIFF"].astype('Int64')) 
        current_league_season_pd.insert(loc=10, column="AWAY_LASTEST_GOAL_DIFF", value=home_away_perf_pd["AWAY_LASTEST_GOAL_DIFF"].astype('Int64')) 

        result_with_goal_sofar_pd = pd.concat([result_with_goal_sofar_pd, current_league_season_pd])

result_with_goal_sofar_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Home_Score,Away_Score,Elo_home,Elo_away,Result
0,championship,2022,1,AFC Bournemouth,West Bromwich Albion,0,0,0,0,,,2,2,69,61,0
1,championship,2022,1,Blackburn Rovers,Swansea City,0,0,0,0,,,2,1,58,67,1
2,championship,2022,1,Bristol City,Blackpool,0,0,0,0,,,1,1,56,45,0
3,championship,2022,1,Cardiff City,Barnsley,0,0,0,0,,,1,1,60,52,0
4,championship,2022,1,Derby County,Huddersfield Town,0,0,0,0,,,1,1,57,55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,serie_b,2022,32,Nuova Cosenza,Parma,15,17,19,14,-3,1,1,3,44,39,0
315,serie_b,2022,32,Benevento,Pisa SC,30,15,22,11,0,2,5,1,52,50,1
316,serie_b,2022,32,Brescia,Vicenza,23,19,18,22,0,-3,2,0,58,56,1
317,serie_b,2022,32,Como,AC Monza,16,15,31,13,1,8,2,0,42,32,1


In [11]:
result_with_goal_sofar_pd.drop('Home_Score', inplace=True, axis=1)
result_with_goal_sofar_pd.drop('Away_Score', inplace=True, axis=1)
result_with_goal_sofar_pd.insert(5, 'Elo_home', result_with_goal_sofar_pd.pop('Elo_home'))
result_with_goal_sofar_pd.insert(6, 'Elo_away', result_with_goal_sofar_pd.pop('Elo_away'))
result_with_goal_sofar_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Result
0,championship,2022,1,AFC Bournemouth,West Bromwich Albion,69,61,0,0,0,0,,,0
1,championship,2022,1,Blackburn Rovers,Swansea City,58,67,0,0,0,0,,,1
2,championship,2022,1,Bristol City,Blackpool,56,45,0,0,0,0,,,0
3,championship,2022,1,Cardiff City,Barnsley,60,52,0,0,0,0,,,0
4,championship,2022,1,Derby County,Huddersfield Town,57,55,0,0,0,0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,serie_b,2022,32,Nuova Cosenza,Parma,44,39,15,17,19,14,-3,1,0
315,serie_b,2022,32,Benevento,Pisa SC,52,50,30,15,22,11,0,2,1
316,serie_b,2022,32,Brescia,Vicenza,58,56,23,19,18,22,0,-3,1
317,serie_b,2022,32,Como,AC Monza,42,32,16,15,31,13,1,8,1


In [12]:
new_season_pd = result_with_goal_sofar_pd.dropna()
new_season_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Result
36,championship,2022,4,Bristol City,Swansea City,56,66,1,1,1,3,0,-3,0
37,championship,2022,4,Queens Park Rangers,Barnsley,54,53,1,1,1,1,4,0,0
38,championship,2022,4,AFC Bournemouth,Blackpool,68,46,2,2,0,3,3,-3,0
39,championship,2022,4,Blackburn Rovers,West Bromwich Albion,57,60,2,1,7,2,2,5,0
40,championship,2022,4,Cardiff City,Millwall,59,56,1,1,2,3,2,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,serie_b,2022,32,Nuova Cosenza,Parma,44,39,15,17,19,14,-3,1,0
315,serie_b,2022,32,Benevento,Pisa SC,52,50,30,15,22,11,0,2,1
316,serie_b,2022,32,Brescia,Vicenza,58,56,23,19,18,22,0,-3,1
317,serie_b,2022,32,Como,AC Monza,42,32,16,15,31,13,1,8,1


In [13]:
old_season_pd = pd.read_csv("cleaned_dataset_b.csv")
old_season_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Result
0,championship,2021,4,Coventry City,AFC Bournemouth,46.0,62.0,3,2,4,2,0,2,0
1,championship,2021,4,Norwich City,Derby County,62.0,60.0,2,2,0,6,0,-7,0
2,championship,2021,4,Blackburn Rovers,Cardiff City,58.0,60.0,5,0,1,4,8,-1,0
3,championship,2021,4,Luton Town,Wycombe Wanderers,51.0,41.0,2,1,0,3,1,-8,1
4,championship,2021,4,Middlesbrough,Barnsley,61.0,46.0,1,1,0,1,-1,-3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111647,serie_b,1997,38,Pescara,Padova,59.0,54.0,32,15,22,15,5,-3,0
111648,serie_b,1997,38,Genoa,Palermo FC,61.0,58.0,33,12,24,24,2,-1,1
111649,serie_b,1997,38,Torino,Ravenna FC,63.0,54.0,27,23,22,18,-2,-2,0
111650,serie_b,1997,38,Salernitana,Reggina,52.0,52.0,20,7,23,18,-2,3,0


In [14]:
# concat old data and new data
all_data_pd = pd.concat([old_season_pd, new_season_pd], ignore_index=True)
all_data_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Result
0,championship,2021,4,Coventry City,AFC Bournemouth,46.0,62.0,3,2,4,2,0,2,0
1,championship,2021,4,Norwich City,Derby County,62.0,60.0,2,2,0,6,0,-7,0
2,championship,2021,4,Blackburn Rovers,Cardiff City,58.0,60.0,5,0,1,4,8,-1,0
3,championship,2021,4,Luton Town,Wycombe Wanderers,51.0,41.0,2,1,0,3,1,-8,1
4,championship,2021,4,Middlesbrough,Barnsley,61.0,46.0,1,1,0,1,-1,-3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115476,serie_b,2022,32,Nuova Cosenza,Parma,44.0,39.0,15,17,19,14,-3,1,0
115477,serie_b,2022,32,Benevento,Pisa SC,52.0,50.0,30,15,22,11,0,2,1
115478,serie_b,2022,32,Brescia,Vicenza,58.0,56.0,23,19,18,22,0,-3,1
115479,serie_b,2022,32,Como,AC Monza,42.0,32.0,16,15,31,13,1,8,1


In [39]:
# Read in results_for_prediction.csv
team_predict_pd = pd.read_csv("to_predict.csv")
team_predict_pd

Unnamed: 0,ELO_DIFF,RECENT_PERF_DIFF,HOME_AWAY_GOAL_DIFF,Link,HOME_WIN,League
0,15,5,5,https://www.besoccer.com/match/hull-city/hudde...,0,championship
1,-18,-6,-6,https://www.besoccer.com/match/blackpool-fc/no...,0,championship
2,14,6,6,https://www.besoccer.com/match/afc-bournemouth...,1,championship
3,-10,3,3,https://www.besoccer.com/match/barnsley-fc/rea...,0,championship
4,-18,1,1,https://www.besoccer.com/match/cardiff-city-fc...,0,championship
...,...,...,...,...,...,...
133,12,0,0,https://www.besoccer.com/match/parma-fc/como/2...,0,serie_b
134,-5,-1,-1,https://www.besoccer.com/match/as-cittadella/p...,1,serie_b
135,12,1,1,https://www.besoccer.com/match/ac-monza-brianz...,0,serie_b
136,-4,-7,-7,https://www.besoccer.com/match/pisa-calcio/bre...,0,serie_b


In [None]:
# train the model


In [17]:
# Create functions to filter different league
def getLeagueData(data, league, seasonFrom=None):
    if seasonFrom is None:
        league_pd =  data[(data["League"]==league)]
    else:
        league_pd =  data[(data["League"]==league) & (data["Season"]>=seasonFrom)]
    return league_pd

In [18]:
def get_ELO_diff(record):
    hscore = record['Elo_home']
    ascore = record['Elo_away']
    return (hscore - ascore)

In [19]:
def get_recent_goal_diff_diff(record):
    hscore = record['HOME_LASTEST_GOAL_DIFF']
    ascore = record['AWAY_LASTEST_GOAL_DIFF']
    return hscore - ascore

In [20]:
def get_home_away_total_goal_diff(record):
    hgoal = record['HOMETEAM_HOME_GOAL_SO_FAR']
    agoal = record['AWAYTEAM_AWAY_GOAL_SO_FAR']
    return hgoal - agoal

In [None]:
# LOOP ALL MODEL TYPE, LOOP ALL FROM YEAR, GET ALL PREDICT AND CALCULATE ALL SCORE
# ...

In [81]:
def getPredictScore(data):
    predict_val = data["PREDICTION"]
    actual_val = data["HOME_WIN"]
    if (predict_val == actual_val):
        return 1
    else:
        return 0

def model_predict(league, fromSeason, mtype):
    league_predict_pd = getLeagueData(team_predict_pd, league)
    
    # load model
    model = joblib.load('./models/' + mtype + '_' + league + '_from' + str(fromSeason) + '.joblib')

    # prepare data to input
    array = league_predict_pd.values
    input = array[:,0:3].astype('int')

    # scaler
    scaler = MinMaxScaler(feature_range=(0, 8))
    rescaledX = scaler.fit_transform(input)

    # summarize transformed data
    set_printoptions(precision=3)

    # predict
    prediction = model.predict(rescaledX)

    prediction_pd = pd.DataFrame(prediction, columns=["PREDICTION"])

    league_predict_pd = league_predict_pd.reset_index(drop=True)
    league_predict_pd.insert(loc=4, column="PREDICTION", value=prediction_pd["PREDICTION"].astype('Int64')) 

    # get predict score
    correct_pd = league_predict_pd.apply(getPredictScore, axis=1)
    return correct_pd.sum() / len(correct_pd)

In [94]:
def tryModels(league, season, X, y):
    test_size = 0.3
    seed = 42
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

    statistic = []

    # Model creation
    print("LogisticRegression")
    model = LogisticRegression() 
    model.fit(X_train, Y_train)
    dump(model, './models/logisticregression_' + league + '_from' + str(season) + '.joblib')      # Save the model
    train_accuracy = model.score(X_train, Y_train) 
    print("Accuracy for train: %.3f%%" % (train_accuracy*100.0))
    test_accuracy = model.score(X_test, Y_test) 
    print("Accuracy for test: %.3f%%" % (test_accuracy*100.0))
    # predict
    predict_accuracy = model_predict(league, season, "logisticregression")
    print("Accuracy for predict: %.3f%%" % (predict_accuracy*100.0))
    print()
    statistic.append([league, season, "Regression", train_accuracy, test_accuracy, predict_accuracy])

    # KNN 
    print("KNN")
    knn = KNeighborsClassifier(n_neighbors=15)
    knn.fit(X_train, Y_train)
    dump(knn, './models/knn_' + league + '_from' + str(season) + '.joblib')      # Save the model
    train_accuracy = knn.score(X_train, Y_train) 
    print("Accuracy for train: %.3f%%" % (train_accuracy*100.0))
    test_accuracy = knn.score(X_test, Y_test) 
    print("Accuracy for test: %.3f%%" % (test_accuracy*100.0))
    # predict
    predict_accuracy = model_predict(league, season, "knn")
    print("Accuracy for predict: %.3f%%" % (predict_accuracy*100.0))
    print()
    statistic.append([league, season, "KNN", train_accuracy, test_accuracy, predict_accuracy])

    # decision trees  
    print("decision trees")
    clf = DecisionTreeClassifier()
    clf.fit(X_train, Y_train)
    dump(clf, './models/decisiontrees_' + league + '_from' + str(season) + '.joblib')      # Save the model
    train_accuracy = clf.score(X_train, Y_train) 
    print("Accuracy for train: %.3f%%" % (train_accuracy*100.0))
    test_accuracy = clf.score(X_test, Y_test) 
    print("Accuracy for test: %.3f%%" % (test_accuracy*100.0))
    # predict
    predict_accuracy = model_predict(league, season, "decisiontrees")
    print("Accuracy for predict: %.3f%%" % (predict_accuracy*100.0))
    print()
    statistic.append([league, season, "DecisionTree", train_accuracy, test_accuracy, predict_accuracy])


    # random forests
    print("random forests")
    model = RandomForestClassifier(n_estimators=10, max_depth = 4)
    model.fit(X_train, Y_train)
    dump(model, './models/randomforest_' + league + '_from' + str(season) + '.joblib')      # Save the model
    train_accuracy = model.score(X_train, Y_train) 
    print("Accuracy for train: %.3f%%" % (train_accuracy*100.0))
    test_accuracy = model.score(X_test, Y_test) 
    print("Accuracy for test: %.3f%%" % (test_accuracy*100.0))
    # predict
    predict_accuracy = model_predict(league, season, "randomforest")
    print("Accuracy for predict: %.3f%%" % (predict_accuracy*100.0))
    print()
    statistic.append([league, season, "RandomForest", train_accuracy, test_accuracy, predict_accuracy])

    return statistic     # statistic data for plot chart
    

In [95]:
# load all directory as league name list
dir = "./Results"
leagues = [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))]

league_score_statistic = []

# loop to open csv
result_with_goal_sofar_pd = pd.DataFrame()
for league in leagues:
    print(league)
    print("****************\n")

    for fromYear in [1990, 1995, 2000, 2005, 2010, 2015]:
        model_pd = getLeagueData(all_data_pd, league, fromYear)
        model_pd = model_pd.dropna()

        if (model_pd.shape[0]==0):
            continue

        elo_diff_pd = model_pd.apply(get_ELO_diff, axis=1)
        model_pd.drop('Elo_home', inplace=True, axis=1)
        model_pd.drop('Elo_away', inplace=True, axis=1)
        model_pd.insert(loc=5, column="ELO_DIFF", value=elo_diff_pd.astype('Int64')) 
        
        recent_perf_diff_pd = model_pd.apply(get_recent_goal_diff_diff, axis=1)
        model_pd.drop('HOME_LASTEST_GOAL_DIFF', inplace=True, axis=1)
        model_pd.drop('AWAY_LASTEST_GOAL_DIFF', inplace=True, axis=1)
        model_pd.insert(loc=6, column="RECENT_PERF_DIFF", value=recent_perf_diff_pd.astype('Int64')) 

        goal_diff_pd = model_pd.apply(get_home_away_total_goal_diff, axis=1)
        model_pd.drop('HOMETEAM_HOME_GOAL_SO_FAR', inplace=True, axis=1)
        model_pd.drop('HOMETEAM_AWAY_GOAL_SO_FAR', inplace=True, axis=1)
        model_pd.drop('AWAYTEAM_HOME_GOAL_SO_FAR', inplace=True, axis=1)
        model_pd.drop('AWAYTEAM_AWAY_GOAL_SO_FAR', inplace=True, axis=1)
        model_pd.insert(loc=7, column="HOME_AWAY_GOAL_DIFF", value=recent_perf_diff_pd.astype('Int64')) 

        # delete no value column
        model_pd.drop('League', inplace=True, axis=1)
        model_pd.drop('Season', inplace=True, axis=1)
        model_pd.drop('Round', inplace=True, axis=1)
        model_pd.drop('Home_Team', inplace=True, axis=1)
        model_pd.drop('Away_Team', inplace=True, axis=1)

        array = model_pd.values
        X = array[:,0:(array.shape[1]-1)].astype('int')
        y = array[:,(array.shape[1]-1)].astype('int')

        # Scaler
        scaler = MinMaxScaler(feature_range=(0, 8))
        rescaledX = scaler.fit_transform(X)

        # summarize transformed data
        set_printoptions(precision=3)

        # Or Standardize
        #scaler = StandardScaler().fit(X)
        #rescaledX = scaler.transform(X)

        print(f"from {fromYear}")
        print("-------------------------------------")
        statistic = tryModels(league, fromYear, rescaledX, y)
        league_score_statistic.extend(statistic)


championship
****************

from 1990
-------------------------------------
LogisticRegression
Accuracy for train: 56.989%
Accuracy for test: 55.714%
Accuracy for predict: 75.000%

KNN
Accuracy for train: 61.657%
Accuracy for test: 53.605%
Accuracy for predict: 66.667%

decision trees
Accuracy for train: 71.141%
Accuracy for test: 54.490%
Accuracy for predict: 41.667%

random forests
Accuracy for train: 57.864%
Accuracy for test: 56.122%
Accuracy for predict: 66.667%

from 1995
-------------------------------------
LogisticRegression
Accuracy for train: 58.772%
Accuracy for test: 57.610%
Accuracy for predict: 66.667%

KNN
Accuracy for train: 63.847%
Accuracy for test: 51.635%
Accuracy for predict: 75.000%

decision trees
Accuracy for train: 73.852%
Accuracy for test: 51.184%
Accuracy for predict: 33.333%

random forests
Accuracy for train: 59.497%
Accuracy for test: 57.159%
Accuracy for predict: 66.667%

from 2000
-------------------------------------
LogisticRegression
Accuracy for

In [98]:
league_score_statistic

[['championship',
  1990,
  'Regression',
  0.569886197840677,
  0.5571428571428572,
  0.75],
 ['championship',
  1990,
  'KNN',
  0.6165742632039685,
  0.5360544217687074,
  0.6666666666666666],
 ['championship',
  1990,
  'DecisionTree',
  0.7114093959731543,
  0.5448979591836735,
  0.4166666666666667],
 ['championship',
  1990,
  'RandomForest',
  0.5786402100962942,
  0.5612244897959183,
  0.6666666666666666],
 ['championship',
  1995,
  'Regression',
  0.5877235379410343,
  0.5760992108229989,
  0.6666666666666666],
 ['championship', 1995, 'KNN', 0.638472692121798, 0.5163472378804961, 0.75],
 ['championship',
  1995,
  'DecisionTree',
  0.7385210246495891,
  0.5118376550169109,
  0.3333333333333333],
 ['championship',
  1995,
  'RandomForest',
  0.5949734171097149,
  0.5715896279594137,
  0.6666666666666666],
 ['championship',
  2000,
  'Regression',
  0.5877235379410343,
  0.5760992108229989,
  0.6666666666666666],
 ['championship', 2000, 'KNN', 0.638472692121798, 0.5163472378804

In [99]:
# parpare dataframe
statistic_pd = pd.DataFrame(league_score_statistic, columns=["League", "From", "Model", "TrainScore", "TestScore", "PredictScore"]) 

In [100]:
statistic_pd


Unnamed: 0,League,From,Model,TrainScore,TestScore,PredictScore
0,championship,1990,Regression,0.569886,0.557143,0.750000
1,championship,1990,KNN,0.616574,0.536054,0.666667
2,championship,1990,DecisionTree,0.711409,0.544898,0.416667
3,championship,1990,RandomForest,0.578640,0.561224,0.666667
4,championship,1995,Regression,0.587724,0.576099,0.666667
...,...,...,...,...,...,...
331,serie_b,2010,RandomForest,0.591785,0.572360,0.900000
332,serie_b,2015,Regression,0.576793,0.588103,0.600000
333,serie_b,2015,KNN,0.617236,0.547699,0.600000
334,serie_b,2015,DecisionTree,0.710159,0.546577,0.500000


In [147]:
data = statistic_pd[ (statistic_pd["League"]=="championship") & (statistic_pd["From"]==1990) ]
data

Unnamed: 0,League,From,Model,TrainScore,TestScore,PredictScore
0,championship,1990,Regression,0.569886,0.557143,0.75
1,championship,1990,KNN,0.616574,0.536054,0.666667
2,championship,1990,DecisionTree,0.711409,0.544898,0.416667
3,championship,1990,RandomForest,0.57864,0.561224,0.666667


In [156]:
models = data.Model.unique()
models

array(['Regression', 'KNN', 'DecisionTree', 'RandomForest'], dtype=object)

In [160]:
# load all directory as league name list
dir = "./Results"
leagues = [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))]

league_score_statistic = []

# loop to open csv
result_with_goal_sofar_pd = pd.DataFrame()
for league in leagues:
    for fromYear in [1990, 1995, 2000, 2005, 2010, 2015]:

        data = statistic_pd[ (statistic_pd["League"]==league) & (statistic_pd["From"]==fromYear) ]
        fig = go.Figure(data=[
            go.Bar(name='Train', x=models, y=data.TrainScore.values*100),
            go.Bar(name='Test', x=models, y=data.TestScore.values*100),
            go.Bar(name='Predict', x=models, y=data.PredictScore.values*100),
        ])
        # Change the bar mode
        fig.update_layout(barmode='group', title=league + " (train data from " + str(fromYear) + ")", xaxis=dict(
                title="Model"
            ),
            yaxis=dict(
                title="Accuracy"
            ))
        fig.show()

In [154]:




data = statistic_pd[ (statistic_pd["League"]=="championship") & (statistic_pd["From"]==1990) ]
fig1 = go.Figure(data=[
    go.Bar(name='Train', x=models, y=data.TrainScore.values*100),
    go.Bar(name='Test', x=models, y=data.TestScore.values*100),
    go.Bar(name='Predict', x=models, y=data.PredictScore.values*100),
])
# Change the bar mode
fig1.update_layout(barmode='group', title="championship (train data from 1990)", xaxis=dict(
        title="Model"
    ),
    yaxis=dict(
        title="Accuracy"
    ))
fig1.show()
# pio.write_image(fig, "./reports/yourfile.png")

data = statistic_pd[ (statistic_pd["League"]=="championship") & (statistic_pd["From"]==1995) ]
fig2 = go.Figure(data=[
    go.Bar(name='Train', x=models, y=data.TrainScore.values*100),
    go.Bar(name='Test', x=models, y=data.TestScore.values*100),
    go.Bar(name='Predict', x=models, y=data.PredictScore.values*100),
])
# Change the bar mode
fig2.update_layout(barmode='group', title="championship (train data from 1995)", xaxis=dict(
        title="Model"
    ),
    yaxis=dict(
        title="Accuracy"
    ))
fig2.show()


In [150]:
from plotly.subplots import make_subplots
figc = make_subplots(rows=2, cols=1)

figc.append_trace(fig1, row=1, col=1)
figc.append_trace(fig2, row=2, col=1)
figc.update_layout(title_text="Stacked Subplots")
figc.show()

ValueError: 
    Invalid element(s) received for the 'data' property of 
        Invalid elements include: [Figure({
    'data': [{'name': 'Train',
              'type': 'bar',
              'x': array(['Regression', 'KNN', 'DecisionTree', 'RandomForest'], dtype=object),
              'y': array([56.989, 61.657, 71.141, 57.864])},
             {'name': 'Test',
              'type': 'bar',
              'x': array(['Regression', 'KNN', 'DecisionTree', 'RandomForest'], dtype=object),
              'y': array([55.714, 53.605, 54.49 , 56.122])},
             {'name': 'Predict',
              'type': 'bar',
              'x': array(['Regression', 'KNN', 'DecisionTree', 'RandomForest'], dtype=object),
              'y': array([75.   , 66.667, 41.667, 66.667])}],
    'layout': {'template': '...'}
})]

    The 'data' property is a tuple of trace instances
    that may be specified as:
      - A list or tuple of trace instances
        (e.g. [Scatter(...), Bar(...)])
      - A single trace instance
        (e.g. Scatter(...), Bar(...), etc.)
      - A list or tuple of dicts of string/value properties where:
        - The 'type' property specifies the trace type
            One of: ['bar', 'barpolar', 'box', 'candlestick',
                     'carpet', 'choropleth', 'choroplethmapbox',
                     'cone', 'contour', 'contourcarpet',
                     'densitymapbox', 'funnel', 'funnelarea',
                     'heatmap', 'heatmapgl', 'histogram',
                     'histogram2d', 'histogram2dcontour', 'icicle',
                     'image', 'indicator', 'isosurface', 'mesh3d',
                     'ohlc', 'parcats', 'parcoords', 'pie',
                     'pointcloud', 'sankey', 'scatter',
                     'scatter3d', 'scattercarpet', 'scattergeo',
                     'scattergl', 'scattermapbox', 'scatterpolar',
                     'scatterpolargl', 'scattersmith',
                     'scatterternary', 'splom', 'streamtube',
                     'sunburst', 'surface', 'table', 'treemap',
                     'violin', 'volume', 'waterfall']

        - All remaining properties are passed to the constructor of
          the specified trace type

        (e.g. [{'type': 'scatter', ...}, {'type': 'bar, ...}])