The data you scraped needs to be cleaned. The pipeline you created should be used to clean the data. The cleaned data should be saved as cleaned_results.csv After going through the pipeline, you will be able to get the same features you used for training the model and put the in the files of To_Predict.zip. Save the csv files with the new features as to_predict.csv.

In [71]:
import re
import pandas as pd
import os
import numpy as np
from csv import reader
import plotly.express as px
import missingno as msno
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

pd.options.mode.chained_assignment = None

In [72]:
# define recent matches meaning
RECENT_PREFORMANCE_MATCH_COUNT = 3

In [73]:
# load all directory as league name list
dir = "./Predict/To_Predict/"
leagues = [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))]

# loop to open csv
predict_pd = pd.DataFrame()
for league in leagues:
    league_folder = os.path.join(dir, league)

    csv_file_for_league = [os.path.join(league_folder, name) for name in os.listdir(league_folder) if name.endswith('.csv')]
    pkl_file_for_league = [os.path.join(league_folder, name) for name in os.listdir(league_folder) if name.endswith('.pkl')]


    if len(csv_file_for_league)==1 and len(pkl_file_for_league)==1:
        csv_filename = csv_file_for_league[0]
        pkl_filename = pkl_file_for_league[0]

        # ,Home_Team,Away_Team,Link,Season,Round,League
        current_league_season_pd = pd.read_csv(csv_filename, skiprows=[0], names=["Home_Team", "Away_Team", "Link", "Season", "Round", "League"])

        if len(current_league_season_pd)>0:
            # load pickle and read content
            d = pickle.load(open(pkl_filename, 'rb'))
            elo_key_df = pd.DataFrame(d.keys(), columns=["link"])
            elo_val_df = pd.DataFrame.from_dict(d.values())
            elo_df = elo_key_df.join(elo_val_df)

            current_league_season_pd = current_league_season_pd.merge(elo_df, left_on='Link', right_on='link')

            predict_pd = pd.concat([predict_pd, current_league_season_pd])

predict_pd

Unnamed: 0,Home_Team,Away_Team,Link,Season,Round,League,link,Elo_home,Elo_away
0,Hull City,Huddersfield Town,https://www.besoccer.com/match/hull-city/hudde...,2022,40,championship,https://www.besoccer.com/match/hull-city/hudde...,70,55
1,Blackpool,Nottingham Forest,https://www.besoccer.com/match/blackpool-fc/no...,2022,40,championship,https://www.besoccer.com/match/blackpool-fc/no...,39,57
2,AFC Bournemouth,Bristol City,https://www.besoccer.com/match/afc-bournemouth...,2022,40,championship,https://www.besoccer.com/match/afc-bournemouth...,67,53
3,Barnsley,Reading,https://www.besoccer.com/match/barnsley-fc/rea...,2022,40,championship,https://www.besoccer.com/match/barnsley-fc/rea...,50,60
4,Cardiff City,Swansea City,https://www.besoccer.com/match/cardiff-city-fc...,2022,40,championship,https://www.besoccer.com/match/cardiff-city-fc...,57,75
...,...,...,...,...,...,...,...,...,...
5,Parma,Como,https://www.besoccer.com/match/parma-fc/como/2...,2022,33,serie_b,https://www.besoccer.com/match/parma-fc/como/2...,56,44
6,Cittadella,Perugia,https://www.besoccer.com/match/as-cittadella/p...,2022,33,serie_b,https://www.besoccer.com/match/as-cittadella/p...,57,62
7,AC Monza,Ascoli,https://www.besoccer.com/match/ac-monza-brianz...,2022,33,serie_b,https://www.besoccer.com/match/ac-monza-brianz...,64,52
8,Pisa SC,Brescia,https://www.besoccer.com/match/pisa-calcio/bre...,2022,33,serie_b,https://www.besoccer.com/match/pisa-calcio/bre...,57,61


In [74]:
predict_pd.drop('Link', inplace=True, axis=1)
predict_pd.drop('link', inplace=True, axis=1)
predict_pd

Unnamed: 0,Home_Team,Away_Team,Season,Round,League,Elo_home,Elo_away
0,Hull City,Huddersfield Town,2022,40,championship,70,55
1,Blackpool,Nottingham Forest,2022,40,championship,39,57
2,AFC Bournemouth,Bristol City,2022,40,championship,67,53
3,Barnsley,Reading,2022,40,championship,50,60
4,Cardiff City,Swansea City,2022,40,championship,57,75
...,...,...,...,...,...,...,...
5,Parma,Como,2022,33,serie_b,56,44
6,Cittadella,Perugia,2022,33,serie_b,57,62
7,AC Monza,Ascoli,2022,33,serie_b,64,52
8,Pisa SC,Brescia,2022,33,serie_b,57,61


In [75]:
# reorder dataframe column
predict_pd.insert(0, 'League', predict_pd.pop('League'))
predict_pd.insert(1, 'Season', predict_pd.pop('Season'))
predict_pd.insert(2, 'Round', predict_pd.pop('Round'))
predict_pd.insert(3, 'Home_Team', predict_pd.pop('Home_Team'))
predict_pd.insert(4, 'Away_Team', predict_pd.pop('Away_Team'))
predict_pd.insert(5, 'Elo_home', predict_pd.pop('Elo_home').astype('int'))
predict_pd.insert(6, 'Elo_away', predict_pd.pop('Elo_away').astype('int'))

predict_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away
0,championship,2022,40,Hull City,Huddersfield Town,70,55
1,championship,2022,40,Blackpool,Nottingham Forest,39,57
2,championship,2022,40,AFC Bournemouth,Bristol City,67,53
3,championship,2022,40,Barnsley,Reading,50,60
4,championship,2022,40,Cardiff City,Swansea City,57,75
...,...,...,...,...,...,...,...
5,serie_b,2022,33,Parma,Como,56,44
6,serie_b,2022,33,Cittadella,Perugia,57,62
7,serie_b,2022,33,AC Monza,Ascoli,64,52
8,serie_b,2022,33,Pisa SC,Brescia,57,61


In [76]:
# Read in results_for_prediction.csv
prediction_result_pd = pd.read_csv("results_for_prediction.csv")
prediction_result_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Home_Score,Away_Score,Elo_home,Elo_away,Result
0,championship,2022,1,AFC Bournemouth,West Bromwich Albion,2,2,69,61,0
1,championship,2022,1,Blackburn Rovers,Swansea City,2,1,58,67,1
2,championship,2022,1,Bristol City,Blackpool,1,1,56,45,0
3,championship,2022,1,Cardiff City,Barnsley,1,1,60,52,0
4,championship,2022,1,Derby County,Huddersfield Town,1,1,57,55,0
...,...,...,...,...,...,...,...,...,...,...
4238,serie_b,2022,32,Nuova Cosenza,Parma,1,3,44,39,0
4239,serie_b,2022,32,Benevento,Pisa SC,5,1,52,50,1
4240,serie_b,2022,32,Brescia,Vicenza,2,0,58,56,1
4241,serie_b,2022,32,Como,AC Monza,2,0,42,32,1


In [77]:
# Create functions to filter different league
def getLeagueData(data, league, season=None):
    if season is None:
        league_pd =  data[(data["League"]==league)]
    else:
        league_pd =  data[(data["League"]==league) & (data["Season"]==season)]
    return league_pd

In [78]:
# filter predict_result and predict dataframe
current_season_result = getLeagueData(prediction_result_pd, "serie_b")
current_season_result

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Home_Score,Away_Score,Elo_home,Elo_away,Result
3924,serie_b,2022,1,Frosinone,Parma,2,2,58,57,0
3925,serie_b,2022,1,Pordenone,Perugia,0,1,48,64,0
3926,serie_b,2022,1,Cittadella,Vicenza,1,0,59,49,1
3927,serie_b,2022,1,Reggina,AC Monza,0,0,43,54,0
3928,serie_b,2022,1,Ternana Calcio,Brescia,0,2,55,62,0
...,...,...,...,...,...,...,...,...,...,...
4238,serie_b,2022,32,Nuova Cosenza,Parma,1,3,44,39,0
4239,serie_b,2022,32,Benevento,Pisa SC,5,1,52,50,1
4240,serie_b,2022,32,Brescia,Vicenza,2,0,58,56,1
4241,serie_b,2022,32,Como,AC Monza,2,0,42,32,1


In [79]:
# filter predict_result and predict dataframe
team_predict_pd = getLeagueData(predict_pd, "serie_b")
team_predict_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away
0,serie_b,2022,33,Cremonese,US Alessandria,64,51
1,serie_b,2022,33,Pordenone,Frosinone,44,58
2,serie_b,2022,33,Reggina,Benevento,44,60
3,serie_b,2022,33,Ternana Calcio,Lecce,53,65
4,serie_b,2022,33,SPAL,Cosenza,57,45
5,serie_b,2022,33,Parma,Como,56,44
6,serie_b,2022,33,Cittadella,Perugia,57,62
7,serie_b,2022,33,AC Monza,Ascoli,64,52
8,serie_b,2022,33,Pisa SC,Brescia,57,61
9,serie_b,2022,33,Vicenza,Crotone,46,56


In [None]:
# Get HOMETEAM_HOME_GOAL_SO_FAR	HOMETEAM_AWAY_GOAL_SO_FAR	AWAYTEAM_HOME_GOAL_SO_FAR	AWAYTEAM_AWAY_GOAL_SO_FAR	HOME_LASTEST_GOAL_DIFF	AWAY_LASTEST_GOAL_DIFF

In [80]:
def getLeagueSeasonTeamBeforeRoundTotalGoal(data, team, round):
    # determine home or away and get the score 
    # get home game of the team
    home_pd = data[(data["Home_Team"]==team) & (data["Round"]<round)]
    home_total_score = home_pd['Home_Score'].astype('Int64').sum()

    # get away game of the team
    away_pd = data[(data["Away_Team"]==team) & (data["Round"]<round)]
    away_total_score = home_pd['Away_Score'].astype('Int64').sum()

    # calculate total goals
    return home_total_score, away_total_score


def fillWithTotalGoalSoFar(record, data):
    # get home team and away team and round
    round = record['Round']
    hteam = record['Home_Team']
    ateam = record['Away_Team']
    
    hometeam_home_goal_so_far, hometeam_away_goal_so_far = getLeagueSeasonTeamBeforeRoundTotalGoal(data, hteam, round)
    awayteam_home_goal_so_far, awayteam_away_goal_so_far = getLeagueSeasonTeamBeforeRoundTotalGoal(data, ateam, round)

    return [hometeam_home_goal_so_far, hometeam_away_goal_so_far, awayteam_home_goal_so_far, awayteam_away_goal_so_far]

In [81]:
# HOMETEAM_HOME_GOAL_SO_FAR	HOMETEAM_AWAY_GOAL_SO_FAR	AWAYTEAM_HOME_GOAL_SO_FAR	AWAYTEAM_AWAY_GOAL_SO_FAR
goal_so_far = team_predict_pd.apply(fillWithTotalGoalSoFar, data=current_season_result, axis=1)
goal_so_far

0    [27, 15, 14, 22]
1    [13, 27, 27, 12]
2    [17, 17, 35, 16]
3    [24, 19, 33, 14]
4      [24, 25, 0, 0]
5    [19, 14, 18, 15]
6    [20, 16, 15, 14]
7    [31, 13, 18, 17]
8    [22, 11, 25, 19]
9    [18, 22, 19, 21]
dtype: object

In [82]:
goal_so_far_list = np.array(goal_so_far.values.tolist()) 
goal_so_far_list

array([[27, 15, 14, 22],
       [13, 27, 27, 12],
       [17, 17, 35, 16],
       [24, 19, 33, 14],
       [24, 25,  0,  0],
       [19, 14, 18, 15],
       [20, 16, 15, 14],
       [31, 13, 18, 17],
       [22, 11, 25, 19],
       [18, 22, 19, 21]])

In [83]:
goal_so_far_pd = pd.DataFrame(goal_so_far_list, columns=["HOMETEAM_HOME_GOAL_SO_FAR", "HOMETEAM_AWAY_GOAL_SO_FAR", "AWAYTEAM_HOME_GOAL_SO_FAR", "AWAYTEAM_AWAY_GOAL_SO_FAR"])    # convert to dataframe
team_predict_pd.insert(loc=7, column="HOMETEAM_HOME_GOAL_SO_FAR", value=goal_so_far_pd["HOMETEAM_HOME_GOAL_SO_FAR"].astype('Int64')) 
team_predict_pd.insert(loc=8, column="HOMETEAM_AWAY_GOAL_SO_FAR", value=goal_so_far_pd["HOMETEAM_AWAY_GOAL_SO_FAR"].astype('Int64')) 
team_predict_pd.insert(loc=9, column="AWAYTEAM_HOME_GOAL_SO_FAR", value=goal_so_far_pd["AWAYTEAM_HOME_GOAL_SO_FAR"].astype('Int64'))     
team_predict_pd.insert(loc=10, column="AWAYTEAM_AWAY_GOAL_SO_FAR", value=goal_so_far_pd["AWAYTEAM_AWAY_GOAL_SO_FAR"].astype('Int64'))   
team_predict_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR
0,serie_b,2022,33,Cremonese,US Alessandria,64,51,27,15,14,22
1,serie_b,2022,33,Pordenone,Frosinone,44,58,13,27,27,12
2,serie_b,2022,33,Reggina,Benevento,44,60,17,17,35,16
3,serie_b,2022,33,Ternana Calcio,Lecce,53,65,24,19,33,14
4,serie_b,2022,33,SPAL,Cosenza,57,45,24,25,0,0
5,serie_b,2022,33,Parma,Como,56,44,19,14,18,15
6,serie_b,2022,33,Cittadella,Perugia,57,62,20,16,15,14
7,serie_b,2022,33,AC Monza,Ascoli,64,52,31,13,18,17
8,serie_b,2022,33,Pisa SC,Brescia,57,61,22,11,25,19
9,serie_b,2022,33,Vicenza,Crotone,46,56,18,22,19,21


In [84]:

def findRecentPreviousRounds(currentRound, limit):
    if currentRound<=limit:
        return None
    else:
        r = []
        for l in range(limit):
            r.append(currentRound - (limit-l))
        return r


def findLeagueSeasonTeamRecentPreviousRounds(data, team, round):
    rounds = findRecentPreviousRounds(round, RECENT_PREFORMANCE_MATCH_COUNT)         # can change for optimization
    if rounds is None:
        return None

    previous_matches_pd =  data[((data["Home_Team"]==team) | (data["Away_Team"]==team)) & (data["Round"].isin(rounds))]
    recent_perf = 0
    for index, row in previous_matches_pd.iterrows():
        hteam = row['Home_Team']
        ateam = row['Away_Team']
        if hteam==team:
            recent_perf = recent_perf + (row['Home_Score']-row['Away_Score'])
        else:
            recent_perf = recent_perf + (row['Away_Score']-row['Home_Score'])

    return recent_perf


def fillWithRecentPerformance(record, data):
    # get home team and away team and round
    round = record['Round']
    hteam = record['Home_Team']
    ateam = record['Away_Team']
    
    home_team_goal_diff = findLeagueSeasonTeamRecentPreviousRounds(data, hteam, round)
    away_team_goal_diff = findLeagueSeasonTeamRecentPreviousRounds(data, ateam, round)

    return [home_team_goal_diff, away_team_goal_diff]


In [85]:
# HOME_LASTEST_GOAL_DIFF	AWAY_LASTEST_GOAL_DIFF
recent_perform = team_predict_pd.apply(fillWithRecentPerformance, data=current_season_result, axis=1)
recent_perform

0     [3, -6]
1    [-2, -1]
2      [1, 2]
3      [4, 1]
4     [-2, 0]
5      [3, 3]
6    [-2, -1]
7      [2, 1]
8     [-5, 2]
9     [-1, 1]
dtype: object

In [86]:
perf_list = np.array(recent_perform.values.tolist())
home_away_perf_pd = pd.DataFrame(perf_list, columns=["HOME_LASTEST_GOAL_DIFF", "AWAY_LASTEST_GOAL_DIFF"])
team_predict_pd.insert(loc=11, column="HOME_LASTEST_GOAL_DIFF", value=home_away_perf_pd["HOME_LASTEST_GOAL_DIFF"].astype('Int64')) 
team_predict_pd.insert(loc=12, column="AWAY_LASTEST_GOAL_DIFF", value=home_away_perf_pd["AWAY_LASTEST_GOAL_DIFF"].astype('Int64')) 
team_predict_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF
0,serie_b,2022,33,Cremonese,US Alessandria,64,51,27,15,14,22,3,-6
1,serie_b,2022,33,Pordenone,Frosinone,44,58,13,27,27,12,-2,-1
2,serie_b,2022,33,Reggina,Benevento,44,60,17,17,35,16,1,2
3,serie_b,2022,33,Ternana Calcio,Lecce,53,65,24,19,33,14,4,1
4,serie_b,2022,33,SPAL,Cosenza,57,45,24,25,0,0,-2,0
5,serie_b,2022,33,Parma,Como,56,44,19,14,18,15,3,3
6,serie_b,2022,33,Cittadella,Perugia,57,62,20,16,15,14,-2,-1
7,serie_b,2022,33,AC Monza,Ascoli,64,52,31,13,18,17,2,1
8,serie_b,2022,33,Pisa SC,Brescia,57,61,22,11,25,19,-5,2
9,serie_b,2022,33,Vicenza,Crotone,46,56,18,22,19,21,-1,1


In [87]:
team_predict_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   League                     10 non-null     object
 1   Season                     10 non-null     int64 
 2   Round                      10 non-null     int64 
 3   Home_Team                  10 non-null     object
 4   Away_Team                  10 non-null     object
 5   Elo_home                   10 non-null     int64 
 6   Elo_away                   10 non-null     int64 
 7   HOMETEAM_HOME_GOAL_SO_FAR  10 non-null     Int64 
 8   HOMETEAM_AWAY_GOAL_SO_FAR  10 non-null     Int64 
 9   AWAYTEAM_HOME_GOAL_SO_FAR  10 non-null     Int64 
 10  AWAYTEAM_AWAY_GOAL_SO_FAR  10 non-null     Int64 
 11  HOME_LASTEST_GOAL_DIFF     10 non-null     Int64 
 12  AWAY_LASTEST_GOAL_DIFF     10 non-null     Int64 
dtypes: Int64(6), int64(4), object(3)
memory usage: 1.2+ KB


In [88]:
def get_ELO_diff(record):
    hscore = record['Elo_home']
    ascore = record['Elo_away']
    return (hscore - ascore)

In [89]:
def get_recent_goal_diff_diff(record):
    hscore = record['HOME_LASTEST_GOAL_DIFF']
    ascore = record['AWAY_LASTEST_GOAL_DIFF']
    return hscore - ascore

In [90]:
def get_home_away_total_goal_diff(record):
    hgoal = record['HOMETEAM_HOME_GOAL_SO_FAR']
    agoal = record['AWAYTEAM_AWAY_GOAL_SO_FAR']
    return hgoal - agoal

In [91]:
elo_diff_pd = team_predict_pd.apply(get_ELO_diff, axis=1)
team_predict_pd.drop('Elo_home', inplace=True, axis=1)
team_predict_pd.drop('Elo_away', inplace=True, axis=1)
team_predict_pd.insert(loc=5, column="ELO_DIFF", value=elo_diff_pd.astype('Int64')) 
        
recent_perf_diff_pd = team_predict_pd.apply(get_recent_goal_diff_diff, axis=1)
team_predict_pd.drop('HOME_LASTEST_GOAL_DIFF', inplace=True, axis=1)
team_predict_pd.drop('AWAY_LASTEST_GOAL_DIFF', inplace=True, axis=1)
team_predict_pd.insert(loc=6, column="RECENT_PERF_DIFF", value=recent_perf_diff_pd.astype('Int64')) 

goal_diff_pd = team_predict_pd.apply(get_home_away_total_goal_diff, axis=1)
team_predict_pd.drop('HOMETEAM_HOME_GOAL_SO_FAR', inplace=True, axis=1)
team_predict_pd.drop('HOMETEAM_AWAY_GOAL_SO_FAR', inplace=True, axis=1)
team_predict_pd.drop('AWAYTEAM_HOME_GOAL_SO_FAR', inplace=True, axis=1)
team_predict_pd.drop('AWAYTEAM_AWAY_GOAL_SO_FAR', inplace=True, axis=1)
team_predict_pd.insert(loc=7, column="HOME_AWAY_GOAL_DIFF", value=recent_perf_diff_pd.astype('Int64')) 

# delete no value column
team_predict_pd.drop('League', inplace=True, axis=1)
team_predict_pd.drop('Season', inplace=True, axis=1)
team_predict_pd.drop('Round', inplace=True, axis=1)
team_predict_pd.drop('Home_Team', inplace=True, axis=1)
team_predict_pd.drop('Away_Team', inplace=True, axis=1)

team_predict_pd

Unnamed: 0,ELO_DIFF,RECENT_PERF_DIFF,HOME_AWAY_GOAL_DIFF
0,13,9,9
1,-14,-1,-1
2,-16,-1,-1
3,-12,3,3
4,12,-2,-2
5,12,0,0
6,-5,-1,-1
7,12,1,1
8,-4,-7,-7
9,-10,-2,-2


In [107]:
# export to csv
team_predict_pd.to_csv('to_predict.csv', index=False)

array([0, 0, 0, 0, 1, 1, 0, 1, 1, 0])