This task will scrape data of matches that haven't taken place for making predictions. Maybe when you are going through this task, there are no matches available, so instead, use the datasets included in the following files:

Results 
To Predict  Results.zip contains the results of matches that have already taken place right before the matches that haven't taken place yet with the same format as you have seen in the previous datasets.

To_Predict.zip contains the results of matches that haven't taken place yet. You can see that these csv files do not contain the result, since it's the column you will have to predict.

Save the whole data of Results.zip into a single csv called results_for_prediction.csv.

In [37]:
import re
import pandas as pd
import os
import numpy as np
from csv import reader
import plotly.express as px
import missingno as msno
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

pd.options.mode.chained_assignment = None

In [38]:
# define recent matches meaning
RECENT_PREFORMANCE_MATCH_COUNT = 3

In [27]:
"""
def getLeagueSeasonTeamBeforeRoundTotalGoal(data, league, season, team, round):
    # determine home or away and get the score 
    # get home game of the team
    home_pd = data[(data["League"]==league) & (data["Home_Team"]==team) & (data["Season"]==season) & (data["Round"]<round)]
    home_total_score = home_pd['Home_Score'].astype('Int64').sum()

    # get away game of the team
    away_pd = data[(data["League"]==league) & (data["Away_Team"]==team) & (data["Season"]==season) & (data["Round"]<round)]
    away_total_score = home_pd['Away_Score'].astype('Int64').sum()

    # calculate total goals
    return home_total_score, away_total_score


def fillWithTotalGoalSoFar(record, data):
    # get home team and away team and round
    league = record['League']
    season = record['Season']
    round = record['Round']
    hteam = record['Home_Team']
    ateam = record['Away_Team']
    
    hometeam_home_goal_so_far, hometeam_away_goal_so_far = getLeagueSeasonTeamBeforeRoundTotalGoal(data, league, season, hteam, round)
    awayteam_home_goal_so_far, awayteam_away_goal_so_far = getLeagueSeasonTeamBeforeRoundTotalGoal(data, league, season, ateam, round)

    return [hometeam_home_goal_so_far, hometeam_away_goal_so_far, awayteam_home_goal_so_far, awayteam_away_goal_so_far]

"""

In [28]:
"""
def findRecentPreviousRounds(currentRound, limit):
    if currentRound<=limit:
        return None
    else:
        r = []
        for l in range(limit):
            r.append(currentRound - (limit-l))
        return r


def findLeagueSeasonTeamRecentPreviousRounds(data, league, season, team, round):
    rounds = findRecentPreviousRounds(round, RECENT_PREFORMANCE_MATCH_COUNT)         # can change for optimization
    if rounds is None:
        return None

    previous_matches_pd =  data[(data["League"]==league) & ((data["Home_Team"]==team) | (data["Away_Team"]==team)) & (data["Season"]==season) & (data["Round"].isin(rounds))]
    recent_perf = 0
    for index, row in previous_matches_pd.iterrows():
        hteam = row['Home_Team']
        ateam = row['Away_Team']
        if hteam==team:
            recent_perf = recent_perf + (row['Home_Score']-row['Away_Score'])
        else:
            recent_perf = recent_perf + (row['Away_Score']-row['Home_Score'])

    return recent_perf


def fillWithRecentPerformance(record, data):
    # get home team and away team and round
    league = record['League']
    season = record['Season']
    round = record['Round']
    hteam = record['Home_Team']
    ateam = record['Away_Team']
    
    home_team_goal_diff = findLeagueSeasonTeamRecentPreviousRounds(data, league, season, hteam, round)
    away_team_goal_diff = findLeagueSeasonTeamRecentPreviousRounds(data, league, season, ateam, round)

    return [home_team_goal_diff, away_team_goal_diff]

"""

In [39]:
# load all directory as league name list
dir = "./Predict/Results/"
leagues = [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))]

# loop to open csv
result_with_goal_sofar_pd = pd.DataFrame()
for league in leagues:
    league_folder = os.path.join(dir, league)

    csv_file_for_league = [os.path.join(league_folder, name) for name in os.listdir(league_folder) if name.endswith('.csv')]
    pkl_file_for_league = [os.path.join(league_folder, name) for name in os.listdir(league_folder) if name.endswith('.pkl')]

    if len(csv_file_for_league)==1 and len(pkl_file_for_league)==1:
        csv_filename = csv_file_for_league[0]
        pkl_filename = pkl_file_for_league[0]

        current_league_season_pd = pd.read_csv(csv_filename, skiprows=[0], names=["Home_Team", "Away_Team", "Result", "Link", "Season", "Round", "League"])

        # Divide result into home_score and away_score
        df_score =  current_league_season_pd['Result'].str.extract(r'(\d)-(\d)')
        current_league_season_pd.insert(loc=3, column="Home_Score", value=df_score[0].astype('Int64'))     # use Int64 as it support NaN
        current_league_season_pd.insert(loc=4, column="Away_Score", value=df_score[1].astype('Int64')) 

        if len(current_league_season_pd)>0:

            # get home team and away team total goal so far
            #home_away_total_goal_sofar = current_league_season_pd.apply(fillWithTotalGoalSoFar, data=current_league_season_pd, axis=1)
            #goal_so_far_list = np.array(home_away_total_goal_sofar.values.tolist())         # convert to list
            #home_away_total_goal_sofar_pd = pd.DataFrame(goal_so_far_list, columns=["HOMETEAM_HOME_GOAL_SO_FAR", "HOMETEAM_AWAY_GOAL_SO_FAR", "AWAYTEAM_HOME_GOAL_SO_FAR", "AWAYTEAM_AWAY_GOAL_SO_FAR"])    # convert to dataframe
            #current_league_season_pd.insert(loc=5, column="HOMETEAM_HOME_GOAL_SO_FAR", value=home_away_total_goal_sofar_pd["HOMETEAM_HOME_GOAL_SO_FAR"].astype('Int64')) 
            #current_league_season_pd.insert(loc=6, column="HOMETEAM_AWAY_GOAL_SO_FAR", value=home_away_total_goal_sofar_pd["HOMETEAM_AWAY_GOAL_SO_FAR"].astype('Int64')) 
            #current_league_season_pd.insert(loc=7, column="AWAYTEAM_HOME_GOAL_SO_FAR", value=home_away_total_goal_sofar_pd["AWAYTEAM_HOME_GOAL_SO_FAR"].astype('Int64'))     
            #current_league_season_pd.insert(loc=8, column="AWAYTEAM_AWAY_GOAL_SO_FAR", value=home_away_total_goal_sofar_pd["AWAYTEAM_AWAY_GOAL_SO_FAR"].astype('Int64'))     

            # get recent performance
            #home_away_recent_perf = current_league_season_pd.apply(fillWithRecentPerformance, data=current_league_season_pd, axis=1)
            #perf_list = np.array(home_away_recent_perf.values.tolist())
            #home_away_perf_pd = pd.DataFrame(perf_list, columns=["HOME_LASTEST_GOAL_DIFF", "AWAY_LASTEST_GOAL_DIFF"])
            #current_league_season_pd.insert(loc=9, column="HOME_LASTEST_GOAL_DIFF", value=home_away_perf_pd["HOME_LASTEST_GOAL_DIFF"].astype('Int64')) 
            #current_league_season_pd.insert(loc=10, column="AWAY_LASTEST_GOAL_DIFF", value=home_away_perf_pd["AWAY_LASTEST_GOAL_DIFF"].astype('Int64')) 

            # load pickle and read content
            d = pickle.load(open(pkl_filename, 'rb'))
            elo_key_df = pd.DataFrame(d.keys(), columns=["link"])
            elo_val_df = pd.DataFrame.from_dict(d.values())
            elo_df = elo_key_df.join(elo_val_df)

            current_league_season_pd = current_league_season_pd.merge(elo_df, left_on='Link', right_on='link')

            result_with_goal_sofar_pd = pd.concat([result_with_goal_sofar_pd, current_league_season_pd])

result_with_goal_sofar_pd

Unnamed: 0,Home_Team,Away_Team,Result,Home_Score,Away_Score,Link,Season,Round,League,link,Elo_home,Elo_away
0,AFC Bournemouth,West Bromwich Albion,2-2,2,2,https://www.besoccer.com/match/afc-bournemouth...,2022,1,championship,https://www.besoccer.com/match/afc-bournemouth...,69,61
1,Blackburn Rovers,Swansea City,2-1,2,1,https://www.besoccer.com/match/blackburn-rover...,2022,1,championship,https://www.besoccer.com/match/blackburn-rover...,58,67
2,Bristol City,Blackpool,1-1,1,1,https://www.besoccer.com/match/bristol-city-fc...,2022,1,championship,https://www.besoccer.com/match/bristol-city-fc...,56,45
3,Cardiff City,Barnsley,1-1,1,1,https://www.besoccer.com/match/cardiff-city-fc...,2022,1,championship,https://www.besoccer.com/match/cardiff-city-fc...,60,52
4,Derby County,Huddersfield Town,1-1,1,1,https://www.besoccer.com/match/derby-county-fc...,2022,1,championship,https://www.besoccer.com/match/derby-county-fc...,57,55
...,...,...,...,...,...,...,...,...,...,...,...,...
315,Nuova Cosenza,Parma,1-3,1,3,https://www.besoccer.com/match/nuova-cosenza/p...,2022,32,serie_b,https://www.besoccer.com/match/nuova-cosenza/p...,44,39
316,Benevento,Pisa SC,5-1,5,1,https://www.besoccer.com/match/benevento-calci...,2022,32,serie_b,https://www.besoccer.com/match/benevento-calci...,52,50
317,Brescia,Vicenza,2-0,2,0,https://www.besoccer.com/match/brescia/vicenza...,2022,32,serie_b,https://www.besoccer.com/match/brescia/vicenza...,58,56
318,Como,AC Monza,2-0,2,0,https://www.besoccer.com/match/como/ac-monza-b...,2022,32,serie_b,https://www.besoccer.com/match/como/ac-monza-b...,42,32


In [40]:
full_pd = result_with_goal_sofar_pd.dropna()
full_pd

Unnamed: 0,Home_Team,Away_Team,Result,Home_Score,Away_Score,Link,Season,Round,League,link,Elo_home,Elo_away
0,AFC Bournemouth,West Bromwich Albion,2-2,2,2,https://www.besoccer.com/match/afc-bournemouth...,2022,1,championship,https://www.besoccer.com/match/afc-bournemouth...,69,61
1,Blackburn Rovers,Swansea City,2-1,2,1,https://www.besoccer.com/match/blackburn-rover...,2022,1,championship,https://www.besoccer.com/match/blackburn-rover...,58,67
2,Bristol City,Blackpool,1-1,1,1,https://www.besoccer.com/match/bristol-city-fc...,2022,1,championship,https://www.besoccer.com/match/bristol-city-fc...,56,45
3,Cardiff City,Barnsley,1-1,1,1,https://www.besoccer.com/match/cardiff-city-fc...,2022,1,championship,https://www.besoccer.com/match/cardiff-city-fc...,60,52
4,Derby County,Huddersfield Town,1-1,1,1,https://www.besoccer.com/match/derby-county-fc...,2022,1,championship,https://www.besoccer.com/match/derby-county-fc...,57,55
...,...,...,...,...,...,...,...,...,...,...,...,...
315,Nuova Cosenza,Parma,1-3,1,3,https://www.besoccer.com/match/nuova-cosenza/p...,2022,32,serie_b,https://www.besoccer.com/match/nuova-cosenza/p...,44,39
316,Benevento,Pisa SC,5-1,5,1,https://www.besoccer.com/match/benevento-calci...,2022,32,serie_b,https://www.besoccer.com/match/benevento-calci...,52,50
317,Brescia,Vicenza,2-0,2,0,https://www.besoccer.com/match/brescia/vicenza...,2022,32,serie_b,https://www.besoccer.com/match/brescia/vicenza...,58,56
318,Como,AC Monza,2-0,2,0,https://www.besoccer.com/match/como/ac-monza-b...,2022,32,serie_b,https://www.besoccer.com/match/como/ac-monza-b...,42,32


In [41]:
full_pd.count()

Home_Team     4243
Away_Team     4243
Result        4243
Home_Score    4243
Away_Score    4243
Link          4243
Season        4243
Round         4243
League        4243
link          4243
Elo_home      4243
Elo_away      4243
dtype: int64

In [42]:
# delete no value column
full_pd.drop('Result', inplace=True, axis=1)
full_pd.drop('Link', inplace=True, axis=1)
full_pd.drop('link', inplace=True, axis=1)
full_pd

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Season,Round,League,Elo_home,Elo_away
0,AFC Bournemouth,West Bromwich Albion,2,2,2022,1,championship,69,61
1,Blackburn Rovers,Swansea City,2,1,2022,1,championship,58,67
2,Bristol City,Blackpool,1,1,2022,1,championship,56,45
3,Cardiff City,Barnsley,1,1,2022,1,championship,60,52
4,Derby County,Huddersfield Town,1,1,2022,1,championship,57,55
...,...,...,...,...,...,...,...,...,...
315,Nuova Cosenza,Parma,1,3,2022,32,serie_b,44,39
316,Benevento,Pisa SC,5,1,2022,32,serie_b,52,50
317,Brescia,Vicenza,2,0,2022,32,serie_b,58,56
318,Como,AC Monza,2,0,2022,32,serie_b,42,32


In [43]:
# find who win H:Home A:Away D:Draw
def get_result(record):
    hscore = record['Home_Score']
    ascore = record['Away_Score']
    if hscore is pd.NA or ascore is pd.NA:
        return pd.NA
    if hscore>ascore:
        return 1
    else:
        return 0

result_pd = full_pd.apply(get_result, axis=1)

#full_pd.drop('Home_Score', inplace=True, axis=1)
#full_pd.drop('Away_Score', inplace=True, axis=1)

full_pd.insert(loc=len(full_pd.columns), column="Result", value=result_pd.astype('Int64')) 
full_pd

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Season,Round,League,Elo_home,Elo_away,Result
0,AFC Bournemouth,West Bromwich Albion,2,2,2022,1,championship,69,61,0
1,Blackburn Rovers,Swansea City,2,1,2022,1,championship,58,67,1
2,Bristol City,Blackpool,1,1,2022,1,championship,56,45,0
3,Cardiff City,Barnsley,1,1,2022,1,championship,60,52,0
4,Derby County,Huddersfield Town,1,1,2022,1,championship,57,55,0
...,...,...,...,...,...,...,...,...,...,...
315,Nuova Cosenza,Parma,1,3,2022,32,serie_b,44,39,0
316,Benevento,Pisa SC,5,1,2022,32,serie_b,52,50,1
317,Brescia,Vicenza,2,0,2022,32,serie_b,58,56,1
318,Como,AC Monza,2,0,2022,32,serie_b,42,32,1


In [44]:
# reorder dataframe column
full_pd.insert(0, 'League', full_pd.pop('League'))
full_pd.insert(1, 'Season', full_pd.pop('Season'))
full_pd.insert(2, 'Round', full_pd.pop('Round'))
full_pd.insert(3, 'Home_Team', full_pd.pop('Home_Team'))
full_pd.insert(4, 'Away_Team', full_pd.pop('Away_Team'))
full_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Home_Score,Away_Score,Elo_home,Elo_away,Result
0,championship,2022,1,AFC Bournemouth,West Bromwich Albion,2,2,69,61,0
1,championship,2022,1,Blackburn Rovers,Swansea City,2,1,58,67,1
2,championship,2022,1,Bristol City,Blackpool,1,1,56,45,0
3,championship,2022,1,Cardiff City,Barnsley,1,1,60,52,0
4,championship,2022,1,Derby County,Huddersfield Town,1,1,57,55,0
...,...,...,...,...,...,...,...,...,...,...
315,serie_b,2022,32,Nuova Cosenza,Parma,1,3,44,39,0
316,serie_b,2022,32,Benevento,Pisa SC,5,1,52,50,1
317,serie_b,2022,32,Brescia,Vicenza,2,0,58,56,1
318,serie_b,2022,32,Como,AC Monza,2,0,42,32,1


In [45]:
full_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4243 entries, 0 to 319
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   League      4243 non-null   object
 1   Season      4243 non-null   int64 
 2   Round       4243 non-null   int64 
 3   Home_Team   4243 non-null   object
 4   Away_Team   4243 non-null   object
 5   Home_Score  4243 non-null   Int64 
 6   Away_Score  4243 non-null   Int64 
 7   Elo_home    4243 non-null   object
 8   Elo_away    4243 non-null   object
 9   Result      4243 non-null   Int64 
dtypes: Int64(3), int64(2), object(5)
memory usage: 377.1+ KB


In [46]:
# export to csv
full_pd.to_csv('results_for_prediction.csv', index=False)