The data you scraped needs to be cleaned. The pipeline you created should be used to clean the data. The cleaned data should be saved as cleaned_results.csv After going through the pipeline, you will be able to get the same features you used for training the model and put the in the files of To_Predict.zip. Save the csv files with the new features as to_predict.csv.

In [259]:
import re
import pandas as pd
import os
import numpy as np
from csv import reader
import plotly.express as px
import missingno as msno
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

pd.options.mode.chained_assignment = None

In [260]:
# define recent matches meaning
RECENT_PREFORMANCE_MATCH_COUNT = 3

In [261]:
# functions to filter different league
def getLeagueData(data, league, season=None):
    if season is None:
        league_pd =  data[(data["League"]==league)]
    else:
        league_pd =  data[(data["League"]==league) & (data["Season"]==season)]
    return league_pd

In [262]:
# Read in results_for_prediction.csv
prediction_result_pd = pd.read_csv("results_for_prediction.csv")
prediction_result_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Home_Score,Away_Score,Elo_home,Elo_away,Result
0,championship,2022,1,AFC Bournemouth,West Bromwich Albion,2,2,69,61,0
1,championship,2022,1,Blackburn Rovers,Swansea City,2,1,58,67,1
2,championship,2022,1,Bristol City,Blackpool,1,1,56,45,0
3,championship,2022,1,Cardiff City,Barnsley,1,1,60,52,0
4,championship,2022,1,Derby County,Huddersfield Town,1,1,57,55,0
...,...,...,...,...,...,...,...,...,...,...
4238,serie_b,2022,32,Nuova Cosenza,Parma,1,3,44,39,0
4239,serie_b,2022,32,Benevento,Pisa SC,5,1,52,50,1
4240,serie_b,2022,32,Brescia,Vicenza,2,0,58,56,1
4241,serie_b,2022,32,Como,AC Monza,2,0,42,32,1


In [263]:
def getLeagueSeasonTeamBeforeRoundTotalGoal(data, team, round):
    # determine home or away and get the score 
    # get home game of the team
    home_pd = data[(data["Home_Team"]==team) & (data["Round"]<round)]
    home_total_score = home_pd['Home_Score'].astype('Int64').sum()

    # get away game of the team
    away_pd = data[(data["Away_Team"]==team) & (data["Round"]<round)]
    away_total_score = home_pd['Away_Score'].astype('Int64').sum()

    # calculate total goals
    return home_total_score, away_total_score


def fillWithTotalGoalSoFar(record, data):
    # get home team and away team and round
    round = record['Round']
    hteam = record['Home_Team']
    ateam = record['Away_Team']
    
    hometeam_home_goal_so_far, hometeam_away_goal_so_far = getLeagueSeasonTeamBeforeRoundTotalGoal(data, hteam, round)
    awayteam_home_goal_so_far, awayteam_away_goal_so_far = getLeagueSeasonTeamBeforeRoundTotalGoal(data, ateam, round)

    return [hometeam_home_goal_so_far, hometeam_away_goal_so_far, awayteam_home_goal_so_far, awayteam_away_goal_so_far]

In [264]:
def findRecentPreviousRounds(currentRound, limit):
    if currentRound<=limit:
        return None
    else:
        r = []
        for l in range(limit):
            r.append(currentRound - (limit-l))
        return r


def findLeagueSeasonTeamRecentPreviousRounds(data, team, round):
    rounds = findRecentPreviousRounds(round, RECENT_PREFORMANCE_MATCH_COUNT)         # can change for optimization
    if rounds is None:
        return None

    previous_matches_pd =  data[((data["Home_Team"]==team) | (data["Away_Team"]==team)) & (data["Round"].isin(rounds))]
    recent_perf = 0
    for index, row in previous_matches_pd.iterrows():
        hteam = row['Home_Team']
        ateam = row['Away_Team']
        if hteam==team:
            recent_perf = recent_perf + (row['Home_Score']-row['Away_Score'])
        else:
            recent_perf = recent_perf + (row['Away_Score']-row['Home_Score'])

    return recent_perf


def fillWithRecentPerformance(record, data):
    # get home team and away team and round
    round = record['Round']
    hteam = record['Home_Team']
    ateam = record['Away_Team']
    
    home_team_goal_diff = findLeagueSeasonTeamRecentPreviousRounds(data, hteam, round)
    away_team_goal_diff = findLeagueSeasonTeamRecentPreviousRounds(data, ateam, round)

    return [home_team_goal_diff, away_team_goal_diff]

In [274]:
# load all directory as league name list
dir = "./Predict/To_Predict/"
leagues = [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))]

# loop to open csv
predict_pd = pd.DataFrame()
for league in leagues:
    league_folder = os.path.join(dir, league)

    csv_file_for_league = [os.path.join(league_folder, name) for name in os.listdir(league_folder) if name.endswith('.csv')]
    pkl_file_for_league = [os.path.join(league_folder, name) for name in os.listdir(league_folder) if name.endswith('.pkl')]


    if len(csv_file_for_league)==1 and len(pkl_file_for_league)==1:
        csv_filename = csv_file_for_league[0]
        pkl_filename = pkl_file_for_league[0]

        # ,Home_Team,Away_Team,Link,Season,Round,League
        current_league_predict_pd = pd.read_csv(csv_filename, skiprows=[0], names=["Home_Team", "Away_Team", "Link", "Season", "Round", "League"])

        if len(current_league_predict_pd)>0:
            # load pickle and read content
            d = pickle.load(open(pkl_filename, 'rb'))
            elo_key_df = pd.DataFrame(d.keys(), columns=["link"])
            elo_val_df = pd.DataFrame.from_dict(d.values())
            elo_df = elo_key_df.join(elo_val_df)

            current_league_predict_pd = current_league_predict_pd.merge(elo_df, left_on='Link', right_on='link')

            # get this season data
            current_season_result = getLeagueData(prediction_result_pd, league)

            goal_so_far = current_league_predict_pd.apply(fillWithTotalGoalSoFar, data=current_season_result, axis=1)
            goal_so_far_list = np.array(goal_so_far.values.tolist()) 
            goal_so_far_pd = pd.DataFrame(goal_so_far_list, columns=["HOMETEAM_HOME_GOAL_SO_FAR", "HOMETEAM_AWAY_GOAL_SO_FAR", "AWAYTEAM_HOME_GOAL_SO_FAR", "AWAYTEAM_AWAY_GOAL_SO_FAR"])    # convert to dataframe
            current_league_predict_pd.insert(loc=7, column="HOMETEAM_HOME_GOAL_SO_FAR", value=goal_so_far_pd["HOMETEAM_HOME_GOAL_SO_FAR"].astype('Int64')) 
            current_league_predict_pd.insert(loc=8, column="HOMETEAM_AWAY_GOAL_SO_FAR", value=goal_so_far_pd["HOMETEAM_AWAY_GOAL_SO_FAR"].astype('Int64')) 
            current_league_predict_pd.insert(loc=9, column="AWAYTEAM_HOME_GOAL_SO_FAR", value=goal_so_far_pd["AWAYTEAM_HOME_GOAL_SO_FAR"].astype('Int64'))     
            current_league_predict_pd.insert(loc=10, column="AWAYTEAM_AWAY_GOAL_SO_FAR", value=goal_so_far_pd["AWAYTEAM_AWAY_GOAL_SO_FAR"].astype('Int64'))            

            recent_perform = current_league_predict_pd.apply(fillWithRecentPerformance, data=current_season_result, axis=1)
            perf_list = np.array(recent_perform.values.tolist())
            home_away_perf_pd = pd.DataFrame(perf_list, columns=["HOME_LASTEST_GOAL_DIFF", "AWAY_LASTEST_GOAL_DIFF"])
            current_league_predict_pd.insert(loc=11, column="HOME_LASTEST_GOAL_DIFF", value=home_away_perf_pd["HOME_LASTEST_GOAL_DIFF"].astype('Int64')) 
            current_league_predict_pd.insert(loc=12, column="AWAY_LASTEST_GOAL_DIFF", value=home_away_perf_pd["AWAY_LASTEST_GOAL_DIFF"].astype('Int64'))   

            predict_pd = pd.concat([predict_pd, current_league_predict_pd])

predict_pd

Unnamed: 0,Home_Team,Away_Team,Link,Season,Round,League,link,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Elo_home,Elo_away
0,Hull City,Huddersfield Town,https://www.besoccer.com/match/hull-city/hudde...,2022,40,championship,https://www.besoccer.com/match/hull-city/hudde...,16,25,27,20,0,-5,70,55
1,Blackpool,Nottingham Forest,https://www.besoccer.com/match/blackpool-fc/no...,2022,40,championship,https://www.besoccer.com/match/blackpool-fc/no...,22,19,32,21,0,6,39,57
2,AFC Bournemouth,Bristol City,https://www.besoccer.com/match/afc-bournemouth...,2022,40,championship,https://www.besoccer.com/match/afc-bournemouth...,35,18,26,27,5,-1,67,53
3,Barnsley,Reading,https://www.besoccer.com/match/barnsley-fc/rea...,2022,40,championship,https://www.besoccer.com/match/barnsley-fc/rea...,16,21,28,37,0,-3,50,60
4,Cardiff City,Swansea City,https://www.besoccer.com/match/cardiff-city-fc...,2022,40,championship,https://www.besoccer.com/match/cardiff-city-fc...,21,23,23,20,1,0,57,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,Parma,Como,https://www.besoccer.com/match/parma-fc/como/2...,2022,33,serie_b,https://www.besoccer.com/match/parma-fc/como/2...,19,14,18,15,3,3,56,44
6,Cittadella,Perugia,https://www.besoccer.com/match/as-cittadella/p...,2022,33,serie_b,https://www.besoccer.com/match/as-cittadella/p...,20,16,15,14,-2,-1,57,62
7,AC Monza,Ascoli,https://www.besoccer.com/match/ac-monza-brianz...,2022,33,serie_b,https://www.besoccer.com/match/ac-monza-brianz...,31,13,18,17,2,1,64,52
8,Pisa SC,Brescia,https://www.besoccer.com/match/pisa-calcio/bre...,2022,33,serie_b,https://www.besoccer.com/match/pisa-calcio/bre...,22,11,25,19,-5,2,57,61


In [275]:
#predict_pd.drop('Link', inplace=True, axis=1)
predict_pd.drop('link', inplace=True, axis=1)
predict_pd

Unnamed: 0,Home_Team,Away_Team,Link,Season,Round,League,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Elo_home,Elo_away
0,Hull City,Huddersfield Town,https://www.besoccer.com/match/hull-city/hudde...,2022,40,championship,16,25,27,20,0,-5,70,55
1,Blackpool,Nottingham Forest,https://www.besoccer.com/match/blackpool-fc/no...,2022,40,championship,22,19,32,21,0,6,39,57
2,AFC Bournemouth,Bristol City,https://www.besoccer.com/match/afc-bournemouth...,2022,40,championship,35,18,26,27,5,-1,67,53
3,Barnsley,Reading,https://www.besoccer.com/match/barnsley-fc/rea...,2022,40,championship,16,21,28,37,0,-3,50,60
4,Cardiff City,Swansea City,https://www.besoccer.com/match/cardiff-city-fc...,2022,40,championship,21,23,23,20,1,0,57,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,Parma,Como,https://www.besoccer.com/match/parma-fc/como/2...,2022,33,serie_b,19,14,18,15,3,3,56,44
6,Cittadella,Perugia,https://www.besoccer.com/match/as-cittadella/p...,2022,33,serie_b,20,16,15,14,-2,-1,57,62
7,AC Monza,Ascoli,https://www.besoccer.com/match/ac-monza-brianz...,2022,33,serie_b,31,13,18,17,2,1,64,52
8,Pisa SC,Brescia,https://www.besoccer.com/match/pisa-calcio/bre...,2022,33,serie_b,22,11,25,19,-5,2,57,61


In [267]:
# Only run for testing, filter only segunda_liga
predict_pd = getLeagueData(predict_pd, "segunda_liga", season=None)

In [276]:
# reorder dataframe column
predict_pd.insert(0, 'League', predict_pd.pop('League'))
predict_pd.insert(1, 'Season', predict_pd.pop('Season'))
predict_pd.insert(2, 'Round', predict_pd.pop('Round'))
predict_pd.insert(3, 'Home_Team', predict_pd.pop('Home_Team'))
predict_pd.insert(4, 'Away_Team', predict_pd.pop('Away_Team'))
predict_pd.insert(5, 'Elo_home', predict_pd.pop('Elo_home').astype('int'))
predict_pd.insert(6, 'Elo_away', predict_pd.pop('Elo_away').astype('int'))
predict_pd.insert(13, 'Link', predict_pd.pop('Link'))

predict_pd

Unnamed: 0,League,Season,Round,Home_Team,Away_Team,Elo_home,Elo_away,HOMETEAM_HOME_GOAL_SO_FAR,HOMETEAM_AWAY_GOAL_SO_FAR,AWAYTEAM_HOME_GOAL_SO_FAR,AWAYTEAM_AWAY_GOAL_SO_FAR,HOME_LASTEST_GOAL_DIFF,AWAY_LASTEST_GOAL_DIFF,Link
0,championship,2022,40,Hull City,Huddersfield Town,70,55,16,25,27,20,0,-5,https://www.besoccer.com/match/hull-city/hudde...
1,championship,2022,40,Blackpool,Nottingham Forest,39,57,22,19,32,21,0,6,https://www.besoccer.com/match/blackpool-fc/no...
2,championship,2022,40,AFC Bournemouth,Bristol City,67,53,35,18,26,27,5,-1,https://www.besoccer.com/match/afc-bournemouth...
3,championship,2022,40,Barnsley,Reading,50,60,16,21,28,37,0,-3,https://www.besoccer.com/match/barnsley-fc/rea...
4,championship,2022,40,Cardiff City,Swansea City,57,75,21,23,23,20,1,0,https://www.besoccer.com/match/cardiff-city-fc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,serie_b,2022,33,Parma,Como,56,44,19,14,18,15,3,3,https://www.besoccer.com/match/parma-fc/como/2...
6,serie_b,2022,33,Cittadella,Perugia,57,62,20,16,15,14,-2,-1,https://www.besoccer.com/match/as-cittadella/p...
7,serie_b,2022,33,AC Monza,Ascoli,64,52,31,13,18,17,2,1,https://www.besoccer.com/match/ac-monza-brianz...
8,serie_b,2022,33,Pisa SC,Brescia,57,61,22,11,25,19,-5,2,https://www.besoccer.com/match/pisa-calcio/bre...


In [277]:
predict_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138 entries, 0 to 9
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   League                     138 non-null    object
 1   Season                     138 non-null    int64 
 2   Round                      138 non-null    int64 
 3   Home_Team                  138 non-null    object
 4   Away_Team                  138 non-null    object
 5   Elo_home                   138 non-null    int64 
 6   Elo_away                   138 non-null    int64 
 7   HOMETEAM_HOME_GOAL_SO_FAR  138 non-null    Int64 
 8   HOMETEAM_AWAY_GOAL_SO_FAR  138 non-null    Int64 
 9   AWAYTEAM_HOME_GOAL_SO_FAR  138 non-null    Int64 
 10  AWAYTEAM_AWAY_GOAL_SO_FAR  138 non-null    Int64 
 11  HOME_LASTEST_GOAL_DIFF     138 non-null    Int64 
 12  AWAY_LASTEST_GOAL_DIFF     138 non-null    Int64 
 13  Link                       138 non-null    object
dtypes: Int64(6),

In [270]:
def get_ELO_diff(record):
    hscore = record['Elo_home']
    ascore = record['Elo_away']
    return (hscore - ascore)

In [278]:
def get_recent_goal_diff_diff(record):
    hscore = record['HOME_LASTEST_GOAL_DIFF']
    ascore = record['AWAY_LASTEST_GOAL_DIFF']
    return hscore - ascore

In [279]:
def get_home_away_total_goal_diff(record):
    hgoal = record['HOMETEAM_HOME_GOAL_SO_FAR']
    agoal = record['AWAYTEAM_AWAY_GOAL_SO_FAR']
    return hgoal - agoal

In [280]:
elo_diff_pd = predict_pd.apply(get_ELO_diff, axis=1)
predict_pd.drop('Elo_home', inplace=True, axis=1)
predict_pd.drop('Elo_away', inplace=True, axis=1)
predict_pd.insert(loc=5, column="ELO_DIFF", value=elo_diff_pd.astype('Int64')) 
        
recent_perf_diff_pd = predict_pd.apply(get_recent_goal_diff_diff, axis=1)
predict_pd.drop('HOME_LASTEST_GOAL_DIFF', inplace=True, axis=1)
predict_pd.drop('AWAY_LASTEST_GOAL_DIFF', inplace=True, axis=1)
predict_pd.insert(loc=6, column="RECENT_PERF_DIFF", value=recent_perf_diff_pd.astype('Int64')) 

goal_diff_pd = predict_pd.apply(get_home_away_total_goal_diff, axis=1)
predict_pd.drop('HOMETEAM_HOME_GOAL_SO_FAR', inplace=True, axis=1)
predict_pd.drop('HOMETEAM_AWAY_GOAL_SO_FAR', inplace=True, axis=1)
predict_pd.drop('AWAYTEAM_HOME_GOAL_SO_FAR', inplace=True, axis=1)
predict_pd.drop('AWAYTEAM_AWAY_GOAL_SO_FAR', inplace=True, axis=1)
predict_pd.insert(loc=7, column="HOME_AWAY_GOAL_DIFF", value=recent_perf_diff_pd.astype('Int64')) 

# delete no value column
predict_pd.drop('Season', inplace=True, axis=1)
predict_pd.drop('Round', inplace=True, axis=1)
predict_pd.drop('Home_Team', inplace=True, axis=1)
predict_pd.drop('Away_Team', inplace=True, axis=1)
predict_pd.insert(4, 'League', predict_pd.pop('League'))        # need this field for filter

predict_pd

Unnamed: 0,ELO_DIFF,RECENT_PERF_DIFF,HOME_AWAY_GOAL_DIFF,Link,League
0,15,5,5,https://www.besoccer.com/match/hull-city/hudde...,championship
1,-18,-6,-6,https://www.besoccer.com/match/blackpool-fc/no...,championship
2,14,6,6,https://www.besoccer.com/match/afc-bournemouth...,championship
3,-10,3,3,https://www.besoccer.com/match/barnsley-fc/rea...,championship
4,-18,1,1,https://www.besoccer.com/match/cardiff-city-fc...,championship
...,...,...,...,...,...
5,12,0,0,https://www.besoccer.com/match/parma-fc/como/2...,serie_b
6,-5,-1,-1,https://www.besoccer.com/match/as-cittadella/p...,serie_b
7,12,1,1,https://www.besoccer.com/match/ac-monza-brianz...,serie_b
8,-4,-7,-7,https://www.besoccer.com/match/pisa-calcio/bre...,serie_b


In [281]:
import requests
import re

In [282]:
# get final score by the link
def getMatchHomeWin(record):
    url = record["Link"]
    r = requests.get(url)
    x = re.findall(r'<span class="r1">(.*?)</span> - <span class="r2">(.*?)</span>', r.text)
    if (int(x[0][0]) > int(x[0][1])):
        return 1
    else:
        return 0



#isHomeWin = getMatchHomeWin("https://www.besoccer.com/match/trofense/leixoes/202235612")
#print(isHomeWin)



In [283]:
homewin_pd = predict_pd.apply(getMatchHomeWin, axis=1)
homewin_pd

0    0
1    0
2    1
3    0
4    0
    ..
5    1
6    0
7    1
8    0
9    0
Length: 138, dtype: int64

In [284]:
homewin_list = np.array(homewin_pd.values.tolist())
home_win_flag_pd = pd.DataFrame(homewin_list, columns=["HOME_WIN"])
predict_pd.insert(loc=4, column="HOME_WIN", value=home_win_flag_pd["HOME_WIN"].astype('Int64')) 

In [285]:
predict_pd

Unnamed: 0,ELO_DIFF,RECENT_PERF_DIFF,HOME_AWAY_GOAL_DIFF,Link,HOME_WIN,League
0,15,5,5,https://www.besoccer.com/match/hull-city/hudde...,0,championship
1,-18,-6,-6,https://www.besoccer.com/match/blackpool-fc/no...,0,championship
2,14,6,6,https://www.besoccer.com/match/afc-bournemouth...,1,championship
3,-10,3,3,https://www.besoccer.com/match/barnsley-fc/rea...,0,championship
4,-18,1,1,https://www.besoccer.com/match/cardiff-city-fc...,0,championship
...,...,...,...,...,...,...
5,12,0,0,https://www.besoccer.com/match/parma-fc/como/2...,0,serie_b
6,-5,-1,-1,https://www.besoccer.com/match/as-cittadella/p...,1,serie_b
7,12,1,1,https://www.besoccer.com/match/ac-monza-brianz...,0,serie_b
8,-4,-7,-7,https://www.besoccer.com/match/pisa-calcio/bre...,0,serie_b


In [286]:
# export to csv
predict_pd.to_csv('to_predict.csv', index=False)