In [1]:
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from fake_useragent import UserAgent
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from datetime import datetime, date, timedelta
from itertools import chain
from collections import Counter
import time, os, psycopg2, json, requests, re
import pandas as pd

from bs4 import BeautifulSoup
import numpy as np

### Historic Match Scores & Details Extraction (Website: https://www.flashscore.nl/)

In [307]:
def _group_match_rows(items, has_result_marker):
    '''Distributes a flat list of scraped table cells over the six match columns.

    Cells are assigned in order to date, league, home_club, away_club,
    home_club_goal and away_club_goal. What follows the sixth cell depends on
    the table being parsed:

    * has_result_marker=True (team form tables): cells that parse as integers
      are skipped; the first cell that does not parse as an integer is
      discarded and starts the next row.
    * has_result_marker=False (head-to-head table): cells of three characters
      or fewer are skipped; the first longer cell is stored as the date of the
      next row.

    Returns a dict mapping each column name to the list of collected values.
    '''
    columns = ('date', 'league', 'home_club', 'away_club', 'home_club_goal', 'away_club_goal')
    grouped = {column: [] for column in columns}
    position = 0
    for item in items:
        if position < len(columns):
            grouped[columns[position]].append(item)
            position += 1
        elif has_result_marker:
            #Unexpected numeric extras are ignored; the first non-numeric cell
            #(presumably the result badge of the row - TODO confirm) closes the row
            try:
                int(item)
            except ValueError:
                position = 0
        elif len(item) > 3:
            #Short leftovers are ignored; a longer cell is treated as the next row's date
            grouped['date'].append(item)
            position = 1
    return grouped


def matches_scores(url):
    '''Extracts the historic match scores of the two teams paired up for an upcoming match.

    Opens the head-to-head page at `url` in a headless Chrome session, expands
    every "showMore" element and reads the text of the "h2h" elements. The text
    is split into three sections using the "LAATSTE WEDSTRIJDEN" and
    "HEAD-TO-HEAD" sub-table headings.

    Parameters:
        url: address of the head-to-head page of the upcoming match.

    Returns:
        A tuple (home_team_matches, away_team_matches, head2head_matches) of
        JSON strings. Each encodes a dict with the keys date, league,
        home_club, away_club, home_club_goal and away_club_goal.

    Raises:
        IndexError: when fewer than three sub-tables are found on the page.
        Selenium exceptions (for example a timeout on the cookie prompt) are
        propagated after the browser has been shut down.
    '''
    options = Options()

    #Sets up a fake browser
    ua = UserAgent()
    userAgent = ua.random
    options.add_argument(f'user-agent={userAgent}')
    options.add_argument('--blink-settings=imagesEnabled=false')
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(r'C:\Users\hp\Documents\Our Documents\Personal Development\Projects\Client Projects\Dexter Hadeveld (Upwork)\Enhancing-Sports-Insight\chromedriver-win64\chromedriver.exe',options=options)
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="onetrust-accept-btn-handler"]'))).click()

        #sleeps because the page has to load the data before scraping
        time.sleep(3)

        for elem in driver.find_elements(By.CLASS_NAME, "showMore"):
            elem.click()

        #gets the data by class name
        contents = driver.find_elements(By.CLASS_NAME, "h2h")
        content = [con.get_attribute('innerText') for con in contents]
    finally:
        #Always shuts the browser down, also when the page could not be scraped
        driver.quit()

    #extracts the content of each H2H page and cleans it
    remove = ['Toon meer wedstrijden']
    clean = [cell for block in content for cell in block.replace('\n', ',').split(',')]
    clean = [cell for cell in clean if cell not in remove]

    #Using the sub-table heads to split the data between the last matches of the home and away teams and their head2head
    start_indices = [i for i, item in enumerate(clean) if "LAATSTE WEDSTRIJDEN" in item or "HEAD-TO-HEAD" in item]
    if len(start_indices) < 3:
        raise IndexError(f"expected 3 match sub-tables on {url}, found {len(start_indices)}")
    end_indices = start_indices[1:] + [len(clean)]
    first_list = clean[start_indices[0]:end_indices[0]]
    second_list = clean[start_indices[1]:end_indices[1]]
    third_list = clean[start_indices[2]:end_indices[2]]

    #Removes the sub-table heads to only have the useful information for the analysis
    first_list = [word for word in first_list if 'LAATSTE WEDSTRIJDEN' not in word]
    second_list = [word for word in second_list if 'LAATSTE WEDSTRIJDEN' not in word]
    third_list = [word for word in third_list if 'HEAD-TO-HEAD' not in word]

    #Groups the cells of every sub-table by column and serialises the result
    home_team_matches = json.dumps(_group_match_rows(first_list, has_result_marker=True))
    away_team_matches = json.dumps(_group_match_rows(second_list, has_result_marker=True))
    head2head_matches = json.dumps(_group_match_rows(third_list, has_result_marker=False))

    return home_team_matches, away_team_matches, head2head_matches

In [308]:
def activity_times(content):
    '''Returns the minute of every match event as a string.

    For each entry, the text before the first apostrophe or line break is
    taken as the minute. Overtime written as "A+B" is replaced by the sum of
    A and B.
    '''
    stamps = [raw.replace("\n", "'").split("'")[0] for raw in content]

    def total(stamp):
        #Overtime is written as regular+added, so both parts are added together
        if '+' not in stamp:
            return stamp
        parts = stamp.split('+')
        return str(int(parts[0]) + int(parts[1]))

    return [total(stamp) for stamp in stamps]

In [309]:
def activity_type(activities):
    '''Maps the HTML of every match event to the name of its activity type.

    The first fragment containing 'svg class' is inspected for known keywords;
    the first keyword that applies decides the activity. Raises IndexError
    when an entry has no such fragment or no keyword applies.
    '''
    labels = {'card':'red-yellowcard','red':'redcard','soccer':'goal','substitution':'substitution','var':'var','warning':'penalty(missed)','yellow':'yellowcard'}
    resolved = []
    for markup in activities:
        svg_fragments = [piece for piece in re.split(r'[<>]', markup) if 'svg class' in piece]

        candidates = []
        for keyword, label in labels.items():
            if keyword not in svg_fragments[0]:
                continue
            #A bare 'card' only means red-yellow card when neither 'red' nor 'yellow' is present
            if keyword == 'card' and ('red' in svg_fragments[0] or 'yellow' in svg_fragments[0]):
                continue
            candidates.append(label)

        resolved.append(candidates[0])
    return resolved

In [311]:
def _bucket_events(texts, markups, header):
    '''Groups the events of one side of a match by activity type.

    `texts` and `markups` are the innerText and innerHTML of the same event
    rows. `header` holds the match-level fields, which are copied to the front
    of the result. Returns a dict in which every activity type maps to the
    list of minutes at which it occurred.
    '''
    details = {field: list(values) for field, values in header.items()}
    for bucket in ('goal', 'penalty(missed)', 'redcard', 'red-yellowcard', 'substitution', 'var', 'yellowcard'):
        details[bucket] = []
    for minute, kind in zip(activity_times(texts), activity_type(markups)):
        details[kind].append(minute)
    return details


def matches_details(team, url):
    '''Extracts the in-match details of the historic matches listed for a team.

    The first five "h2h__row" entries of the page at `url` are opened one by
    one, each in a fresh headless Chrome session. For every match the events
    of `team` and of its opponent are grouped by activity type.

    Parameters:
        team: name of the team under analysis; it must be contained in the
            home or away team name shown on the match page.
        url: address of the page listing the team's previous matches.

    Returns:
        A JSON string mapping the row number ("0", "1", ...) to a dict with
        the keys 'team' and 'opponent', or to an empty dict when that match
        could not be processed.

    Errors of individual matches are recorded in the module-level
    `except_messgs` dict, which must exist when this function is called.
    Errors while loading the listing page itself are propagated.
    '''
    home_rows = '.smv__participantRow.smv__homeParticipant'
    away_rows = '.smv__participantRow.smv__awayParticipant'

    def new_driver():
        #Sets up a fake browser
        options = Options()
        ua = UserAgent()
        userAgent = ua.random
        options.add_argument(f'user-agent={userAgent}')
        options.add_argument('--blink-settings=imagesEnabled=false')
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        return webdriver.Chrome(r'C:\Users\hp\Documents\Our Documents\Personal Development\Projects\Client Projects\Dexter Hadeveld (Upwork)\Enhancing-Sports-Insight\chromedriver-win64\chromedriver.exe',options=options)

    def text_of(driver, class_name):
        return driver.find_element(By.CLASS_NAME, class_name).get_attribute('textContent')

    #Loads the listing once to find out how many matches (at most five) have to be visited
    driver = new_driver()
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="onetrust-accept-btn-handler"]'))).click()
        match_count = len(driver.find_elements(By.CLASS_NAME, "h2h__row")[:5])
    finally:
        driver.quit()

    list_of_details = {}

    #Visits each match in its own browser session
    for count in range(match_count):
        driver = None
        try:
            driver = new_driver()

            #Loads up the url using the chromedriver and clicks the cookie prompt
            driver.get(url)
            WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="onetrust-accept-btn-handler"]'))).click()

            #Clicks the table row of the match and switches the driver to the newly opened page
            rows = driver.find_elements(By.CLASS_NAME, "h2h__row")
            rows[count].click()
            driver.switch_to.window(driver.window_handles[-1])

            match_date = text_of(driver, 'duelParticipant__startTime')
            tournament = text_of(driver, 'tournamentHeader__country')
            match_score = text_of(driver, 'detailScore__wrapper')

            #A single class name also matches elements that carry the extra winner class
            home_teamname = text_of(driver, 'duelParticipant__home')
            away_teamname = text_of(driver, 'duelParticipant__away')

            #Based on which role the team played (home or away), the selectors are decided
            if team in home_teamname:
                team_selector, opponent_selector = home_rows, away_rows
            elif team in away_teamname:
                team_selector, opponent_selector = away_rows, home_rows
            else:
                #Without this guard the selectors of the previous match would be reused
                raise ValueError(f"{team} is neither {home_teamname} nor {away_teamname}")

            #Extracts the contents of the match while the page is still open
            contents_1 = driver.find_elements(By.CSS_SELECTOR, team_selector)
            contents_2 = driver.find_elements(By.CSS_SELECTOR, opponent_selector)
            content_1 = [con.get_attribute('innerText') for con in contents_1]
            activities_1 = [con.get_attribute('innerHTML') for con in contents_1]
            content_2 = [con.get_attribute('innerText') for con in contents_2]
            activities_2 = [con.get_attribute('innerHTML') for con in contents_2]

            #Groups the match activities time by the type of activity (yellow card, substitution etc.)
            header = {'tournament':[tournament],'date':[match_date],'teams':[home_teamname,away_teamname],
                      'match_score':[match_score]}
            details_1 = _bucket_events(content_1, activities_1, header)
            details_2 = _bucket_events(content_2, activities_2, header)

            list_of_details[str(count)] = {'team': details_1, 'opponent': details_2}
        except Exception as e:
            setup = f"{team}:{url} ({count}) (Inner-Match-Det)"
            except_messgs[f"({setup})"] = f"{type(e).__name__}: {e}" #Catches and Records Error
            list_of_details[str(count)] = {}
        finally:
            #Shuts down the whole session (all windows), also after a failure
            if driver is not None:
                driver.quit()

    list_of_details = json.dumps(list_of_details)

    return list_of_details

In [312]:
def data_loader(dataset):
    '''Loads the rows of `dataset` into the historic_match table of the PostgreSQL database.

    Parameters:
        dataset: DataFrame whose twelve columns are, in order: date, hometeam,
            awayteam, match_urls, home_urls, away_urls, league,
            home_team_matches, away_team_matches, head2head_matches,
            home_team_matchespattern, away_team_matchespattern.

    The password is read from the PGPASSWORD environment variable (KeyError
    when it is not set). PGHOST, PGPORT, PGDATABASE and PGUSER override the
    default connection settings. The table is created when missing and all
    rows are inserted in one transaction, which is rolled back on failure.
    '''
    #One plain tuple per DataFrame row, so multiple rows are inserted at a time
    match_data = list(dataset.itertuples(index=False, name=None))

    #PostgreSQL database connection parameters
    #NOTE(review): the password used to be committed in this cell; rotate it
    connection_params = {
        "host": os.environ.get("PGHOST", "ec2-34-251-233-253.eu-west-1.compute.amazonaws.com"),
        "port": os.environ.get("PGPORT", "5432"),
        "database": os.environ.get("PGDATABASE", "d6smqg5adrljks"),
        "user": os.environ.get("PGUSER", "tzjbmiqqkxkuxu"),
        "password": os.environ["PGPASSWORD"]
    }

    #Create the table in the database
    create_query = '''CREATE TABLE IF NOT EXISTS historic_match (
        date VARCHAR,
        hometeam VARCHAR,
        awayteam VARCHAR,
        match_urls VARCHAR,
        home_urls VARCHAR,
        away_urls VARCHAR,
        league VARCHAR,
        home_team_matches JSONB,
        away_team_matches JSONB,
        head2head_matches JSONB,
        home_team_matchespattern JSONB,
        away_team_matchespattern JSONB
    );'''

    #Insert all the data into the table multiple rows at a time
    insert_query = "INSERT INTO historic_match (date, hometeam, awayteam, match_urls, home_urls, away_urls, league, home_team_matches, away_team_matches, head2head_matches, home_team_matchespattern, away_team_matchespattern) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"

    #Connect to PostgreSQL
    connection = psycopg2.connect(**connection_params)
    try:
        #The connection context commits on success and rolls back on error;
        #the cursor context closes the cursor
        with connection:
            with connection.cursor() as cursor:
                cursor.execute(create_query)
                cursor.executemany(insert_query, match_data)
    finally:
        #The connection context does not close the connection itself
        connection.close()

In [342]:
def match_extraction(leagues_list, today, tomorrow):
    '''Extracts the matches scheduled for today or tomorrow in every league and loads them into the database.

    For each league the schedule page is scraped, the fixtures kicking off on
    `today` or `tomorrow` are selected, and for each of them the historic
    scores (matches_scores) and in-match details (matches_details) of both
    teams are collected. The resulting frame is loaded with data_loader.
    Frames that fail to load twice are retried together at the end.

    Parameters:
        leagues_list: dict mapping a league name to a list whose first item is
            the url of the league's schedule page.
        today, tomorrow: datetime.date values selecting the fixtures to keep.

    Returns None. Every error is recorded in the module-level `except_messgs`
    dict, which must exist when this function is called.
    '''
    leagues_dataset = {} #Frames that could not be loaded, keyed by league, kept for a final retry

    for key in list(leagues_list.keys()):
        league_counter = 0
        try:
            #TODO: retry the page load in a loop until it succeeds, so that a poor
            #network or a wrongly loaded site does not drop the league.
            league_url = leagues_list[key][0]

            #Sets up a fake browser
            options = Options()
            ua = UserAgent()
            userAgent = ua.random
            options.add_argument(f'user-agent={userAgent}')
            options.add_argument('--blink-settings=imagesEnabled=false')
            options.add_argument("--headless")
            options.add_argument("--no-sandbox")

            driver = webdriver.Chrome(r'C:\Users\hp\Documents\Our Documents\Personal Development\Projects\Client Projects\Dexter Hadeveld (Upwork)\Enhancing-Sports-Insight\chromedriver-win64\chromedriver.exe',options=options)
            try:
                #Loads up the url using the chromedriver and clicks the cookie prompt
                driver.get(league_url)
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="onetrust-accept-btn-handler"]'))).click()

                #Extracts the contents of the table of scheduled matches
                contents = driver.find_elements(By.XPATH, '//*[@title ="Klik voor wedstrijddetails!"]')
                content = [con.get_attribute('innerText') for con in contents]
                id_match = [con.get_attribute('id') for con in contents]
            finally:
                driver.quit() #Closes driver before the transformation, also when scraping failed

            #Transformation: sets up urls to the head2head, home and away team form for previous matches
            clean = [entry.replace('\n-\n-', '').replace('\n', ',').split(',') for entry in content]

            #This filters the scheduled matches of any cancelled matches
            zipped_lists = [pair for pair in zip(clean, id_match) if len(pair[0]) == 3]
            clean, id_match = zip(*zipped_lists)

            #Prepares the links needed to extract the historic match scores and in-match details
            base_url = 'https://www.flashscore.nl/wedstrijd/'
            match_ids = [string.replace('g_1_', '') for string in id_match]
            df = pd.DataFrame(clean, columns=['Date/Time', 'HomeTeam', 'AwayTeam'])
            df['match_urls'] = [f"{base_url}{match_id}/#/h2h/overall" for match_id in match_ids]
            df['home_urls'] = [f"{base_url}{match_id}/#/h2h/home" for match_id in match_ids]
            df['away_urls'] = [f"{base_url}{match_id}/#/h2h/away" for match_id in match_ids]

            #Converting the date column to datetime. The scraped text has no year, so the
            #year of `today` is used; dates that would fall before `today` are moved to the
            #next year (assumes the schedule lists upcoming fixtures only - TODO confirm).
            kickoff = pd.to_datetime(df['Date/Time'] + f'.{today.year}', format='%d.%m. %H:%M.%Y')
            rolled_over = kickoff < pd.Timestamp(today)
            df['Date/Time'] = kickoff.where(~rolled_over, kickoff + pd.DateOffset(years=1))

            #filters the dataframe prepared using the current date
            is_due = (df['Date/Time'].dt.date == today) | (df['Date/Time'].dt.date == tomorrow)
            today_df = df[is_due].copy(deep=True)
            today_df['League'] = [key] * len(today_df)

            #Reads the columns once instead of rebuilding the lists on every iteration
            home_teams = list(today_df['HomeTeam'])
            away_teams = list(today_df['AwayTeam'])
            match_urls = list(today_df['match_urls'])
            home_urls = list(today_df['home_urls'])
            away_urls = list(today_df['away_urls'])

            hometeam_form = []
            awayteam_form = []
            head2head = []
            home_details = []
            away_details = []

            #for each match url extract the head2head, home team and away team game scores of the recent games
            for home_team, away_team, match_url in zip(home_teams, away_teams, match_urls):
                setup_1 = f"{home_team}:{away_team} (Historic Score)"
                try:
                    home_team_matches, away_team_matches, head2head_matches = matches_scores(match_url)
                    hometeam_form.append(home_team_matches)
                    awayteam_form.append(away_team_matches)
                    head2head.append(head2head_matches)
                except Exception as e:
                    except_messgs[str(key)+f": {league_counter} ({setup_1})"] = f"{type(e).__name__}: {e}" #Catches and Records Error
                    empty_json = json.dumps({})
                    hometeam_form.append(empty_json)
                    awayteam_form.append(empty_json)
                    head2head.append(empty_json)
                    league_counter += 1

            #extract the match details (yellow cards, goals, penalties and times etc) of the recent games by the home team
            for home_team, away_team, home_url in zip(home_teams, away_teams, home_urls):
                setup = f"{home_team}:{away_team} (Home Inner-Det)"
                try:
                    home_details.append(matches_details(home_team, home_url))
                except Exception as e:
                    except_messgs[str(key)+f": {league_counter} ({setup})"] = f"{type(e).__name__}: {e}" #Catches and Records Error
                    home_details.append(json.dumps({}))
                    league_counter += 1

            #extract the match details (yellow cards, goals, penalties and times etc) of the recent games by the away team
            for home_team, away_team, away_url in zip(home_teams, away_teams, away_urls):
                setup = f"{home_team}:{away_team} (Away Inner-Det)"
                try:
                    away_details.append(matches_details(away_team, away_url))
                except Exception as e:
                    except_messgs[str(key)+f": {league_counter} ({setup})"] = f"{type(e).__name__}: {e}" #Catches and Records Error
                    away_details.append(json.dumps({}))
                    league_counter += 1

            #Add all these extracted data to the dataframe of daily matches of the current league
            today_df['home_team_matches'] = hometeam_form
            today_df['away_team_matches'] = awayteam_form
            today_df['head2head_matches'] = head2head
            today_df['home_team_matchespattern'] = home_details
            today_df['away_team_matchespattern'] = away_details

            #Loads the extracted league to the database
            for i in range(2): #Tries twice to load data in case of any unforeseen connection issue
                try:
                    data_loader(today_df) #If try is successful, breaks the loop
                    break
                except Exception as e:
                    except_messgs[str(key)+f": {league_counter}"] = f"{type(e).__name__}: {e}" #Catches and Records Error
                    if i < 1: #If try isn't successful but it's the first time, then it tries again
                        continue
                    #If try isn't successful the second time, it keeps the dataframe to try later
                    leagues_dataset[key] = today_df
                    league_counter += 1
        except Exception as e:
            #NOTE(review): this handler covers the whole extraction of the league, not only
            #database loading; the label is kept as it was.
            except_messgs[str(key)+f": {league_counter} (Database Loading)"] = f"{type(e).__name__}: {e}" #Catches and Records Error

    #Retries to load all the data that couldn't be loaded during extraction into the database
    if leagues_dataset:
        #All the dataframes of the leagues that failed to load are concatenated vertically
        final_dataset = pd.concat(list(leagues_dataset.values()), axis=0)

        for i in range(2): #Tries twice to load data in case of any unforeseen connection issue
            try:
                data_loader(final_dataset) #If try is successful, breaks the loop
                break
            except Exception as e:
                except_messgs[f"(Final Database Loading): {i}"] = f"{type(e).__name__}: {e}" #Catches and Records Error
        

In [314]:
#Dictionary mapping each league name to the urls of its pages on the different sites extracted from.
#The first url of every entry is the flashscore schedule page; an empty string marks a missing url.

leagues_list = {
    'English Premier League':['https://www.flashscore.nl/voetbal/engeland/premier-league/schema/', 'https://www.worldfootball.net/all_matches/eng-premier-league-2023-2024/', 'https://www.sofascore.com/tournament/football/england/premier-league/17#52186'],
    'Championship':['https://www.flashscore.nl/voetbal/engeland/championship/schema/','https://www.worldfootball.net/all_matches/eng-championship-2023-2024/'],
    'EFL Championship':['https://www.flashscore.nl/voetbal/engeland/efl-cup/schema/', ''],
    'EFL Trophy':['https://www.flashscore.nl/voetbal/engeland/efl-trophy/schema/', 'https://www.worldfootball.net/all_matches/eng-efl-trophy-2023-2024/'],
    'La Liga':['https://www.flashscore.nl/voetbal/spanje/laliga/schema/','https://www.worldfootball.net/all_matches/esp-primera-division-2023-2024/'],
    'La Liga2':['https://www.flashscore.nl/voetbal/spanje/laliga2/schema/', 'https://www.worldfootball.net/all_matches/esp-segunda-division-2023-2024/'],
    'Bundesliga':['https://www.flashscore.nl/voetbal/duitsland/bundesliga/schema/', 'https://www.worldfootball.net/all_matches/bundesliga-2023-2024/'],
    'Serie A':['https://www.flashscore.nl/voetbal/italie/serie-a/schema/', 'https://www.worldfootball.net/all_matches/ita-serie-a-2023-2024/'],
    'Serie B':['https://www.flashscore.nl/voetbal/italie/serie-b/schema/', 'https://www.worldfootball.net/all_matches/ita-serie-b-2023-2024/'],
    'Ligue 1':['https://www.flashscore.nl/voetbal/frankrijk/ligue-1/schema/', 'https://www.worldfootball.net/all_matches/fra-ligue-1-2023-2024/'],
    'Ligue 2':['https://www.flashscore.nl/voetbal/frankrijk/ligue-2/schema/', 'https://www.worldfootball.net/all_matches/fra-ligue-2-2023-2024/'],
    'Brasileirão Serie A':['https://www.flashscore.nl/voetbal/brazilie/braziliaanse-competitie/schema/', 'https://www.worldfootball.net/all_matches/bra-serie-a-2023/'],
    'Primera División':['https://www.flashscore.nl/voetbal/argentinie/primera-d/schema/', ''],
    'Major League Soccer':['https://www.flashscore.nl/voetbal/usa/mls/schema/', 'https://www.worldfootball.net/all_matches/usa-major-league-soccer-2023/'],
    'Eredivisie':['https://www.flashscore.nl/eredivisie/schema/', 'https://www.worldfootball.net/all_matches/ned-eredivisie-2023-2024/'],
    'Primeira Liga':['https://www.flashscore.nl/voetbal/portugal/liga-portugal/schema/', 'https://www.worldfootball.net/all_matches/por-primeira-liga-2023-2024/'],
    'J1 League':['https://www.flashscore.nl/voetbal/japan/j1-league/schema/', 'https://www.worldfootball.net/all_matches/jpn-j1-league-2023/'],
    'Scottish Premiership':['https://www.flashscore.nl/voetbal/schotland/premiership/schema/', 'https://www.worldfootball.net/all_matches/sco-premiership-2023-2024/'],
    'Superliga':['https://www.flashscore.nl/voetbal/denemarken/superliga/schema/', 'https://www.worldfootball.net/all_matches/den-superligaen-2023-2024/'],
    'Süper Lig':['https://www.flashscore.nl/voetbal/turkije/super-lig/schema/', 'https://www.worldfootball.net/all_matches/tur-sueperlig-2023-2024/'],
    'Allsvenskan':['https://www.flashscore.nl/voetbal/zweden/allsvenskan/schema/', 'https://www.worldfootball.net/all_matches/swe-allsvenskan-2023/'],
    'Saudi Professional League':['https://www.flashscore.nl/voetbal/saoedi-arabie/premier-league/schema/', 'https://www.worldfootball.net/all_matches/ksa-saudi-pro-league-2023-2024/'],
    'Jupiler Pro League':['https://www.flashscore.nl/voetbal/belgie/jupiler-pro-league/schema/', 'https://www.worldfootball.net/all_matches/bel-eerste-klasse-a-2023-2024/'],
    'UEFA Champions League':['https://www.flashscore.nl/voetbal/europa/champions-league/schema/', 'https://www.worldfootball.net/all_matches/champions-league-2023-2024/'],
    'UEFA Europa League':['https://www.flashscore.nl/voetbal/europa/europa-league/schema/', 'https://www.worldfootball.net/all_matches/europa-league-2023-2024/'],
    'UEFA Europa Conference League':['https://www.flashscore.nl/voetbal/europa/europa-conference-league/schema/', 'https://www.worldfootball.net/all_matches/europa-conference-league-2023-2024/'],
}

In [None]:
#Abbreviation used for every league name.
#NOTE(review): 'SA' is shared by Serie A and Brasileirão Serie A, and 'SL' by Superliga and Süper Lig.
leagues_abbrv = {
    'English Premier League': 'PL',
    'Championship': 'CHA',
    'EFL Championship': 'EFL',
    'EFL Trophy': 'TRO',
    'La Liga': 'LL',
    'La Liga2': 'LL2',
    'Bundesliga': 'BUN',
    'Serie A': 'SA',
    'Serie B': 'SB',
    'Ligue 1': 'L1',
    'Ligue 2': 'L2',
    'Brasileirão Serie A': 'SA',
    'Primera División': 'PD',
    'Major League Soccer': 'MLS',
    'Eredivisie': 'ERE',
    'Primeira Liga': 'LP',
    'J1 League': 'J1',
    'Scottish Premiership': 'PRE',
    'Superliga': 'SL',
    'Süper Lig': 'SL',
    'Allsvenskan': 'ALL',
    'Saudi Professional League': 'SPL',
    'Jupiler Pro League': 'JL',
    'UEFA Champions League': 'CL',
    'UEFA Europa League': 'EL',
    'UEFA Europa Conference League': 'ECL',
}

In [2]:
#Reduced league selection for trial runs; uncomment an entry to include that league.
testleagues_list = {
    #'English Premier League':['https://www.flashscore.nl/voetbal/engeland/premier-league/schema/', 'https://www.worldfootball.net/all_matches/eng-premier-league-2023-2024/', 'https://www.sofascore.com/tournament/football/england/premier-league/17#52186'],
    #'EFL Championship':['https://www.flashscore.nl/voetbal/engeland/efl-cup/schema/', ''],
    #'EFL Trophy':['https://www.flashscore.nl/voetbal/engeland/efl-trophy/schema/', 'https://www.worldfootball.net/all_matches/eng-efl-trophy-2023-2024/'],
    'Ligue 1': [
        'https://www.flashscore.nl/voetbal/frankrijk/ligue-1/schema/',
        'https://www.worldfootball.net/all_matches/fra-ligue-1-2023-2024/',
    ],
    #'La Liga':['https://www.flashscore.nl/voetbal/spanje/laliga/schema/','https://www.worldfootball.net/all_matches/esp-primera-division-2023-2024/'],
    #'La Liga2':['https://www.flashscore.nl/voetbal/spanje/laliga2/schema/', 'https://www.worldfootball.net/all_matches/esp-segunda-division-2023-2024/']
    #'Serie B':['https://www.flashscore.nl/voetbal/italie/serie-b/schema/', 'https://www.worldfootball.net/all_matches/ita-serie-b-2023-2024/'],
    #'Major League Soccer':['https://www.flashscore.nl/voetbal/usa/mls/schema/', 'https://www.worldfootball.net/all_matches/usa-major-league-soccer-2023/'],
    'Bundesliga': [
        'https://www.flashscore.nl/voetbal/duitsland/bundesliga/schema/',
        'https://www.worldfootball.net/all_matches/bundesliga-2023-2024/',
    ],
}

In [15]:
#Takes one snapshot of the current date so that both values stay exactly one day apart,
#even when the cell runs around midnight
today = date.today()
tomorrow = today + timedelta(days=1)
#print((today.day % 2), (tomorrow.day %2))

In [47]:
#Reference dates for the extraction run. The clock is read once and the neighbouring days are derived
#from it, so the three values stay consecutive even if the cell executes across midnight.
today = date.today()
yesterday = today - timedelta(days=1)
tomorrow = today + timedelta(days=1)
print(yesterday, today, tomorrow)

2023-09-29 2023-09-30 2023-10-01


In [4]:
print(today, tomorrow)

2023-09-20 2023-09-22


In [318]:
%%time
#Resets the global error log before the run.
#NOTE(review): match_extraction is defined outside this section; presumably it records caught errors in
#`except_messgs` the same way refreehist_extraction does - confirm.
except_messgs = {}
match_extraction(testleagues_list, tomorrow)

CPU times: total: 1.88 s
Wall time: 3min 52s


In [None]:
except_messgs

In [None]:
#TODO: Before scraping, check whether the historic matches of a team in a given fixture have already been extracted; if so, read them from the database instead of scraping them again.
#TODO: In the analysis, also check team performance specifically for a given league.
    #Hence, link each entry to the league in which the match was played.


#TODO: Catch exception messages and send them to the developers by email.
#TODO: Convert the analysis pipeline into an app.

In [None]:
#Converting the data extracted to a DataFrame for analysis
#NOTE(review): `rows` is not defined in any of the surrounding cells - presumably it is the result of a
#database fetch run elsewhere; confirm it exists before running this cell on a fresh kernel.
df = pd.DataFrame(rows, columns=['date', 'hometeam', 'awayteam', 'match_urls', 'home_urls', 'away_urls', 'league', 'home_team_matches', 'away_team_matches', 'head2head_matches', 'home_team_matchespattern', 'away_team_matchespattern'])
df

---

### Referee Details Extraction

In [5]:
def refdata_loader(dataset):
    '''Bulk-inserts every row of `dataset` into the `ref_historic_match` PostgreSQL table.

    The table is created first if it does not exist yet. `dataset` must be a DataFrame whose ten columns
    are in the same order as the columns of the INSERT statement below (date, time, hometeam, awayteam,
    result, matchlink, league, refereelink, referee_matchistlink, referee_matchhistdetails).

    The whole load runs in one transaction: it is committed only if every statement succeeds, otherwise it
    is rolled back and the original exception is re-raised. The connection is closed in both cases.
    '''
    #One plain tuple per row, in column order, so executemany can bind each row to the placeholders
    ref_data = list(dataset.itertuples(index=False, name=None))

    #PostgreSQL database connection parameters.
    #Each value can be overridden through a REF_DB_* environment variable.
    #FIXME(security): the literal fallbacks below are real credentials committed to the notebook. They are
    #kept only so existing runs keep working - rotate the password and remove them once the environment
    #variables are configured.
    connection_params = {
        "host": os.environ.get("REF_DB_HOST", "ec2-34-251-233-253.eu-west-1.compute.amazonaws.com"),
        "port": os.environ.get("REF_DB_PORT", "5432"),
        "database": os.environ.get("REF_DB_NAME", "d6smqg5adrljks"),
        "user": os.environ.get("REF_DB_USER", "tzjbmiqqkxkuxu"),
        "password": os.environ.get("REF_DB_PASSWORD", "1630e53a904f24ef0d85affbd499bd6f3cbbc1c43c47ca8d4180ba8841ee7676")
    }

    create_query = '''CREATE TABLE IF NOT EXISTS ref_historic_match (
        date VARCHAR,
        time VARCHAR,
        hometeam VARCHAR,
        awayteam VARCHAR,
        result VARCHAR,
        matchlink VARCHAR,
        league VARCHAR,
        refereelink VARCHAR,
        referee_matchistlink JSONB,
        referee_matchhistdetails JSONB
    );'''

    #Insert all the data into the table multiple rows at a time
    insert_query = "INSERT INTO ref_historic_match (date, time, hometeam, awayteam, result, matchlink, league, refereelink, referee_matchistlink, referee_matchhistdetails) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"

    #Connect to PostgreSQL
    connection = psycopg2.connect(**connection_params)
    try:
        #The cursor context manager closes the cursor even when a statement fails
        with connection.cursor() as cursor:
            cursor.execute(create_query)
            cursor.executemany(insert_query, ref_data)
        connection.commit()
    except Exception:
        #Leave no half-finished transaction behind, then let the caller's retry logic see the error
        connection.rollback()
        raise
    finally:
        connection.close()

In [6]:
_WORLDFOOTBALL_ROOT = 'https://www.worldfootball.net'


def _ref_get_soup(url):
    '''Downloads `url` and returns its body parsed with BeautifulSoup's "html.parser".'''
    return BeautifulSoup(requests.get(url).content, "html.parser")


def _ref_league_fixtures(league_url, league_name, match_date):
    '''Scrapes a league's "all matches" page and returns the fixtures dated `match_date` as a DataFrame.

    The returned frame has the columns Date, Time, Home Team, Score, Away Team, Result, Links, League.
    '''
    soup = _ref_get_soup(league_url)
    time.sleep(2)

    #Extracts the table with the match schedules
    table = soup.find("table", class_="standard_tabelle")

    data = []
    match_links = []
    for table_row in table.find_all("tr"):
        row_data = [cell.get_text(strip=True) for cell in table_row.find_all("td")]
        if not row_data:
            continue #Rows without any <td> cell carry no fixture

        #Extracts the url of the match report, if the row has one
        match_link = table_row.find("a", href=lambda href: href and "report" in href)
        data.append(row_data)
        match_links.append(match_link["href"] if match_link else '')

    fixtures = pd.DataFrame(data, columns=['Date', 'Time', 'Home Team', 'Score', 'Away Team', 'Result', 'Links'])
    fixtures['Links'] = match_links

    #Rows with a blank date take the date of the closest dated row above them. This has to happen BEFORE rows
    #without a link are dropped, otherwise a fixture would inherit the date of an earlier day whenever the
    #dated row above it is one of the dropped rows.
    fixtures['Date'] = fixtures['Date'].replace('', np.nan).ffill()

    fixtures = fixtures[fixtures['Links'] != ''].copy() #Drops rows with empty url
    fixtures['Links'] = _WORLDFOOTBALL_ROOT + fixtures['Links']
    fixtures['Date'] = pd.to_datetime(fixtures['Date'], format='%d/%m/%Y')

    #Keeps only the fixtures of the requested date
    day_fixtures = fixtures[fixtures['Date'].dt.date == match_date].copy(deep=True)
    day_fixtures['League'] = league_name
    return day_fixtures


def _ref_profile_url(match_url):
    '''Returns the absolute url of the first referee profile linked from a match page, or '' if there is none.'''
    link = _ref_get_soup(match_url).find("a", href=lambda href: href and "referee_summary" in href)
    if link is None:
        return ''
    return f'{_WORLDFOOTBALL_ROOT}{link.get("href")}'


def _ref_season_urls(ref_url, seasons=('2023-2024', '2022-2023')):
    '''Returns the referee-summary urls on a referee profile page that belong to any of `seasons`.'''
    if ref_url == '':
        return []

    def is_season_summary(href):
        #Every condition is tested against href explicitly; chaining bare string literals with `and` would
        #only test the last one.
        return bool(href) and "referee_summary" in href and any(season in href for season in seasons)

    links = _ref_get_soup(ref_url).find_all("a", href=is_season_summary)
    return [f'{_WORLDFOOTBALL_ROOT}{link.get("href")}' for link in links]


def _ref_history_details(season_urls):
    '''Collects the rows of every season summary table into one dict of column lists ({} if no urls).'''
    if not season_urls:
        return {}

    data_dict = {'Date':[], 'Home Team':[], 'Away Team':[], 'Score':[], 'Yellow Cards':[], 'Unkown Card':[], 'Red Cards':[]}
    fields = list(data_dict)
    for season_url in season_urls:
        table = _ref_get_soup(season_url).find("table", class_="standard_tabelle")

        for table_row in table.find_all("tr")[1:]: #The first row of the table is skipped
            entry = [cell.get_text(strip=True) for cell in table_row.find_all("td")]
            if len(entry) < 3:
                continue #Too short to be a match row; skipping it keeps the column lists aligned
            entry.pop(2) #The third cell is discarded; the remaining cells map onto `fields` in order
            for field, value in zip(fields, entry):
                data_dict[field].append(value)
    return data_dict


def refreehist_extraction(leagues_list, today_date):
    '''Scrapes the referee history of every fixture played on `today_date` and loads it into the database.

    Parameters:
        leagues_list: dict mapping a league name to a list of urls; index 1 is the league's
            "all matches" page. Leagues whose index 1 is '' are skipped.
        today_date: datetime.date of the fixtures to extract.

    For each league the fixtures of the day are scraped, enriched with the referee profile url, the urls of
    the referee's season summaries (as a JSON string) and the details of those summaries (as a JSON string),
    then passed to refdata_loader. Leagues with no fixture on `today_date` are skipped without touching the
    database. A league that fails to load twice is kept aside and retried, together with all other failed
    leagues, once every league has been processed.

    Errors are never raised to the caller: they are recorded in the global dict `except_messgs`, which must
    exist before this function is called. Returns None.
    '''
    leagues_dataset = {} #Leagues whose database load failed, kept for the final retry

    for key, league_urls in leagues_list.items(): #Loops through all the leagues in our list of league url
        league_url = league_urls[1]

        league_counter = 0
        try:
            if league_url == '':
                #Checks for empty league links and skips
                continue

            today_df = _ref_league_fixtures(league_url, key, today_date)
            if today_df.empty:
                #No fixture on the requested date: nothing to enrich or load for this league
                continue

            #Link to the profile of the officiating referee of each match
            today_df['Referee_Links'] = [_ref_profile_url(match_url) for match_url in today_df['Links']]

            #Urls of the season summaries of each referee, then the match details found behind them
            season_urls_per_match = [_ref_season_urls(ref_url) for ref_url in today_df['Referee_Links']]
            today_df['Referee_MatchHist_Links'] = [json.dumps({'1': urls}) for urls in season_urls_per_match]
            today_df['Referee_MatchHist_Details'] = [json.dumps(_ref_history_details(urls)) for urls in season_urls_per_match]

            today_df = today_df.drop('Score', axis=1)

            #Loads the extracted league to the database
            for attempt in range(2): #Tries twice to load data in case of any unforeseen connection issue
                try:
                    refdata_loader(today_df)
                    break
                except Exception as e:
                    except_messgs[str(key)+f": {league_counter} (Database Loading)"] = f"{type(e).__name__}: {e}" #Catches and Records Error
                    league_counter += 1 #Gives every failed attempt its own key so no message is overwritten
            else:
                #Both attempts failed: keep the dataframe to try again at the end
                leagues_dataset[key] = today_df

        except Exception as e:
            print('except')
            except_messgs[str(key)+f": {league_counter}"] = f"{type(e).__name__}: {e}" #Catches and Records Error
            continue

    #Retries to load, in one go, all the leagues that couldn't be loaded during extraction
    if leagues_dataset:
        final_dataset = pd.concat(list(leagues_dataset.values()), axis=0)

        for i in range(2): #Tries twice to load data in case of any unforeseen connection issue
            try:
                refdata_loader(final_dataset)
                break
            except Exception as e:
                except_messgs[f"(Final Database Loading): {i} "] = f"{type(e).__name__}: {e}" #Catches and Records Error

In [7]:
%%time
#refreehist_extraction records every error it catches in the global dict `except_messgs`,
#so the dict has to exist (and is reset here) before each run.
except_messgs = {}
refreehist_extraction(testleagues_list, tomorrow)

CPU times: total: 1.06 s
Wall time: 9.6 s


In [None]:
except_messgs

In [None]:
#Converting the data extracted to a DataFrame for analysis
#NOTE(review): `rows` is not defined in any of the surrounding cells - presumably it is the result of a
#database fetch of ref_historic_match run elsewhere; confirm it exists before running on a fresh kernel.
df_ref = pd.DataFrame(rows, columns=['date', 'time', 'hometeam', 'awayteam', 'result', 'matchlink', 'league', 'refereelink', 'referee_matchistlink', 'referee_matchhistdetails'])
df_ref

---

In [875]:
match_df = df.copy(deep=True)

In [306]:
ref_df = df_ref.copy(deep=True)

In [877]:
match_df#.tail(3)

Unnamed: 0,date,hometeam,awayteam,match_urls,home_urls,away_urls,league,home_team_matches,away_team_matches,head2head_matches,home_team_matchespattern,away_team_matchespattern
0,2023-09-23 15:00:00,Crystal Palace,Fulham,https://www.flashscore.nl/wedstrijd/0xCBRsPc/#...,https://www.flashscore.nl/wedstrijd/0xCBRsPc/#...,https://www.flashscore.nl/wedstrijd/0xCBRsPc/#...,English Premier League,{},{},{},"{'1': {'team': {'var': [], 'date': ['03.09.202...","{'1': {'team': {'var': [], 'date': ['02.09.202..."
1,2023-09-23 15:00:00,Luton Town FC,Wolverhampton Wanderers,https://www.flashscore.nl/wedstrijd/Mi0KPLgA/#...,https://www.flashscore.nl/wedstrijd/Mi0KPLgA/#...,https://www.flashscore.nl/wedstrijd/Mi0KPLgA/#...,English Premier League,"{'date': ['16.09.23', '01.09.23', '29.08.23', ...","{'date': ['16.09.23', '03.09.23', '29.08.23', ...","{'date': ['02.08.23', '05.01.13', '03.03.07', ...","{'1': {'team': {'var': [], 'date': ['01.09.202...","{'1': {'team': {'var': [], 'date': ['03.09.202..."
2,2023-09-23 15:00:00,Manchester City,Nottingham Forest,https://www.flashscore.nl/wedstrijd/GSENOu9G/#...,https://www.flashscore.nl/wedstrijd/GSENOu9G/#...,https://www.flashscore.nl/wedstrijd/GSENOu9G/#...,English Premier League,"{'date': ['16.09.23', '02.09.23', '27.08.23', ...","{'date': ['02.09.23', '30.08.23', '26.08.23', ...","{'date': ['18.02.23', '31.08.22', '03.01.09', ...","{'1': {'team': {'var': [], 'date': ['02.09.202...","{'1': {'team': {'var': [], 'date': ['02.09.202..."
3,2023-09-23 17:30:00,Brentford,Everton,https://www.flashscore.nl/wedstrijd/Eowg909T/#...,https://www.flashscore.nl/wedstrijd/Eowg909T/#...,https://www.flashscore.nl/wedstrijd/Eowg909T/#...,English Premier League,"{'date': ['16.09.23', '02.09.23', '29.08.23', ...","{'date': ['17.09.23', '02.09.23', '30.08.23', ...","{'date': ['11.03.23', '27.08.22', '15.05.22', ...","{'1': {'team': {'var': [], 'date': ['02.09.202...","{'1': {'team': {'var': [], 'date': ['02.09.202..."
4,2023-09-23 20:00:00,Burnley,Manchester United,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,English Premier League,"{'date': ['02.09.23', '30.08.23', '27.08.23', ...","{'date': ['16.09.23', '03.09.23', '26.08.23', ...","{'date': ['21.12.22', '08.02.22', '30.12.21', ...","{'1': {'team': {'var': [], 'date': ['02.09.202...","{'1': {'team': {'var': ['88'], 'date': ['03.09..."


In [499]:
ref_df.head(3)

Unnamed: 0,date,time,hometeam,awayteam,result,matchlink,league,refereelink,referee_matchistlink,referee_matchhistdetails
0,2023-09-02 00:00:00,12:30,Sheffield United,Everton FC,2:2 (2:1),https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['10/06/2022', '12/07/2022', '15/09/2..."
1,2023-09-02 00:00:00,15:00,Brentford FC,AFC Bournemouth,2:2 (1:1),https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['29/10/2022', '30/07/2022', '06/08/2..."
2,2023-09-02 00:00:00,15:00,Burnley FC,Tottenham Hotspur,2:5 (1:2),https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['21/09/2022', '24/09/2022', '27/10/2..."


### Historic Scores and Inner Match Details Analysis and Pattern Detection

In [6]:
def indiv_teamrole_analysis(team_matches, team_name, role, skip=False, by_league=None):
    '''Predicts a score from the matches in which `team_name` played the role of its upcoming match.

    Parameters:
        team_matches: historic matches, most recent first, as anything pd.DataFrame accepts. The columns
            home_club, away_club, home_club_goal and away_club_goal are required ('league' too when
            `by_league` is given).
        team_name: the team under analysis, as spelled in home_club / away_club.
        role: 'home' selects the matches where the team was the home club; any other value selects the
            matches where it was the away club.
        skip: if True, only every second match is analysed (the 2nd, 4th, ... most recent).
        by_league: optional pair (league display name, league code). Only matches whose 'league' equals
            the code are analysed; the display name is used in the output text.

    Returns a list of prediction sentences, one per candidate score. The list is empty when there is no
    history, when either side shows no pattern in its two most recent scores, or when one of the candidate
    scores equals the score of the third most recent match the team played in that role.
    '''
    team_df = pd.DataFrame(team_matches)
    if team_df.empty:
        return [] #No history (e.g. {}): nothing to predict from
    team_df = team_df.astype({'home_club_goal': int, 'away_club_goal': int})

    #Two clubs at most means the table is a head-to-head history; this is decided before any filtering
    num_of_club = len(set(team_df['home_club']) | set(team_df['away_club']))

    #Creating the first part of the final string to describe the prediction depending on which set of historical matches (home, away or head-to-head)
    if num_of_club > 2:
        end_string = 'Score prediction for {} team using {} historic match scores, for only matches where {} team played {} role: '.format(role, role, role, role)
    else:
        end_string = 'Score prediction for both teams using head-to-head historic match scores, for only matches where they played the same role as their upcoming game: '

    if by_league is not None:
        team_df = team_df[team_df['league'] == by_league[1]]

        if num_of_club > 2:
            end_string = 'Score prediction for {} team using {} historic match scores filtered by {}, for only matches where {} team played {} role: '.format(role, role, by_league[0], role, role)
        else:
            end_string = 'Score prediction for both team using head-to-head historic match scores filtered by {}, for only matches where they played the same role as their upcoming game: '.format(by_league[0])

    if skip == True:
        #Keeps the rows at odd positions. Selecting by position (not by index label) keeps this correct
        #after the league filter has left gaps in the index.
        team_df = team_df.iloc[1::2]

        if num_of_club > 2:
            end_string = 'Score prediction for {} team using {} historic match scores after skipping rows, for only matches where {} team played {} role: '.format(role, role, role, role)
        else:
            end_string = 'Score prediction for both team using head-to-head historic match scores after skipping rows, for only matches where they played the same role as their upcoming game: '

    #Score of the third most recent match the team played in the given role. Both goals are read from the
    #same row, and the guard counts rows (not columns).
    role_column = 'home_club' if role == 'home' else 'away_club'
    role_df = team_df[team_df[role_column] == team_name]
    if len(role_df) > 2:
        third_recentscore = [int(role_df['home_club_goal'].iloc[2]), int(role_df['away_club_goal'].iloc[2])]
    else:
        third_recentscore = []

    def check(dataframe, column1, column2):
        '''Returns the candidate goals for `column2` from the two most recent rows where `column1` is the team.'''
        recent = list(dataframe.loc[dataframe[column1] == team_name, column2])[:2]
        if len(recent) < 2: #Checks if there's enough data to check for a pattern
            return ['-']
        high, low = max(recent), min(recent)
        if high == low:
            return [recent[0]]
        if high - low == 1:
            #A score of 0 cannot go lower, so only the value above the pair is proposed
            return [high + 1] if 0 in recent else [high + 1, low - 1]
        if high - low == 2:
            return [high - 1]
        return ['-']

    scores = [check(team_df, role_column, 'home_club_goal'), check(team_df, role_column, 'away_club_goal')]
    if ['-'] in scores:
        return []

    #Gets all the possible predictions from the observed patterns
    combined_scores = [[score1, score2] for score1 in scores[0] for score2 in scores[1]]

    #No prediction is issued when the third most recent outcome is among the candidates
    if third_recentscore in combined_scores:
        return []
    return [end_string + '{} - {}'.format(score[0], score[1]) for score in combined_scores]

In [7]:
def indiv_role_analysis(team_matches, role, skip=False, by_league=None):
    '''Predicts a score from the home-side and away-side goals of the table, whichever team scored them.

    Parameters:
        team_matches: historic matches, most recent first, as anything pd.DataFrame accepts. The columns
            home_club, away_club, home_club_goal and away_club_goal are required ('league' too when
            `by_league` is given).
        role: 'home' or 'away'; only used to word the output text.
        skip: if True, only every second match is analysed (the 2nd, 4th, ... most recent).
        by_league: optional pair (league display name, league code). Only matches whose 'league' equals
            the code are analysed; the display name is used in the output text.

    Returns a list of prediction sentences, one per candidate score. The list is empty when there is no
    history, when either side shows no pattern in its two most recent scores, or when one of the candidate
    scores equals the score of the third most recent match.
    '''
    team_df = pd.DataFrame(team_matches)
    if team_df.empty:
        return [] #No history (e.g. {}): nothing to predict from
    team_df = team_df.astype({'home_club_goal': int, 'away_club_goal': int})

    #Two clubs at most means the table is a head-to-head history; this is decided before any filtering
    num_of_club = len(set(team_df['home_club']) | set(team_df['away_club']))

    #Creating the first part of the final string to describe the prediction depending on which set of historical matches (home, away or head-to-head)
    if num_of_club > 2:
        end_string = 'Score prediction for {} role using {} historic match scores, regardless of role played by {} team: '.format(role, role, role)
    else:
        end_string = 'Score prediction for both roles using head-to-head historic match scores, regardless of role played by both teams: '

    if by_league is not None:
        team_df = team_df[team_df['league'] == by_league[1]]

        if num_of_club > 2:
            end_string = 'Score prediction for {} role using {} historic match scores filtered by {}, regardless of role played by {} team: '.format(role, role, by_league[0], role)
        else:
            end_string = 'Score prediction for both roles using head-to-head historic match scores filtered by {}, regardless of role played by both teams: '.format(by_league[0])

    if skip == True:
        #Keeps the rows at odd positions. Selecting by position (not by index label) keeps this correct
        #after the league filter has left gaps in the index.
        team_df = team_df.iloc[1::2]

        if num_of_club > 2:
            end_string = 'Score prediction for {} role using {} historic match scores after skipping rows, regardless of role played by {} team: '.format(role, role, role)
        else:
            end_string = 'Score prediction for both roles using head-to-head historic match scores after skipping rows, regardless of role played by both teams: '

    #Score of the third most recent match, when the table is long enough to have one
    if len(team_df) > 2:
        third_recentscore = [int(team_df['home_club_goal'].iloc[2]), int(team_df['away_club_goal'].iloc[2])]
    else:
        third_recentscore = []

    def check(dataframe, column):
        '''Returns the candidate goals for `column` derived from its two most recent values.'''
        recent = list(dataframe[column])[:2]
        if len(recent) < 2: #Checks if there's enough data to check for a pattern
            return ['-']
        high, low = max(recent), min(recent)
        if high == low:
            return [recent[0]]
        if high - low == 1:
            #A score of 0 cannot go lower, so only the value above the pair is proposed
            return [high + 1] if 0 in recent else [high + 1, low - 1]
        if high - low == 2:
            return [high - 1]
        return ['-']

    scores = [check(team_df, 'home_club_goal'), check(team_df, 'away_club_goal')]
    if ['-'] in scores:
        return []

    #Gets all the possible predictions from the observed patterns
    combined_scores = [[score1, score2] for score1 in scores[0] for score2 in scores[1]]

    #No prediction is issued when the third most recent outcome is among the candidates
    if third_recentscore in combined_scores:
        return []
    return [end_string + '{} - {}'.format(score[0], score[1]) for score in combined_scores]

In [8]:
def indiv_team_analysis(team_matches, team_name, role, skip=False):
    '''Predicts a score from how `team_name` and its opponents scored, whichever role the team played.

    Parameters:
        team_matches: historic matches, most recent first, as anything pd.DataFrame accepts. The columns
            home_club, away_club, home_club_goal and away_club_goal are required.
        team_name: the team under analysis, as spelled in home_club / away_club.
        role: role of the team in its upcoming match. With 'home' the predicted score is written
            team - opponent; with any other value it is written opponent - team.
        skip: if True, only every second match is analysed (the 2nd, 4th, ... most recent).

    Returns a list of prediction sentences, one per candidate score. The list is empty when there is no
    history, when either side shows no pattern in its two most recent scores, or when one of the candidate
    scores equals the score of the third most recent match.
    '''
    team_df = pd.DataFrame(team_matches)
    if team_df.empty:
        return [] #No history (e.g. {}): nothing to predict from
    team_df = team_df.astype({'home_club_goal': int, 'away_club_goal': int})

    #Two clubs at most means the table is a head-to-head history; this is decided before any filtering
    num_of_club = len(set(team_df['home_club']) | set(team_df['away_club']))

    #Creating the first part of the final string to describe the prediction depending on which set of historical matches (home, away or head-to-head)
    if num_of_club > 2:
        end_string = 'Score prediction for {} team using {} historic match scores, regardless of role played by {} team: '.format(role, role, role)
    else:
        end_string = 'Score prediction for both teams using head-to-head historic match scores, regardless of role played by both teams: '

    if skip == True:
        team_df = team_df.iloc[1::2] #Keeps the rows at odd positions

        if num_of_club > 2:
            end_string = 'Score prediction for {} team using {} historic match scores after skipping rows, regardless of role played by {} team: '.format(role, role, role)
        else:
            end_string = 'Score prediction for both teams using head-to-head historic match scores after skipping rows, regardless of role played by both teams: '

    #Rearrangement of the historic scores to put the team under analysis on one side and all opponents on
    #the other. Columns are read by name, so the order of the columns in the input does not matter.
    team_scores = []
    opponent_scores = []
    for home_club, home_goal, away_goal in zip(team_df['home_club'], team_df['home_club_goal'], team_df['away_club_goal']):
        if home_club == team_name:
            team_scores.append(home_goal)
            opponent_scores.append(away_goal)
        else:
            team_scores.append(away_goal)
            opponent_scores.append(home_goal)

    #Score of the third most recent match, oriented like the predictions, when there is such a match
    if len(team_scores) > 2:
        if role == 'home':
            third_recentscore = [team_scores[2], opponent_scores[2]]
        else:
            third_recentscore = [opponent_scores[2], team_scores[2]]
    else:
        third_recentscore = []

    def check(values):
        '''Returns the candidate goals derived from the two most recent entries of `values`.'''
        recent = values[:2]
        if len(recent) < 2: #Checks if there's enough data to check for a pattern
            return ['-']
        high, low = max(recent), min(recent)
        if high == low:
            return [recent[0]]
        if high - low == 1:
            #A score of 0 cannot go lower, so only the value above the pair is proposed
            return [high + 1] if 0 in recent else [high + 1, low - 1]
        if high - low == 2:
            return [high - 1]
        return ['-']

    scores = [check(team_scores), check(opponent_scores)]
    if ['-'] in scores:
        return []

    #Gets all the possible predictions, written home side first
    if role == 'home':
        combined_scores = [[score1, score2] for score1 in scores[0] for score2 in scores[1]]
    else:
        combined_scores = [[score1, score2] for score1 in scores[1] for score2 in scores[0]]

    #No prediction is issued when the third most recent outcome is among the candidates
    if third_recentscore in combined_scores:
        return []
    return [end_string + '{} - {}'.format(score[0], score[1]) for score in combined_scores]

In [43]:
def windrawloss_analysis(team_matches, team_name, role, skip=False):
    '''Predict a score from the win/draw/loss pattern of a team's three latest matches.

    The first rows of ``team_matches`` are treated as the most recent matches. A prediction
    is only produced when all three conditions below hold:

    1. the two latest scorelines (each ordered high-to-low) are equal, or share the same
       higher score with lower scores exactly one goal apart, and the third scoreline
       differs from the latest one;
    2. ``team_name`` played a different role (home/away) in each of the two latest matches;
    3. the two latest outcomes for ``team_name`` differ and the third outcome repeats one
       of them.

    The predicted outcome is the one (win, loss or draw) missing from those three results,
    paired with the most frequent individual scores of the two latest matches.

    Parameters
    ----------
    team_matches : data accepted by ``pd.DataFrame``
        Must provide the columns ``home_club``, ``away_club``, ``home_club_goal`` and
        ``away_club_goal``; the goal columns must be convertible to ``int``.
    team_name : str
        Team whose point of view is used for roles and outcomes.
    role : str
        Label ('home' or 'away') that is only used inside the returned description.
    skip : bool, default False
        When true, only every second row (positions 1, 3, 5, ...) is analysed.

    Returns
    -------
    list of str
        Zero or one prediction strings. An empty list is also returned when the input is
        empty, lacks the required columns, has fewer than three usable rows, or when
        ``team_name`` did not play in one of the two latest matches.
    '''
    team_df = pd.DataFrame(team_matches)
    needed_columns = ('home_club', 'away_club', 'home_club_goal', 'away_club_goal')
    if team_df.empty or any(column not in team_df.columns for column in needed_columns):
        return []

    #assign() builds a new frame, so a DataFrame passed in by the caller is left untouched
    team_df = team_df.assign(
        home_club_goal=team_df['home_club_goal'].astype(int),
        away_club_goal=team_df['away_club_goal'].astype(int),
    )

    #Creating the first part of the final string to describe the prediction depending on
    #which set of historical matches is analysed (two clubs only means head-to-head)
    num_of_club = len(set(team_df['home_club']) | set(team_df['away_club']))
    skip_note = ' after skipping rows' if skip else ''
    if num_of_club > 2:
        end_string = ('Score prediction for {} team using {} historic match scores{}, '
                      'based on win-loss-draw pattern: ').format(role, role, skip_note)
    else:
        end_string = ('Score prediction for both teams using head-to-head historic match scores{}, '
                      'based on win-loss-draw pattern: ').format(skip_note)

    if skip:
        #Positional equivalent of dropping rows 0, 2, 4, ... from a default integer index
        team_df = team_df.iloc[1::2]

    #Three matches are needed for every check below
    if len(team_df) < 3:
        return []

    home_clubs = team_df['home_club'].tolist()
    away_clubs = team_df['away_club'].tolist()
    home_goals = team_df['home_club_goal'].tolist()
    away_goals = team_df['away_club_goal'].tolist()

    #Condition 1: repeated scoreline regardless of which side scored what
    ordered_scores = [sorted(pair, reverse=True) for pair in zip(home_goals[:3], away_goals[:3])]
    latest, previous, third = ordered_scores
    repeated = (latest == previous) or (
        latest[0] == previous[0] and abs(latest[1] - previous[1]) == 1
    )
    score_pattern = repeated and third != latest

    #Condition 2: the team switched role between its two latest matches
    recent_roles = set()
    for home, away in zip(home_clubs[:2], away_clubs[:2]):
        if home == team_name:
            recent_roles.add(0)
        elif away == team_name:
            recent_roles.add(1)
        else:
            #The team did not play in this fixture, so no role pattern can be derived
            return []
    role_pattern = len(recent_roles) == 2

    #Condition 3: outcomes seen from the team's side ([team score, opponent score])
    team_view = [
        [home_goal, away_goal] if home == team_name else [away_goal, home_goal]
        for home, home_goal, away_goal in zip(home_clubs[:3], home_goals[:3], away_goals[:3])
    ]
    outcomes = []
    for own_score, other_score in team_view:
        if own_score == other_score:
            outcomes.append('draw')
        elif own_score > other_score:
            outcomes.append('win')
        else:
            outcomes.append('loss')
    outcome_pattern = outcomes[0] != outcomes[1] and outcomes[2] in outcomes[:2]

    if not (score_pattern and role_pattern and outcome_pattern):
        return []

    #Exactly one outcome is missing here because condition 3 guarantees two distinct outcomes
    prediction = [outcome for outcome in ('win', 'loss', 'draw') if outcome not in outcomes]
    individ_scores = team_view[0] + team_view[1]
    ranked_scores = Counter(individ_scores).most_common()
    if prediction[0] == 'draw':
        detail = '{}, {} - {}'.format(prediction[0], ranked_scores[0][0], ranked_scores[0][0])
    else:
        detail = '{}, {} - {}'.format('win/loss', ranked_scores[0][0], ranked_scores[1][0])
    return [end_string + detail]

In [None]:
def inner_detail_analyser(given_list):
    '''Summarise when a team scored across its historic matches.

    Parameters
    ----------
    given_list : iterable
        One entry per team. Each entry is either a JSON string or an already decoded dict
        that maps a match key to ``{'team': {'goal': [...]}, 'opponent': {'goal': [...]}}``,
        where the goal lists hold the minute of every goal. Matches stored as ``{}`` are
        ignored.

    Returns
    -------
    pandas.DataFrame
        One row per entry, with these columns:
        * ``first_ten_minutes``, ``last_ten_minutes``, ``first_half``, ``second_half``,
          ``injury_time``: total number of the team's goals in that period;
        * ``avg_<period>``: average number of such goals per match that had data
          (0.0 when no match had data);
        * ``match_firstgoal``, ``match_lastgoal``, ``underdog_effect``: number of matches
          in which the team scored first, scored last, or conceded first but still scored
          more goals than the opponent. These are only evaluated for matches in which
          both sides scored.
    '''
    period_keys = ['first_ten_minutes', 'last_ten_minutes', 'first_half', 'second_half',
                   'injury_time']
    average_keys = ['avg_' + period for period in period_keys]

    list_of_analysis = []
    for item in given_list:
        #Rows coming straight from a dataframe may already hold decoded dicts
        item_dict = item if isinstance(item, dict) else json.loads(item)
        inner_detail = {'first_ten_minutes':[], 'last_ten_minutes':[], 'first_half':[],
                        'second_half':[], 'injury_time':[], 'avg_first_ten_minutes':[],
                        'avg_last_ten_minutes':[], 'avg_first_half':[], 'avg_second_half':[],
                        'avg_injury_time':[], 'match_firstgoal':[], 'match_lastgoal':[],
                        'underdog_effect':[]
                       }

        for match in item_dict.values():
            if match == {}:
                continue
            team_goal = [int(minute) for minute in match['team']['goal']]
            opponent_goal = [int(minute) for minute in match['opponent']['goal']]

            #Checks which of the goals scored at certain times during the match fit which category
            match_periods = {period: [] for period in period_keys}
            for minute in team_goal:
                if minute < 10:
                    match_periods['first_ten_minutes'].append(minute)
                #Minutes 46-49 are treated as first-half stoppage time (see the injury-time
                #window below), so minute 49 belongs to the first half; it used to be
                #counted in neither half.
                if minute <= 49:
                    match_periods['first_half'].append(minute)
                else:
                    match_periods['second_half'].append(minute)
                if minute > 80:
                    match_periods['last_ten_minutes'].append(minute)
                if (45 < minute < 50) or (minute > 90):
                    match_periods['injury_time'].append(minute)

            #Collates the number of these phenomena per match to later calculate the average
            for period, minutes in match_periods.items():
                inner_detail[period].extend(minutes)
                inner_detail['avg_' + period].append(len(minutes))

            #Logical test on both lists: the previous bitwise `&` on the lengths was false
            #whenever the opponent had an even number of goals.
            if team_goal and opponent_goal:
                #Checks if the team scored the first goal
                if min(team_goal) < min(opponent_goal):
                    inner_detail['match_firstgoal'].append('1')

                #Checks if the team scored the last goal
                if max(team_goal) > max(opponent_goal):
                    inner_detail['match_lastgoal'].append('1')

                #Checks if the team conceded the first goal but still won the match
                if (min(team_goal) > min(opponent_goal)) and (len(team_goal) > len(opponent_goal)):
                    inner_detail['underdog_effect'].append('1')

        #Iterating inner_detail keeps the original column order of the result
        inner_detailanalysis = {}
        for key, values in inner_detail.items():
            if key in average_keys:
                #No match with data means there is nothing to average
                inner_detailanalysis[key] = sum(values) / len(values) if values else 0.0
            else:
                inner_detailanalysis[key] = len(values)

        list_of_analysis.append(inner_detailanalysis)
    list_of_analysis = pd.DataFrame(list_of_analysis)
    return list_of_analysis

In [11]:
def final_innerdet(innerdetail_df):
    '''Name the leading side for every inner-detail category.

    The first row of ``innerdetail_df`` is labelled as the home team and the second row as
    the away team. For each column the result holds a one-element list: ``'both teams'``
    when both rows equal the column maximum, ``'home team'`` when only the first row does,
    and ``'away team'`` otherwise.
    '''
    dominant = {}
    for column in innerdetail_df.columns:
        values = list(innerdetail_df[column])
        best = max(values)
        home_leads = best == values[0]
        away_leads = best == values[1]
        if home_leads and away_leads:
            label = 'both teams'
        elif home_leads:
            label = 'home team'
        else:
            label = 'away team'
        dominant[column] = [label]

    return dominant

In [12]:
def matchscore_total_analysis(dataset):
    '''Run every score-pattern analysis on each row of ``dataset``.

    Columns are read by position: 1 and 2 supply the team passed to the analysers for the
    home and away histories, 6 is the league, 7, 8 and 9 hold the home, away and
    head-to-head match histories, and the last two columns hold the inner match details
    handed to ``inner_detail_analyser``.

    Every analysis is best-effort: if one raises, it simply contributes no patterns, and a
    failing inner-detail analysis is stored as ``{}``.

    Returns
    -------
    dict
        Keys ``home_score_patterns``, ``away_score_patterns``, ``h2h_score_patterns`` and
        ``innerdetail_analysis``; each maps to a list with one JSON string per dataset row.
    '''
    #history column position -> (team-name column position, role label, output key)
    diction = {7: (1, 'home', 'home_score_patterns'),
               8: (2, 'away', 'away_score_patterns'),
               9: (1, 'home', 'h2h_score_patterns')}
    dict_of_patterns = {'home_score_patterns':[], 'away_score_patterns':[], 'h2h_score_patterns':[],
                        'innerdetail_analysis':[]}

    def collect(analysis):
        '''Return the patterns produced by ``analysis()``, or [] if it fails.'''
        try:
            return list(analysis())
        except Exception:
            #Deliberate best-effort; unlike a bare except this lets Ctrl-C through
            return []

    for row_pos in range(dataset.shape[0]):
        row = list(dataset.iloc[row_pos, :])
        for number, (team_pos, role, output_key) in diction.items():
            #All arguments are looked up inside the lambdas so that a lookup failure is
            #handled by collect() exactly like a failing analysis. The order of this list
            #fixes the numbering of the patterns in the output.
            analyses = [
                lambda: indiv_teamrole_analysis(row[number], row[team_pos], role),
                lambda: indiv_teamrole_analysis(row[number], row[team_pos], role, skip=True),
                lambda: indiv_teamrole_analysis(row[number], row[team_pos], role,
                                                by_league=[row[6], leagues_abbrv[row[6]]]),
                lambda: indiv_role_analysis(row[number], role),
                lambda: indiv_role_analysis(row[number], role, skip=True),
                lambda: indiv_team_analysis(row[number], row[team_pos], role),
                lambda: indiv_team_analysis(row[number], row[team_pos], role, skip=True),
                lambda: windrawloss_analysis(row[number], row[team_pos], role),
                lambda: windrawloss_analysis(row[number], row[team_pos], role, skip=True),
            ]

            list_of_pattern = []
            for analysis in analyses:
                list_of_pattern += collect(analysis)

            #Adds all the predictions for a given team history to one dict keyed by position
            dict_of_pattern = {str(pos): pattern for pos, pattern in enumerate(list_of_pattern)}
            dict_of_patterns[output_key].append(json.dumps(dict_of_pattern))

        #Analyses the details of the historic matches of the teams to extract relevant information for client.
        try:
            innerdetail_df = inner_detail_analyser(row[-2:])
            innerdetail = final_innerdet(innerdetail_df)
            dict_of_patterns['innerdetail_analysis'].append(json.dumps(innerdetail))
        except Exception:
            dict_of_patterns['innerdetail_analysis'].append(json.dumps({}))
    return dict_of_patterns

In [13]:
def filter_pred(dataset):
    '''Keep only the rows whose score predictions are interesting.

    The three columns just before the last one are read as JSON objects mapping a key to a
    prediction string that ends in ``"<score> - <score>"``. Over all predictions of a row:

    * a prediction is "high scoring" when the two scores sum to more than 2;
    * a prediction has "both sides scoring twice" when both scores are greater than 1.

    A row is kept when the share of either kind exceeds 0.70 (more than three predictions)
    or 0.60 (one to three predictions). Rows without predictions are discarded.

    Parameters
    ----------
    dataset : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A deep copy of the retained rows; ``dataset`` itself is not modified.

    Raises
    ------
    ValueError
        If a prediction string does not end in a ``"<score> - <score>"`` pair.
    '''
    #Anchored at the end of the text so that multi-digit scores such as "10 - 0" are read
    #completely (a fixed five-character slice would see "0 - 0")
    score_pattern = re.compile(r'(\d+)\s*-\s*(\d+)\s*$')

    keep_positions = []
    for position in range(dataset.shape[0]):
        row = list(dataset.iloc[position, :])
        total = 0
        high_scoring = 0
        both_scored_twice = 0
        for raw_patterns in row[-4:-1]:
            for text in json.loads(raw_patterns).values():
                found = score_pattern.search(text)
                if found is None:
                    raise ValueError('No trailing score found in prediction: {!r}'.format(text))
                score = [int(found.group(1)), int(found.group(2))]
                if sum(score) > 2: #Checks the first condition to see if the patterns are interesting
                    high_scoring += 1
                if score[0] > 1 and score[1] > 1: #checks the second condition for interesting patterns only
                    both_scored_twice += 1
                total += 1

        #If enough interesting patterns are observed, the row is retained, else row is discarded.
        if total == 0:
            continue
        threshold = 0.70 if total > 3 else 0.60
        if (high_scoring / total) > threshold or (both_scored_twice / total) > threshold:
            keep_positions.append(position)

    #Positional selection also behaves correctly when index labels are duplicated
    return dataset.iloc[keep_positions].copy(deep=True)

---

In [892]:
#Run every pattern analysis on the upcoming matches and attach each resulting list as a
#new column on a deep copy, leaving match_df itself unchanged.
additional_columns = matchscore_total_analysis(match_df)

modified_dataset = match_df.copy(deep=True).assign(**additional_columns)

In [893]:
#Display the upcoming matches together with the newly attached pattern columns.
modified_dataset

Unnamed: 0,date,hometeam,awayteam,match_urls,home_urls,away_urls,league,home_team_matches,away_team_matches,head2head_matches,home_team_matchespattern,away_team_matchespattern,home_score_patterns,away_score_patterns,h2h_score_patterns,innerdetail_analysis
0,2023-09-23 15:00:00,Crystal Palace,Fulham,https://www.flashscore.nl/wedstrijd/0xCBRsPc/#...,https://www.flashscore.nl/wedstrijd/0xCBRsPc/#...,https://www.flashscore.nl/wedstrijd/0xCBRsPc/#...,English Premier League,{},{},{},"{'1': {'team': {'var': [], 'date': ['03.09.202...","{'1': {'team': {'var': [], 'date': ['02.09.202...",{},{},{},"{""first_ten_minutes"": [""both teams""], ""last_te..."
1,2023-09-23 15:00:00,Luton Town FC,Wolverhampton Wanderers,https://www.flashscore.nl/wedstrijd/Mi0KPLgA/#...,https://www.flashscore.nl/wedstrijd/Mi0KPLgA/#...,https://www.flashscore.nl/wedstrijd/Mi0KPLgA/#...,English Premier League,"{'date': ['16.09.23', '01.09.23', '29.08.23', ...","{'date': ['16.09.23', '03.09.23', '29.08.23', ...","{'date': ['02.08.23', '05.01.13', '03.03.07', ...","{'1': {'team': {'var': [], 'date': ['01.09.202...","{'1': {'team': {'var': [], 'date': ['03.09.202...","{""0"": ""Score prediction for home role using ho...","{""0"": ""Score prediction for away role using aw...","{""0"": ""Score prediction for both roles using h...","{""first_ten_minutes"": [""home team""], ""last_ten..."
2,2023-09-23 15:00:00,Manchester City,Nottingham Forest,https://www.flashscore.nl/wedstrijd/GSENOu9G/#...,https://www.flashscore.nl/wedstrijd/GSENOu9G/#...,https://www.flashscore.nl/wedstrijd/GSENOu9G/#...,English Premier League,"{'date': ['16.09.23', '02.09.23', '27.08.23', ...","{'date': ['02.09.23', '30.08.23', '26.08.23', ...","{'date': ['18.02.23', '31.08.22', '03.01.09', ...","{'1': {'team': {'var': [], 'date': ['02.09.202...","{'1': {'team': {'var': [], 'date': ['02.09.202...","{""0"": ""Score prediction for home team using ho...","{""0"": ""Score prediction for away role using aw...",{},"{""first_ten_minutes"": [""home team""], ""last_ten..."
3,2023-09-23 17:30:00,Brentford,Everton,https://www.flashscore.nl/wedstrijd/Eowg909T/#...,https://www.flashscore.nl/wedstrijd/Eowg909T/#...,https://www.flashscore.nl/wedstrijd/Eowg909T/#...,English Premier League,"{'date': ['16.09.23', '02.09.23', '29.08.23', ...","{'date': ['17.09.23', '02.09.23', '30.08.23', ...","{'date': ['11.03.23', '27.08.22', '15.05.22', ...","{'1': {'team': {'var': [], 'date': ['02.09.202...","{'1': {'team': {'var': [], 'date': ['02.09.202...","{""0"": ""Score prediction for home team using ho...","{""0"": ""Score prediction for away team using aw...","{""0"": ""Score prediction for both teams using h...","{""first_ten_minutes"": [""home team""], ""last_ten..."
4,2023-09-23 20:00:00,Burnley,Manchester United,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,English Premier League,"{'date': ['02.09.23', '30.08.23', '27.08.23', ...","{'date': ['16.09.23', '03.09.23', '26.08.23', ...","{'date': ['21.12.22', '08.02.22', '30.12.21', ...","{'1': {'team': {'var': [], 'date': ['02.09.202...","{'1': {'team': {'var': ['88'], 'date': ['03.09...","{""0"": ""Score prediction for home team using ho...","{""0"": ""Score prediction for away team using aw...","{""0"": ""Score prediction for both teams using h...","{""first_ten_minutes"": [""home team""], ""last_ten..."


In [894]:
#Keep only the matches whose predictions pass the checks implemented in filter_pred.
filtered_dataset = filter_pred(modified_dataset)

In [895]:
#Display the rows that were retained by filter_pred.
filtered_dataset

Unnamed: 0,date,hometeam,awayteam,match_urls,home_urls,away_urls,league,home_team_matches,away_team_matches,head2head_matches,home_team_matchespattern,away_team_matchespattern,home_score_patterns,away_score_patterns,h2h_score_patterns,innerdetail_analysis
4,2023-09-23 20:00:00,Burnley,Manchester United,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,English Premier League,"{'date': ['02.09.23', '30.08.23', '27.08.23', ...","{'date': ['16.09.23', '03.09.23', '26.08.23', ...","{'date': ['21.12.22', '08.02.22', '30.12.21', ...","{'1': {'team': {'var': [], 'date': ['02.09.202...","{'1': {'team': {'var': ['88'], 'date': ['03.09...","{""0"": ""Score prediction for home team using ho...","{""0"": ""Score prediction for away team using aw...","{""0"": ""Score prediction for both teams using h...","{""first_ten_minutes"": [""home team""], ""last_ten..."


In [898]:
#Serialise the dict-valued columns to JSON text before they are loaded into the database.
#Values that are already strings are left alone, so re-running this cell does not
#double-encode them.
col_list = ['home_team_matches','away_team_matches','head2head_matches','home_team_matchespattern','away_team_matchespattern']
for column in col_list:
    filtered_dataset[column] = filtered_dataset[column].apply(
        lambda value: value if isinstance(value, str) else json.dumps(value)
    )
filtered_dataset

Unnamed: 0,date,hometeam,awayteam,match_urls,home_urls,away_urls,league,home_team_matches,away_team_matches,head2head_matches,home_team_matchespattern,away_team_matchespattern,home_score_patterns,away_score_patterns,h2h_score_patterns,innerdetail_analysis
4,2023-09-23 20:00:00,Burnley,Manchester United,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,https://www.flashscore.nl/wedstrijd/YVJP3vXj/#...,English Premier League,"{""date"": [""02.09.23"", ""30.08.23"", ""27.08.23"", ...","{""date"": [""16.09.23"", ""03.09.23"", ""26.08.23"", ...","{""date"": [""21.12.22"", ""08.02.22"", ""30.12.21"", ...","{""1"": {""team"": {""var"": [], ""date"": [""02.09.202...","{""1"": {""team"": {""var"": [""88""], ""date"": [""03.09...","{""0"": ""Score prediction for home team using ho...","{""0"": ""Score prediction for away team using aw...","{""0"": ""Score prediction for both teams using h...","{""first_ten_minutes"": [""home team""], ""last_ten..."


In [899]:
#Extracting the data from the dataframe to load into the database multiple rows at a time
lim = filtered_dataset.shape[0]

#One plain tuple per row, in column order and without the index. Handing pandas Series to
#the database driver relies on positional Series[int] lookup, which pandas has deprecated
#for non-integer indexes; tuples are plain sequences.
match_pred = list(filtered_dataset.itertuples(index=False, name=None))

In [901]:
#PostgreSQL database connection parameters
#The values are read from the standard PostgreSQL environment variables so that no secret
#is stored in the notebook. Set PGHOST, PGDATABASE, PGUSER and PGPASSWORD (and optionally
#PGPORT) before running this cell.
#NOTE(review): the credentials that used to be written in this cell should be rotated.
connection_params = {
    "host": os.environ["PGHOST"],
    "port": os.environ.get("PGPORT", "5432"),
    "database": os.environ["PGDATABASE"],
    "user": os.environ["PGUSER"],
    "password": os.environ["PGPASSWORD"]
}

#Connect to PostgreSQL
connection = psycopg2.connect(**connection_params)
cursor = connection.cursor()

#The table is dropped and rebuilt on every run of this cell, so each run replaces the
#previously stored predictions.
table_name = "match_prediction"
drop_query = f"DROP TABLE IF EXISTS {table_name}"

#Create the table in the database
create_query = '''CREATE TABLE IF NOT EXISTS match_prediction (
    date VARCHAR,
    hometeam VARCHAR,
    awayteam VARCHAR,
    match_urls VARCHAR,
    home_urls VARCHAR,
    away_urls VARCHAR,
    league VARCHAR,
    home_team_matches JSONB,
    away_team_matches JSONB,
    head2head_matches JSONB,
    home_team_matchespattern JSONB,
    away_team_matchespattern JSONB,
    home_score_patterns JSONB,
    away_score_patterns JSONB,
    h2h_score_patterns JSONB,
    innerdetail_analysis JSONB
);'''

#Insert all the data into the table multiple rows at a time
insert_query = "INSERT INTO match_prediction (date, hometeam, awayteam, match_urls, home_urls, away_urls, league, home_team_matches, away_team_matches, head2head_matches, home_team_matchespattern, away_team_matchespattern, home_score_patterns, away_score_patterns, h2h_score_patterns, innerdetail_analysis) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"

try:
    cursor.execute(drop_query)
    cursor.execute(create_query)
    cursor.executemany(insert_query, match_pred)
except Exception:
    #Undo the partial work and release the connection instead of leaking it; the commit
    #cell must not be run after a failure here.
    connection.rollback()
    cursor.close()
    connection.close()
    raise

In [902]:
#Commit and close connection
#The cursor and connection are released even when the commit itself fails.
try:
    connection.commit()
finally:
    cursor.close()
    connection.close()

---

### Referee History Analysis

In [493]:
def ref_hist_analysis(team_matches, skip=False):
    '''Predict scores from the pattern in a referee's most recent match results.

    The first rows of ``team_matches`` are treated as the most recent matches. The home
    goals and the away goals of the two latest matches are examined independently:

    * equal values predict that value again;
    * values one apart predict one more than the higher value and, unless one of the two
      values is 0, also one less than the lower value;
    * values two apart predict the value in between;
    * anything else yields no prediction.

    Every combination of a predicted home and away score becomes a candidate. If the third
    most recent scoreline is among the candidates, no prediction is returned at all.

    Parameters
    ----------
    team_matches : data accepted by ``pd.DataFrame``
        Must provide ``home_club_goal`` and ``away_club_goal`` columns convertible to ``int``.
    skip : bool, default False
        When true, only every second row (positions 1, 3, 5, ...) is analysed.

    Returns
    -------
    list of str
        One description per predicted scoreline. Empty when the input is empty, lacks the
        goal columns, has fewer than two usable rows, or shows no pattern.
    '''
    team_df = pd.DataFrame(team_matches)
    if team_df.empty or 'home_club_goal' not in team_df.columns or 'away_club_goal' not in team_df.columns:
        return []

    #Working on plain lists leaves a DataFrame passed in by the caller untouched
    home_goals = team_df['home_club_goal'].astype(int).tolist()
    away_goals = team_df['away_club_goal'].astype(int).tolist()

    end_string = 'Score prediction based on ref history: '

    if skip:
        #Positional equivalent of dropping rows 0, 2, 4, ... from a default integer index
        home_goals = home_goals[1::2]
        away_goals = away_goals[1::2]
        end_string = 'Score prediction based on ref history after skipping rows: '

    #Two matches are enough to look for a pattern; without a third match there is simply
    #no scoreline to compare the candidates against.
    if len(home_goals) >= 3:
        third_recentscore = [home_goals[2], away_goals[2]]
    else:
        third_recentscore = None

    def check(goals):
        '''Return the predicted values for one side, or ['-'] when there is no pattern.'''
        recent = goals[:2]
        if len(recent) < 2: #Checks if there's enough data to check for a pattern
            return ['-']
        high, low = max(recent), min(recent)
        if recent[0] == recent[1]:
            return [recent[0]]
        if high - low == 1:
            if 0 in recent:
                return [high + 1]
            return [high + 1, low - 1]
        if high - low == 2:
            return [high - 1]
        return ['-']

    scores = [check(home_goals), check(away_goals)]

    final_output = []
    if ['-'] in scores:
        return final_output

    #Gets all the possible predictions from the observed patterns
    combined_scores = [[score1, score2] for score1 in scores[0] for score2 in scores[1]]

    #When the third most recent scoreline is one of the candidates, nothing is predicted
    if third_recentscore in combined_scores:
        return final_output

    for score in combined_scores:
        final_output.append(end_string + '{} - {}'.format(score[0], score[1]))
    return final_output

In [904]:
def ref_total_analysis(dataset):
    '''Extract the referee-history score patterns for every row of ``dataset``.

    The value at column position 9 of each row is turned into a DataFrame and must provide
    a ``Date`` column (``dd/mm/YYYY``) and a ``Score`` column (``"<home>:<away>"``). Rows
    whose score carries a ``pso`` or ``aet`` marker are removed, the remaining matches are
    sorted newest first and handed to ``ref_hist_analysis`` twice (without and with row
    skipping). The analysis itself is best-effort: if it raises, it contributes no patterns.

    Returns
    -------
    dict
        ``{'ref_patterns': [...]}`` with one JSON string per dataset row, each mapping a
        running number to a prediction string.
    '''
    dict_of_patterns = {'ref_patterns':[]}

    for row_pos in range(dataset.shape[0]):
        ref_row = list(dataset.iloc[row_pos, :])

        ref_hist = pd.DataFrame(ref_row[9])
        ref_hist['Date'] = pd.to_datetime(ref_hist['Date'], format='%d/%m/%Y')
        #`n` must be passed by keyword: pandas 2.x no longer accepts it positionally
        ref_hist[['home_club_goal', 'away_club_goal']] = ref_hist['Score'].str.split(':', n=1, expand=True)
        #Discard results that carry a 'pso' or 'aet' marker in either goal column
        for marker in ('pso', 'aet'):
            for goal_column in ('away_club_goal', 'home_club_goal'):
                ref_hist = ref_hist[~ref_hist[goal_column].str.contains(marker)]
        #Newest match first; reset_index() keeps the old labels in an 'index' column
        ref_hist = ref_hist.sort_values(by='Date', ascending=False).reset_index()

        list_of_pattern = []
        for use_skip in (False, True):
            try:
                list_of_pattern += list(ref_hist_analysis(ref_hist, skip=use_skip))
            except Exception:
                #Deliberate best-effort; unlike a bare except this lets Ctrl-C through
                continue

        dict_of_pattern = {str(pos): pattern for pos, pattern in enumerate(list_of_pattern)}
        dict_of_patterns['ref_patterns'].append(json.dumps(dict_of_pattern))
    return dict_of_patterns

In [497]:
#Run the referee-history analysis and attach each resulting list as a new column on a
#deep copy, leaving ref_df itself unchanged.
ref_additional_columns = ref_total_analysis(ref_df)

ref_modified_dataset = ref_df.copy(deep=True).assign(**ref_additional_columns)

In [504]:
#Preview the first three referee rows together with the new ref_patterns column.
ref_modified_dataset.head(3)

Unnamed: 0,date,time,hometeam,awayteam,result,matchlink,league,refereelink,referee_matchistlink,referee_matchhistdetails,ref_patterns
0,2023-09-02 00:00:00,12:30,Sheffield United,Everton FC,2:2 (2:1),https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['10/06/2022', '12/07/2022', '15/09/2...",{'0': 'Score prediction based on ref history: ...
1,2023-09-02 00:00:00,15:00,Brentford FC,AFC Bournemouth,2:2 (1:1),https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['29/10/2022', '30/07/2022', '06/08/2...",{'0': 'Score prediction based on ref history a...
2,2023-09-02 00:00:00,15:00,Burnley FC,Tottenham Hotspur,2:5 (1:2),https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['21/09/2022', '24/09/2022', '27/10/2...",{'0': 'Score prediction based on ref history a...


In [503]:
#Inspect the patterns stored for the first row of the referee dataset.
list(ref_modified_dataset['ref_patterns'])[0]

{'0': 'Score prediction based on ref history: 1 - 2',
 '1': 'Score prediction based on ref history after skipping rows: 2 - 1'}

In [None]:
#Serialise the dict-valued columns to JSON text before they are loaded into the database.
#Values that are already strings are left alone, so re-running this cell does not
#double-encode them.
col_list = ['referee_matchistlink','referee_matchhistdetails']
for column in col_list:
    ref_modified_dataset[column] = ref_modified_dataset[column].apply(
        lambda value: value if isinstance(value, str) else json.dumps(value)
    )
ref_modified_dataset

In [921]:
#Extracting the data from the dataframe to load into the database multiple rows at a time
lim = ref_modified_dataset.shape[0]

#One plain tuple per row, in column order and without the index. Handing pandas Series to
#the database driver relies on positional Series[int] lookup, which pandas has deprecated
#for non-integer indexes; tuples are plain sequences.
ref_data = list(ref_modified_dataset.itertuples(index=False, name=None))

In [302]:
#PostgreSQL database connection parameters
#The values are read from the standard PostgreSQL environment variables so that no secret
#is stored in the notebook. Set PGHOST, PGDATABASE, PGUSER and PGPASSWORD (and optionally
#PGPORT) before running this cell.
#NOTE(review): the credentials that used to be written in this cell should be rotated.
connection_params = {
    "host": os.environ["PGHOST"],
    "port": os.environ.get("PGPORT", "5432"),
    "database": os.environ["PGDATABASE"],
    "user": os.environ["PGUSER"],
    "password": os.environ["PGPASSWORD"]
}

#Connect to PostgreSQL
connection = psycopg2.connect(**connection_params)
cursor = connection.cursor()

#The table is dropped and rebuilt on every run of this cell, so each run replaces the
#previously stored referee predictions.
table_name = "ref_match_pred"
drop_query = f"DROP TABLE IF EXISTS {table_name}"

#Create the table in the database
create_query = '''CREATE TABLE IF NOT EXISTS ref_match_pred (
    date VARCHAR,
    time VARCHAR,
    hometeam VARCHAR,
    awayteam VARCHAR,
    result VARCHAR,
    matchlink VARCHAR,
    league VARCHAR,
    refereelink VARCHAR,
    referee_matchistlink JSONB,
    referee_matchhistdetails JSONB,
    ref_patterns JSONB
    
);'''

#Insert all the data into the table multiple rows at a time
insert_query = "INSERT INTO ref_match_pred (date, time, hometeam, awayteam, result, matchlink, league, refereelink, referee_matchistlink, referee_matchhistdetails, ref_patterns) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"

try:
    cursor.execute(drop_query)
    cursor.execute(create_query)
    cursor.executemany(insert_query, ref_data)
except Exception:
    #Undo the partial work and release the connection instead of leaking it; the commit
    #cell must not be run after a failure here.
    connection.rollback()
    cursor.close()
    connection.close()
    raise

In [304]:
#Commit and close connection.
#The cursor and connection are released even when the commit itself fails, so no session is left open.
try:
    connection.commit()
finally:
    cursor.close()
    connection.close()

---

### Linking Historic match dataset to Referee dataset

In [508]:
match_df.head(5)

Unnamed: 0,date,hometeam,awayteam,match_urls,home_urls,away_urls,league,home_team_matches,away_team_matches,head2head_matches,home_team_matchespattern,away_team_matchespattern
0,2023-09-02 15:00:00,Brentford,AFC Bournemouth,https://www.flashscore.nl/wedstrijd/hh2ZdWJ6/#...,https://www.flashscore.nl/wedstrijd/hh2ZdWJ6/#...,https://www.flashscore.nl/wedstrijd/hh2ZdWJ6/#...,English Premier League,"{'date': ['29.08.23', '26.08.23', '19.08.23', ...","{'date': ['29.08.23', '26.08.23', '19.08.23', ...","{'date': ['14.01.23', '01.10.22', '22.05.21', ...","{'1': {'var': [], 'date': ['26.08.2023 15:00']...","{'1': {'var': [], 'date': ['29.08.2023 19:30']..."
1,2023-09-02 15:00:00,Burnley,Tottenham Hotspur,https://www.flashscore.nl/wedstrijd/E7jreAlJ/#...,https://www.flashscore.nl/wedstrijd/E7jreAlJ/#...,https://www.flashscore.nl/wedstrijd/E7jreAlJ/#...,English Premier League,"{'date': ['30.08.23', '27.08.23', '11.08.23', ...","{'date': ['29.08.23', '26.08.23', '19.08.23', ...","{'date': ['15.05.22', '23.02.22', '27.10.21', ...","{'1': {'var': [], 'date': ['27.08.2023 14:00']...","{'1': {'var': [], 'date': ['29.08.2023 19:45']..."
2,2023-09-02 15:00:00,Chelsea,Nottingham Forest,https://www.flashscore.nl/wedstrijd/0d8k37tt/#...,https://www.flashscore.nl/wedstrijd/0d8k37tt/#...,https://www.flashscore.nl/wedstrijd/0d8k37tt/#...,English Premier League,"{'date': ['30.08.23', '25.08.23', '20.08.23', ...","{'date': ['30.08.23', '26.08.23', '18.08.23', ...","{'date': ['13.05.23', '01.01.23', '05.01.20', ...","{'1': {'var': [], 'date': ['30.08.2023 19:45']...","{'1': {'var': [], 'date': ['26.08.2023 15:00']..."
3,2023-09-02 15:00:00,Manchester City,Fulham,https://www.flashscore.nl/wedstrijd/Uy06aPu5/#...,https://www.flashscore.nl/wedstrijd/Uy06aPu5/#...,https://www.flashscore.nl/wedstrijd/Uy06aPu5/#...,English Premier League,"{'date': ['27.08.23', '19.08.23', '16.08.23', ...","{'date': ['29.08.23', '26.08.23', '19.08.23', ...","{'date': ['30.04.23', '05.11.22', '05.02.22', ...","{'1': {'var': [], 'date': ['19.08.2023 20:00']...","{'1': {}, '2': {'var': [], 'date': ['12.08.202..."
4,2023-09-02 17:30:00,Brighton,Newcastle United,https://www.flashscore.nl/wedstrijd/zHivdjZC/#...,https://www.flashscore.nl/wedstrijd/zHivdjZC/#...,https://www.flashscore.nl/wedstrijd/zHivdjZC/#...,English Premier League,"{'date': ['26.08.23', '19.08.23', '12.08.23', ...","{'date': ['27.08.23', '19.08.23', '12.08.23', ...","{'date': ['29.07.23', '18.05.23', '13.08.22', ...","{'1': {'var': [], 'date': ['26.08.2023 17:30']...","{'1': {'var': [], 'date': ['19.08.2023 20:00']..."


In [509]:
ref_df.tail(5)

Unnamed: 0,date,time,hometeam,awayteam,result,matchlink,league,refereelink,referee_matchistlink,referee_matchhistdetails
1,2023-09-02 00:00:00,15:00,Brentford FC,AFC Bournemouth,2:2 (1:1),https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['29/10/2022', '30/07/2022', '06/08/2..."
2,2023-09-02 00:00:00,15:00,Burnley FC,Tottenham Hotspur,2:5 (1:2),https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['21/09/2022', '24/09/2022', '27/10/2..."
3,2023-09-02 00:00:00,15:00,Chelsea FC,Nottingham Forest,0:1 (0:0),https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['04/03/2023', '01/04/2023', '30/07/2..."
4,2023-09-02 00:00:00,15:00,Manchester City,Fulham FC,5:1 (2:1),https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['02/06/2022', '13/09/2022', '11/10/2..."
5,2023-09-02 00:00:00,17:30,Brighton & Hove Albion,Newcastle United,0:0,https://www.worldfootball.net/report/premier-l...,English Premier League,https://www.worldfootball.net/referee_summary/...,{'1': ['https://www.worldfootball.net/referee_...,"{'Date': ['14/07/2022', '13/08/2022', '21/08/2..."


In [511]:
#Pair every fixture of the team dataset (match_df) with its counterpart in the referee dataset (ref_df).
#The two sources spell club names differently (e.g. "Brentford" vs "Brentford FC"), so a fixture is considered
#the same when each match_df team name is contained in the corresponding ref_df team name.
#For now the pairs are only printed for inspection. The intended next step is to build a list with blanks where
#there is no match and the referee analysis where there is one, then attach it to the team analysis dataset.
ref_fixtures = list(zip(ref_df['hometeam'], ref_df['awayteam']))
for match_home, match_away in zip(match_df['hometeam'], match_df['awayteam']):
    for ref_home, ref_away in ref_fixtures:
        if match_home in ref_home and match_away in ref_away:
            print('match_df: ', match_home, match_away)
            print('ref_df:   ', ref_home, ref_away)

match_df:  Brentford AFC Bournemouth
ref_df:    Brentford FC AFC Bournemouth
match_df:  Burnley Tottenham Hotspur
ref_df:    Burnley FC Tottenham Hotspur
match_df:  Chelsea Nottingham Forest
ref_df:    Chelsea FC Nottingham Forest
match_df:  Manchester City Fulham
ref_df:    Manchester City Fulham FC
match_df:  Brighton Newcastle United
ref_df:    Brighton & Hove Albion Newcastle United


---

In [2]:
def teamdata_extract(table):
    '''Load every row of a PostgreSQL table into a pandas DataFrame for analysis.

    Connection settings are read from the standard libpq environment variables
    (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD) so that no credential is stored in the notebook.

    Parameters
    ----------
    table : str
        Name of the table to read. It is interpolated directly into the SQL text,
        so it must come from trusted code, never from user input.

    Returns
    -------
    pandas.DataFrame
        One row per table row; columns are named after the columns reported by the cursor.

    Raises
    ------
    RuntimeError
        If any required connection environment variable is missing or empty.
    '''

    #Fail early with a clear message instead of attempting a connection with incomplete settings
    required = ("PGHOST", "PGDATABASE", "PGUSER", "PGPASSWORD")
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        raise RuntimeError("Missing database environment variable(s): " + ", ".join(missing))

    #PostgreSQL database connection parameters
    connection_params = {
        "host": os.environ["PGHOST"],
        "port": os.environ.get("PGPORT", "5432"),
        "database": os.environ["PGDATABASE"],
        "user": os.environ["PGUSER"],
        "password": os.environ["PGPASSWORD"]
    }

    #Connect to PostgreSQL; the finally blocks release the cursor and connection even if the query fails
    connection = psycopg2.connect(**connection_params)
    try:
        cursor = connection.cursor()
        try:
            #Read the whole table
            select_query = f"SELECT * FROM {table}"
            cursor.execute(select_query)

            #Fetch all rows together with the column names reported by the cursor
            rows = cursor.fetchall()
            column_names = [desc[0] for desc in cursor.description]
        finally:
            cursor.close()
    finally:
        #Nothing was modified, so no commit is needed before closing
        connection.close()

    #Converting the data extracted to a DataFrame for analysis
    return pd.DataFrame(rows, columns=column_names)

In [4]:
match_pred = teamdata_extract('match_prediction')
match_pred.head(3)

Unnamed: 0,date,hometeam,awayteam,match_urls,home_urls,away_urls,league,home_team_matches,away_team_matches,head2head_matches,home_team_matchespattern,away_team_matchespattern,home_score_patterns,away_score_patterns,h2h_score_patterns,innerdetail_analysis
0,2023-09-30 11:30:00,Aston Villa,Brighton,https://www.flashscore.nl/wedstrijd/E3fbIwnp/#...,https://www.flashscore.nl/wedstrijd/E3fbIwnp/#...,https://www.flashscore.nl/wedstrijd/E3fbIwnp/#...,English Premier League,"{'date': ['27.09.23', '24.09.23', '21.09.23', ...","{'date': ['27.09.23', '24.09.23', '21.09.23', ...","{'date': ['28.05.23', '08.12.22', '13.11.22', ...","{'0': {'team': {'var': [], 'date': ['27.09.202...","{'0': {'team': {'var': [], 'date': ['27.09.202...",{'0': 'Score prediction for home team using ho...,{'0': 'Score prediction for away role using aw...,{'0': 'Score prediction for both team using he...,"{'first_half': ['home team'], 'injury_time': [..."
1,2023-09-30 14:00:00,AFC Bournemouth,Arsenal,https://www.flashscore.nl/wedstrijd/xr3WMJwT/#...,https://www.flashscore.nl/wedstrijd/xr3WMJwT/#...,https://www.flashscore.nl/wedstrijd/xr3WMJwT/#...,English Premier League,"{'date': ['27.09.23', '24.09.23', '17.09.23', ...","{'date': ['27.09.23', '24.09.23', '20.09.23', ...","{'date': ['04.03.23', '20.08.22', '27.01.20', ...","{'0': {'team': {'var': [], 'date': ['27.09.202...","{'0': {'team': {'var': [], 'date': ['27.09.202...",{'0': 'Score prediction for home team using ho...,{'0': 'Score prediction for away role using aw...,{'0': 'Score prediction for both teams using h...,"{'first_half': ['away team'], 'injury_time': [..."
2,2023-09-30 14:00:00,Everton,Luton Town FC,https://www.flashscore.nl/wedstrijd/8Qg2Hc1j/#...,https://www.flashscore.nl/wedstrijd/8Qg2Hc1j/#...,https://www.flashscore.nl/wedstrijd/8Qg2Hc1j/#...,English Premier League,"{'date': ['27.09.23', '23.09.23', '17.09.23', ...","{'date': ['26.09.23', '23.09.23', '16.09.23', ...","{'date': ['31.10.07', '24.10.06', '14.03.92', ...","{'0': {'team': {'var': [], 'date': ['17.09.202...","{'0': {'team': {'var': [], 'date': ['26.09.202...",{'0': 'Score prediction for home role using ho...,{'0': 'Score prediction for away team using aw...,{'0': 'Score prediction for both team using he...,"{'first_half': ['both teams'], 'injury_time': ..."


In [5]:
match_pred_1 = teamdata_extract('historic_match')
match_pred_1.head(3)

Unnamed: 0,date,hometeam,awayteam,match_urls,home_urls,away_urls,league,home_team_matches,away_team_matches,head2head_matches,home_team_matchespattern,away_team_matchespattern
0,2023-09-30 11:30:00,Aston Villa,Brighton,https://www.flashscore.nl/wedstrijd/E3fbIwnp/#...,https://www.flashscore.nl/wedstrijd/E3fbIwnp/#...,https://www.flashscore.nl/wedstrijd/E3fbIwnp/#...,English Premier League,"{'date': ['27.09.23', '24.09.23', '21.09.23', ...","{'date': ['27.09.23', '24.09.23', '21.09.23', ...","{'date': ['28.05.23', '08.12.22', '13.11.22', ...","{'0': {'team': {'var': [], 'date': ['27.09.202...","{'0': {'team': {'var': [], 'date': ['27.09.202..."
1,2023-09-30 14:00:00,AFC Bournemouth,Arsenal,https://www.flashscore.nl/wedstrijd/xr3WMJwT/#...,https://www.flashscore.nl/wedstrijd/xr3WMJwT/#...,https://www.flashscore.nl/wedstrijd/xr3WMJwT/#...,English Premier League,"{'date': ['27.09.23', '24.09.23', '17.09.23', ...","{'date': ['27.09.23', '24.09.23', '20.09.23', ...","{'date': ['04.03.23', '20.08.22', '27.01.20', ...","{'0': {'team': {'var': [], 'date': ['27.09.202...","{'0': {'team': {'var': [], 'date': ['27.09.202..."
2,2023-09-30 14:00:00,Everton,Luton Town FC,https://www.flashscore.nl/wedstrijd/8Qg2Hc1j/#...,https://www.flashscore.nl/wedstrijd/8Qg2Hc1j/#...,https://www.flashscore.nl/wedstrijd/8Qg2Hc1j/#...,English Premier League,"{'date': ['27.09.23', '23.09.23', '17.09.23', ...","{'date': ['26.09.23', '23.09.23', '16.09.23', ...","{'date': ['31.10.07', '24.10.06', '14.03.92', ...","{'0': {'team': {'var': [], 'date': ['17.09.202...","{'0': {'team': {'var': [], 'date': ['26.09.202..."


In [None]:
#Run the score-pattern analysis on the historic matches, then attach each resulting column
#to an independent copy so that match_pred_1 itself stays untouched.
additional_columns_1 = matchscore_total_analysis(match_pred_1)

modified_dataset_1 = match_pred_1.copy(deep=True)
for column_name, column_values in additional_columns_1.items():
    modified_dataset_1[column_name] = column_values
modified_dataset_1

In [6]:
#Keep only the predictions whose league is the Eredivisie
eredivisie = match_pred.loc[match_pred['league'].eq('Eredivisie')]
eredivisie

Unnamed: 0,date,hometeam,awayteam,match_urls,home_urls,away_urls,league,home_team_matches,away_team_matches,head2head_matches,home_team_matchespattern,away_team_matchespattern,home_score_patterns,away_score_patterns,h2h_score_patterns,innerdetail_analysis
55,2023-09-30 14:30:00,Feyenoord,Go Ahead Eagles,https://www.flashscore.nl/wedstrijd/zPDP9air/#...,https://www.flashscore.nl/wedstrijd/zPDP9air/#...,https://www.flashscore.nl/wedstrijd/zPDP9air/#...,Eredivisie,"{'date': ['27.09.23', '19.09.23', '16.09.23', ...","{'date': ['27.09.23', '22.09.23', '17.09.23', ...","{'date': ['14.05.23', '23.12.22', '03.09.22', ...","{'0': {'team': {'var': ['71', '92'], 'date': [...","{'0': {'team': {'var': [], 'date': ['27.09.202...",{'0': 'Score prediction for home team using ho...,{'0': 'Score prediction for away team using aw...,{'0': 'Score prediction for both team using he...,"{'first_half': ['home team'], 'injury_time': [..."
56,2023-09-30 16:45:00,PSV,FC Volendam,https://www.flashscore.nl/wedstrijd/EFCT8J6l/#...,https://www.flashscore.nl/wedstrijd/EFCT8J6l/#...,https://www.flashscore.nl/wedstrijd/EFCT8J6l/#...,Eredivisie,"{'date': ['27.09.23', '23.09.23', '20.09.23', ...","{'date': ['23.09.23', '16.09.23', '03.09.23', ...","{'date': ['16.04.23', '31.08.22', '19.01.21', ...","{'0': {'team': {'var': [], 'date': ['27.09.202...","{'0': {'team': {'var': [], 'date': ['16.09.202...",{'0': 'Score prediction for home team using ho...,{'0': 'Score prediction for away team using aw...,{},"{'first_half': ['home team'], 'injury_time': [..."
57,2023-09-30 19:00:00,FC Twente,sc Heerenveen,https://www.flashscore.nl/wedstrijd/445t6Hj7/#...,https://www.flashscore.nl/wedstrijd/445t6Hj7/#...,https://www.flashscore.nl/wedstrijd/445t6Hj7/#...,Eredivisie,"{'date': ['27.09.23', '24.09.23', '17.09.23', ...","{'date': ['23.09.23', '16.09.23', '02.09.23', ...","{'date': ['04.06.23', '01.06.23', '04.03.23', ...","{'0': {'team': {'var': [], 'date': ['27.09.202...","{'0': {'team': {'var': [], 'date': ['16.09.202...",{'0': 'Score prediction for home team using ho...,{},{'0': 'Score prediction for both roles using h...,"{'first_half': ['home team'], 'injury_time': [..."
58,2023-09-30 19:00:00,RKC Waalwijk,Ajax,https://www.flashscore.nl/wedstrijd/As6x7cy1/#...,https://www.flashscore.nl/wedstrijd/As6x7cy1/#...,https://www.flashscore.nl/wedstrijd/As6x7cy1/#...,Eredivisie,"{'date': ['24.09.23', '16.09.23', '02.09.23', ...","{'date': ['27.09.23', '21.09.23', '17.09.23', ...","{'date': ['12.02.23', '22.10.22', '06.03.22', ...","{'0': {'team': {'var': [], 'date': ['24.09.202...","{'0': {'team': {'var': [], 'date': ['17.09.202...",{'0': 'Score prediction for home role using ho...,{'0': 'Score prediction for away team using aw...,{'0': 'Score prediction for both teams using h...,"{'first_half': ['away team'], 'injury_time': [..."
120,2023-10-01 12:30:00,Excelsior Rotterdam,Sparta Rotterdam,https://www.flashscore.nl/wedstrijd/KCXlChDt/#...,https://www.flashscore.nl/wedstrijd/KCXlChDt/#...,https://www.flashscore.nl/wedstrijd/KCXlChDt/#...,Eredivisie,"{'date': ['23.09.23', '17.09.23', '02.09.23', ...","{'date': ['24.09.23', '17.09.23', '01.09.23', ...","{'date': ['05.03.23', '14.01.23', '31.07.21', ...","{'0': {'team': {'var': [], 'date': ['17.09.202...","{'0': {'team': {'var': [], 'date': ['17.09.202...",{'0': 'Score prediction for home team using ho...,{'0': 'Score prediction for away role using aw...,{'0': 'Score prediction for both teams using h...,{}
121,2023-10-01 12:30:00,Heracles Almelo,PEC Zwolle,https://www.flashscore.nl/wedstrijd/rmALTKrE/#...,https://www.flashscore.nl/wedstrijd/rmALTKrE/#...,https://www.flashscore.nl/wedstrijd/rmALTKrE/#...,Eredivisie,"{'date': ['28.09.23', '23.09.23', '16.09.23', ...","{'date': ['24.09.23', '17.09.23', '02.09.23', ...","{'date': ['15.01.23', '14.10.22', '26.02.22', ...","{'0': {'team': {'var': ['18'], 'date': ['16.09...","{'0': {'team': {'var': [], 'date': ['02.09.202...",{'0': 'Score prediction for home team using ho...,{'0': 'Score prediction for away team using aw...,{'0': 'Score prediction for both team using he...,"{'first_half': ['away team'], 'injury_time': [..."
122,2023-10-01 14:45:00,AZ,Fortuna Sittard,https://www.flashscore.nl/wedstrijd/EZWhBCSn/#...,https://www.flashscore.nl/wedstrijd/EZWhBCSn/#...,https://www.flashscore.nl/wedstrijd/EZWhBCSn/#...,Eredivisie,"{'date': ['28.09.23', '24.09.23', '21.09.23', ...","{'date': ['22.09.23', '16.09.23', '06.09.23', ...","{'date': ['16.04.23', '22.01.23', '16.01.22', ...","{'0': {'team': {'var': [], 'date': ['28.09.202...","{'0': {'team': {'var': [], 'date': ['22.09.202...",{'0': 'Score prediction for home team using ho...,{'0': 'Score prediction for away team using aw...,{'0': 'Score prediction for both teams using h...,"{'first_half': ['home team'], 'injury_time': [..."


In [65]:
list(eredivisie['home_score_patterns'])[5]

{'0': 'Score prediction for home team using home historic match scores, for only matches where home team played home role: 2 - 2',
 '1': 'Score prediction for home team using home historic match scores filtered by Eredivisie, for only matches where home team played home role: 2 - 2',
 '2': 'Score prediction for home role using home historic match scores, regardless of role played by home team: 3 - 3',
 '3': 'Score prediction for home role using home historic match scores, regardless of role played by home team: 3 - 0',
 '4': 'Score prediction for home role using home historic match scores, regardless of role played by home team: 0 - 3',
 '5': 'Score prediction for home role using home historic match scores, regardless of role played by home team: 0 - 0',
 '6': 'Score prediction for home role using home historic match scores after skipping rows, regardless of role played by home team: 4 - 3',
 '7': 'Score prediction for home role using home historic match scores after skipping rows, reg

In [66]:
pd.DataFrame(list(eredivisie['home_team_matches'])[5])

Unnamed: 0,date,league,away_club,home_club,away_club_goal,home_club_goal
0,28.09.23,ERE,Heracles Almelo,AZ,1,1
1,23.09.23,ERE,Heracles Almelo,FC Volendam,2,2
2,16.09.23,ERE,FC Utrecht,Heracles Almelo,3,1
3,02.09.23,ERE,Excelsior Rotterdam,Heracles Almelo,1,3
4,18.08.23,ERE,N.E.C.,Heracles Almelo,1,2
5,12.08.23,ERE,Heracles Almelo,Ajax,1,4
6,05.08.23,V,FC Volendam,Heracles Almelo,3,1
7,28.07.23,V,Excelsior Rotterdam,Heracles Almelo,1,2
8,25.07.23,V,FC Emmen,Heracles Almelo,0,1
9,22.07.23,V,Heracles Almelo,SC Cambuur,1,4


In [69]:
list(eredivisie['away_score_patterns'])[1]

{'0': 'Score prediction for away team using away historic match scores, for only matches where away team played away role: 5 - 1',
 '1': 'Score prediction for away team using away historic match scores, for only matches where away team played away role: 2 - 1',
 '2': 'Score prediction for away team using away historic match scores after skipping rows, for only matches where away team played away role: 5 - 1',
 '3': 'Score prediction for away team using away historic match scores after skipping rows, for only matches where away team played away role: 2 - 1',
 '4': 'Score prediction for away role using away historic match scores, regardless of role played by away team: 4 - 3',
 '5': 'Score prediction for away role using away historic match scores, regardless of role played by away team: 4 - 0',
 '6': 'Score prediction for away role using away historic match scores, regardless of role played by away team: 1 - 3',
 '7': 'Score prediction for away role using away historic match scores, rega

In [70]:
pd.DataFrame(list(eredivisie['away_team_matches'])[1])

Unnamed: 0,date,league,away_club,home_club,away_club_goal,home_club_goal
0,23.09.23,ERE,Heracles Almelo,FC Volendam,2,2
1,16.09.23,ERE,FC Volendam,Fortuna Sittard,1,3
2,03.09.23,ERE,FC Twente,FC Volendam,2,0
3,19.08.23,ERE,FC Volendam,Go Ahead Eagles,1,4
4,11.08.23,ERE,Vitesse,FC Volendam,2,1
5,05.08.23,V,FC Volendam,Heracles Almelo,3,1
6,01.08.23,V,Alanyaspor,FC Volendam,1,1
7,29.07.23,V,PAS Lamia,FC Volendam,0,2
8,22.07.23,V,FC Volendam,KV Kortrijk,1,2
9,15.07.23,V,KAA Gent,FC Volendam,2,1


In [44]:
pred = windrawloss_analysis(pd.DataFrame(list(eredivisie['away_team_matches'])[6]), 'Fortuna Sittard', 'away')
pred

#Comment out the win-loss-draw function pending when it might be needed in the future.
    #for a draw condition, the prediction can only hold if score has just one score gap
    #Only when the win-draw-loss is an option we filter with it
    

#Given a set of matches, the prime number is the most frequently occurring score; if the prediction doesn't contain that score, it is discarded.
    #The prime number must be present in both matches or else we don't have a prime number.
    
    #In the case of no prime number, we discard the prediction.
    #In the case where there is a prime number but it isn't in the prediction, we also discard the prediction.

['Score prediction for away team using away historic match scores, based on win-loss-draw pattern: draw, 3 - 3']

In [53]:
list(eredivisie['h2h_score_patterns'])[1]

{}

In [54]:
pd.DataFrame(list(eredivisie['head2head_matches'])[1])

Unnamed: 0,date,league,away_club,home_club,away_club_goal,home_club_goal
0,16.04.23,ERE,PSV,FC Volendam,3,2
1,31.08.22,ERE,FC Volendam,PSV,1,7
2,19.01.21,KNV,PSV,FC Volendam,2,0
3,26.10.17,KNV,PSV,FC Volendam,2,0
4,08.02.09,ERE,PSV,FC Volendam,5,3
5,27.09.08,ERE,FC Volendam,PSV,0,1
6,25.01.05,KNV,FC Volendam,PSV,0,4
7,02.05.04,ERE,PSV,FC Volendam,5,0
8,06.12.03,ERE,FC Volendam,PSV,0,7
9,18.04.98,ERE,FC Volendam,PSV,0,10


In [None]:
#Athletic Bilbao - Getafe
    #5-1 came from analysing history of home team (Athletic Bilbao), for only matches they played as home role
        #3,4 gives rise to 5 or 2, while 0,2 gives rise to 1, hence the combination is either 5-1 or 2-1
    #5-1 came from analysing history of home team (Athletic Bilbao), for home role regardless of team after skipping
        #3,4 gives rise to 5 or 2, while 0,2 gives rise to 1, hence the combination is either 5-1 or 2-1
    #5-1 came from analysing history of home team (Athletic Bilbao), for home team regardless of role played after skipping
        #3,4 gives rise to 5 or 2, while 0,2 gives rise to 1, hence the combination is either 5-1 or 2-1

#Real Madrid - Las Palmas: Predictions are not considered interesting with respect to the filtering condition.
    #Also the system detected the 3-2, but not the 2-1, so please explain.
    
#Cadiz, I don't see the 1-1, or the 1-0 on any of the patterns accounted for.
    #So please explain to see if there is something I didn't account for.
    
    #0-1 came from analysing the history of the away team (Rayo Vallecano) for only games it played as the away team
        #2,1 gives rise to 3 or 0, while 2,0 gives rise to 1, hence the combination is either 3-1 or 0-1
    #0-2 came from analysing the history of the away team (Rayo Vallecano) for away role (regardless of playing team)
        #1,2 gives rise to 3 or 0, while 1,0 gives rise to 2, hence the combination is either 3-2 or 0-2
    #2-0 came from analysing the history of the away team (Rayo Vallecano) for away team (regardless of role they played)
        #1,0 gives rise to 2, while 1,2 gives rise to 3 or 0, hence the combination is either 2-3 or 2-0
    #2-1 came from analysing the history of the away team (Rayo Vallecano) for away team (regardless of role they played) after skipping
        #0,1 gives rise to 2, while 2,0 gives rise to 1, hence the combination is 2-1
        
#Catanzaro - Cittadella
    #Head2Head history wasn't available at the website
    
    #2-1 came from analysing history of away team (Cittadella), for away role (regardless of playing team)
        #0,1 gives rise to 2, while 3,2 gives rise to 1 or 4, hence the combination is either 2-4 or 2-1
    #2-1 came from analysing history of away team (Cittadella), for away team (regardless of role played)
        #3,1 gives rise to 2, while 0,2 gives rise to 1, hence the combination is 2-1
        
    #I don't see 2-0 by any of the patterns defined, please explain
    
#Mallorca - Barcelona had patterns but wasn't considered interesting by any of the conditions we defined

#Como - Sampdoria had patterns but wasn't considered interesting by any of the conditions we defined

#Parma - Bari had patterns but wasn't considered interesting by any of the conditions we defined

#For any particular tab without prediction in an available League
    #-Either there were no patterns noticed for the historic matches in that section
    #-Or the page didn't load properly during extraction which led to no extraction at all.

#For switching the platform to the Dutch language.
    #The framework used is a very simple one without many functionalities; it was chosen just to present the predictions in a better form than sending everything by email.
    #However, I will do some research and see if there's a way to do this given the limitations.
    
    #Important Note: Translation can be done on the page using Google Translate to any language of choice (including Dutch)

In [None]:
#Heracles Almelo - PEC Zwolle
    #2-0 came from analysing history of away team (PEC Zwolle), for only matches they played as away role
        #1,3 gives rise to 2, while 2,1 gives rise to 3 or 0, hence the combination is either 2-3 or 2-0
        
#AZ - Fortuna Sittard
    #4-1 came from analysing history of away team (Fortuna Sittard) for only games played in Eredivisie, for only matches they played as away role
        #3,2 gives rise to 4 or 1, while 0,2 gives rise to 1, hence the combination is either 4-1 or 1-1
    #0-2 came from analysing history of home team (AZ), for only matches they played as home role
        #1,2 gives rise to 3 or 0, while 1,0 gives rise to 2, hence the combination is either 3-2 or 0-2
    #0-2 came from analysing history of home team (AZ) for only games played in Eredivisie, for only matches they played as home role
        #1,2 gives rise to 3 or 0, while 1,0 gives rise to 2, hence the combination is either 3-2 or 0-2 
        
#PSV - FC Volendam (the concern raised was 3-1, but the prediction was actually 1-3)
    #1-3 came from analysing history of away team (FC Volendam) for away role (regardless of playing team)
        #2,3 gives rise to 4 or 1, while 2,1 gives rise to 3 or 0, hence the combination is either 4-3, 4-0, 1-3 or 1-0
    #1-3 came from analysing history of away team (FC Volendam) for only games played in Eredivisie, for away role (regardless of playing team)
        #2,3 gives rise to 4 or 1, while 2,1 gives rise to 3 or 0, hence the combination is either 4-3, 4-0, 1-3 or 1-0
    #1-3 came from analysing history of away team (FC Volendam) for away team (regardless of role played)
        #2,3 gives rise to 4 or 1, while 2,1 gives rise to 3 or 0, hence the combination is either 4-3, 4-0, 1-3 or 1-0
    #1-3 came from analysing history of away team (FC Volendam) for only games played in Eredivisie, for away team (regardless of role played)
        #2,3 gives rise to 4 or 1, while 2,1 gives rise to 3 or 0, hence the combination is either 4-3, 4-0, 1-3 or 1-0

In [None]:
#Expansion

#Extract the history of all clubs in all leagues, and then get the historical match setups
    #Check the predictions for each of these match setups as we would in real time, to see which ones occur most often