# Statistics Football Data Scraper

This script will look into the future football matches and scrape important statistics to help betting using ClubELO API.

Documentation Link: http://clubelo.com/API 

### Importing Libraries

In [1]:

import requests
import csv
from io import StringIO
import pandas as pd
import os
from datetime import datetime
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import urllib.parse

### Making API Pull and Creating Dataframe

In [2]:
url = 'http://api.clubelo.com/Fixtures'
response = requests.get(url)

if response.status_code != 200:
    print("Unable to retrieve data.\n")

with open("Fixtures.csv", "wb") as f:
    f.write(response.content)

### Scraping Win, Loss, Draw %

This part scrapes the probability of win, loss, draw for each team based on the excel file donwloaded from the API request, outputing an excel file with those percentages.

In [3]:
data = StringIO(response.text)
df = pd.read_csv(data, sep=",")

df["Game_ID"] = df["Home"] + " vs " + df["Away"]
df['Home Win%'] = df.apply(lambda x: round((x['GD=1'] + x['GD=2'] + x['GD=3'] + x['GD=4'] + x['GD=5'] + x['GD>5']) * 100, 2) if (x['GD=-1'] + x['GD=0'] + x['GD=1'] + x['GD=2'] + x['GD=3'] + x['GD=4'] + x['GD>5']) > 0 else '', axis=1)
df['Away Win%'] = df.apply(lambda x: round((x['GD<-5'] + x['GD=-5'] + x['GD=-4'] + x['GD=-3'] + x['GD=-2'] + x['GD=-1']) * 100, 2) if (x['GD=-1'] + x['GD=0'] + x['GD=1'] + x['GD=2'] + x['GD=3'] + x['GD=4'] + x['GD>5']) > 0 else '', axis=1)
df['Draw%'] = df.apply(lambda x: round(x['GD=0'] * 100, 2) if x['GD=0'] != 0 else '', axis=1)

# Create a new DataFrame with selected columns
new_df = df[['Game_ID', 'Home Win%', 'Away Win%', 'Draw%']]

# Save the new DataFrame to an Excel file directly with the name "WinLossDraw.xlsx"
new_df.to_excel('WinLossDraw.xlsx', sheet_name='Fixtures', index=False)

### Creating Outcomes Table, Stats for Last Games and Performance
 
The code generates an outcomes table for football games based on unique Game_IDs. It also scrapes the last games' different outcomes  (draws, wins, and losses) for the home team and away team in each game. Additionally, the code uses web scraping to gather the results of their last 5 games agains't other teams (won, lost, or drawn).

In [6]:
# Iterate over each unique Game_ID
unique_game_ids = df['Game_ID'].unique()
# Get the total number of games
total_games = int(len(unique_game_ids))
print("Total number of games found:", total_games)

for game_id in unique_game_ids:
    game_df = df[df['Game_ID'] == game_id].copy()  # Filter DataFrame for the current Game_ID
    game_df.reset_index(drop=True, inplace=True)   # Reset the index of the filtered DataFrame

    # Create a new DataFrame for the current Game_ID
    result_df = pd.DataFrame(index=range(8), columns=range(8))
    result_df.iloc[0, 0] = "Goals"
    result_df.iloc[0, 1] = "0"
    result_df.iloc[0, 2] = "1"
    result_df.iloc[0, 4] = "3"
    result_df.iloc[0, 5] = "4"
    result_df.iloc[0, 6] = "5"
    result_df.iloc[0, 7] = "6"
    result_df.iloc[1, 0] = "0"
    result_df.iloc[2, 0] = "1"
    result_df.iloc[4, 0] = "3"
    result_df.iloc[5, 0] = "4"
    result_df.iloc[6, 0] = "5"
    result_df.iloc[7, 0] = "6"
    
    result_df.iloc[0, 3] = game_df.loc[0, 'Home'] + "| 2" # Home Team
    result_df.iloc[3, 0] = game_df.loc[0, 'Away'] + "| 2" # Away Team

    result_df.iloc[1, 1] = game_df.loc[0, 'R:0-0'] *100 # Draw 0-0
    result_df.iloc[2, 1] = game_df.loc[0, 'R:0-1'] *100 # Home team loses 0-1
    result_df.iloc[3, 1] = game_df.loc[0, 'R:0-2'] *100 # Home team loses 0-2
    result_df.iloc[4, 1] = game_df.loc[0, 'R:0-3'] *100 # Home team loses 0-3
    result_df.iloc[5, 1] = game_df.loc[0, 'R:0-4'] *100 # Home team loses 0-4
    result_df.iloc[6, 1] = game_df.loc[0, 'R:0-5'] *100 # Home team loses 0-5
    result_df.iloc[7, 1] = game_df.loc[0, 'R:0-6'] *100 # Home team loses 0-6

    result_df.iloc[1, 2] = game_df.loc[0, 'R:1-0'] *100 # Home team wins 1-0
    result_df.iloc[1, 3] = game_df.loc[0, 'R:2-0'] *100 # Home team wins 2-0
    result_df.iloc[1, 4] = game_df.loc[0, 'R:3-0'] *100 # Home team wins 3-0
    result_df.iloc[1, 5] = game_df.loc[0, 'R:4-0'] *100 # Home team wins 4-0
    result_df.iloc[1, 6] = game_df.loc[0, 'R:5-0'] *100 # Home team wins 5-0
    result_df.iloc[1, 7] = game_df.loc[0, 'R:6-0'] *100 # Home team wins 6-0

    result_df.iloc[2, 2] = game_df.loc[0, 'R:1-1'] *100 # Draw 1-1
    result_df.iloc[3, 2] = game_df.loc[0, 'R:1-2'] *100 # Home team loses 1-2
    result_df.iloc[4, 2] = game_df.loc[0, 'R:1-3'] *100 # Home team loses 1-3
    result_df.iloc[5, 2] = game_df.loc[0, 'R:1-4'] *100 # Home team loses 1-4
    result_df.iloc[6, 2] = game_df.loc[0, 'R:1-5'] *100 # Home team loses 1-5

    result_df.iloc[2, 3] = game_df.loc[0, 'R:2-1'] *100 # Home team wins 2-1
    result_df.iloc[3, 3] = game_df.loc[0, 'R:2-2'] *100 # Draw 2-2
    result_df.iloc[4, 3] = game_df.loc[0, 'R:2-3'] *100 # Home team loses 2-3
    result_df.iloc[5, 3] = game_df.loc[0, 'R:2-4'] *100 # Home team loses 2-4

    result_df.iloc[2, 4] = game_df.loc[0, 'R:3-1'] *100 # Home team wins 3-1
    result_df.iloc[3, 4] = game_df.loc[0, 'R:3-2'] *100 # Home team wins 3-2
    result_df.iloc[4, 4] = game_df.loc[0, 'R:3-3'] *100 # Draw 3-3

    result_df.iloc[2, 5] = game_df.loc[0, 'R:4-1'] *100 # Home team wins 4-1
    result_df.iloc[3, 5] = game_df.loc[0, 'R:4-2'] *100 # Home team wins 4-2

    result_df.iloc[2, 6] = game_df.loc[0, 'R:5-1'] *100 # Home team wins 5-1

     # Fill NaN values with "#"
    result_df.fillna("#", inplace=True)


    # Calculating most likely outcomes between the 2 teams
    rows = [1, 2, 3, 4, 5, 6, 7]
    columns = [1, 2, 3, 4, 5, 6, 7]
    percentage_of_outcome = 0

    home_win_percentage = result_df.iloc[1, 2]+result_df.iloc[1, 3]+result_df.iloc[1, 4]+result_df.iloc[1, 5]+result_df.iloc[1, 6]+result_df.iloc[1, 7]
    +result_df.iloc[2, 3]+result_df.iloc[2, 4]+result_df.iloc[3, 4]+result_df.iloc[2, 5]+result_df.iloc[3, 5]+result_df.iloc[2, 6]

    away_win_percentage = result_df.iloc[2, 1]+result_df.iloc[3, 1]+result_df.iloc[4, 1]+result_df.iloc[5, 1]+result_df.iloc[6, 1]+result_df.iloc[7, 1]
    +result_df.iloc[3, 2]+result_df.iloc[4, 2]+result_df.iloc[5, 2]+result_df.iloc[6, 2]+result_df.iloc[4, 3]+result_df.iloc[5, 3]
    
    draw_percentage = result_df.iloc[1, 1]+result_df.iloc[2, 2]+result_df.iloc[3, 3]+result_df.iloc[4, 4]

    for i in rows:
        for j in columns:
            if result_df.iloc[i,j] == '#':
                percentage_of_outcome = percentage_of_outcome
            elif    result_df.iloc[i,j] > percentage_of_outcome:
                percentage_of_outcome = result_df.iloc[i,j]
                row = i
                column = j
            
    likely_outcome = str(column-1)+" - "+str(row-1) 

    print("Fixture: ",game_id)
    print("Most likely outcome: ", likely_outcome ," with ", round(percentage_of_outcome,2),"%")
    print("Probability of Home Team win: ", round(home_win_percentage,2),"%")
    print("Probability of Away Team win: ", round(away_win_percentage,2),"%")
    print("Probability of Draw: ",round(draw_percentage,2),"%")
    # Note that the percentages rarely will  add up to 100% because there are outcomes that arent featured in the outcomes table

    # Print the resulting DataFrame for the current Game_ID
    print(f"Table for Game_ID: {game_id}")

    # Print the DataFrame without row indices and without column header
    result_df_reset = result_df.reset_index(drop=True)
    print(result_df_reset.to_string(index=False, header=False))

    # Get the team names from the first row of game_df
    teams_participating = []

    for i in range(len(game_df)):
        home_team = game_df.loc[i, 'Home']
        away_team = game_df.loc[i, 'Away']
        
        teams_participating.append(home_team)
        teams_participating.append(away_team)

    # Create a Selenium WebDriver instance
    driver = webdriver.Chrome()

    for team in teams_participating:
        encoded_team = urllib.parse.quote(team.replace(" ", ""))
        
        # Get data from each team
        url = f"http://clubelo.com/{encoded_team}" 

        response = requests.get(url)
        time.sleep(5)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Coordinates of the Results Column
        home_goals_coordinates = [(615, y) for y in range(63, 364, 20)]
        away_goals_coordinates = [(625, y) for y in range(63, 364, 20)]

        home_goals = []
        away_goals = []

        # Retrieve number of Home Team goals scored
        for coordinate in home_goals_coordinates:
            x, y = coordinate
            try:
                score_element = soup.find("text", {"x": str(x), "y": str(y)})
                score_text = score_element.text.strip()
                home_goals.append(int(score_text))
            except ValueError:
                home_goals.append(None)

            # For the cases where the table is incomplete
            except AttributeError:
                pass

        # Retrieve number of Away Team goals scored
        for coordinate in away_goals_coordinates:
            x, y = coordinate
            try:
                score_element = soup.find("text", {"x": str(x), "y": str(y)})
                score_text = score_element.text.strip()
                away_goals.append(int(score_text))
            except ValueError:
                away_goals.append(None)

            # For the cases where the table is incomplete
            except AttributeError:
                pass

        # Calculate outcomes
        games_won = 0
        games_lost = 0
        games_draw = 0

        start_ind = None
        last_5_games = ""

        for i in range(len(home_goals)):
            if home_goals[i] is not None and away_goals[i] is not None:
                # For cases where the first lines of the table aren't results
                if start_ind is None:
                    start_ind = i

                if home_goals[i] > away_goals[i]:
                    games_won += 1
                    if i >= start_ind and i < start_ind + 5:
                        last_5_games += "W"
                elif home_goals[i] == away_goals[i]:
                    games_draw += 1
                    if i >= start_ind and i < start_ind + 5:
                        last_5_games += "D"
                else:
                    games_lost += 1
                    if i >= start_ind and i < start_ind + 5:
                        last_5_games += "L"

        print(f"Team: {team}")
        print("Games won: ", games_won)
        print("Games lost: ", games_lost)
        print("Games draw: ", games_draw)
        print("Last 5 games: ", last_5_games)
        print("------------------------------")

driver.quit()

Total number of games found: 102
Most likely outcome:  1 - 1  with  13.11 %
Probability of Home Team win:  22.78 %
Probability of Away Team win:  14.27 %
Probability of Draw:  27.61 %
Table for Game_ID: LASK vs Rapid Wien
        Goals     0      1 LASK| 2     3     4     5     6
            0  7.62   9.68    7.26  3.64   1.4  0.64  0.16
            1  7.25  13.11    9.89  5.14  1.43  0.56     #
Rapid Wien| 2  4.56   6.67    5.53  2.64   1.0     #     #
            3  1.86    2.4    1.91  1.35     #     #     #
            4  0.43   0.99    0.41     #     #     #     #
            5  0.13   0.21       #     #     #     #     #
            6  0.04      #       #     #     #     #     #


Team: LASK
Games won:  7
Games lost:  4
Games draw:  3
Last 5 games:  WDLDW
------------------------------
Team: Rapid Wien
Games won:  3
Games lost:  5
Games draw:  6
Last 5 games:  WLDLL
------------------------------
Most likely outcome:  1 - 1  with  12.27 %
Probability of Home Team win:  29.64 %
Pr

KeyboardInterrupt: 

### Deleting Unnecessary Files

Deletes all the text and excel files created that are useless in the final product.

In [None]:

# Delete the files
files_to_delete = [".~lock.Fixtures.csv#", "Fixtures.csv", "WinLossDraw.xlsx"]
for file_name in files_to_delete:
    if os.path.exists(file_name):
        os.remove(file_name)