In [1]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the per-season result CSVs of each league listed in `leagues` from
# football-data.co.uk into the local "data/" directory. The names of the files
# written are collected in `file_names` for the cells that follow.

base_url = "https://www.football-data.co.uk/"

# country slug (the listing page is "<country>m.php") -> link text of that
# league's CSV links on the listing page
leagues = {
    "england": "Premier League",
    "germany": "Bundesliga 1",
    "italy": "Serie A",
    "spain": "La Liga Primera Division",
    "france": "Le Championnat"
}

# division code found in the CSV URL -> prefix used for the local file name
aliases = {
    "F1": "france_division_1",
    "E0": "england_division_1",
    "D1": "germany_division_1",
    "I1": "italy_division_1",
    "SP1": "spain_division_1"
}

# Only the first MAX_SEASONS matching links of each page are downloaded
# (presumably the most recent seasons -- page ordering is not verified here).
MAX_SEASONS = 22
# Seconds to wait on each HTTP request before giving up instead of hanging.
REQUEST_TIMEOUT = 30

file_names = []

# open(..., mode='wb') below does not create missing directories.
os.makedirs("data", exist_ok=True)

for country, league_name in leagues.items():

    league_html = requests.get(base_url + country + "m.php", timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx rather than parsing an error page.
    league_html.raise_for_status()
    soup = BeautifulSoup(league_html.text, 'lxml')

    # Keep only anchors that carry an href and whose direct contents include the
    # league's link text; a matching tag without href would raise KeyError below.
    league_data_links = soup.find_all(
        lambda tag: tag.name == "a"
        and tag.has_attr("href")
        and league_name in tag.contents
    )

    links = [tag['href'] for tag in league_data_links][:MAX_SEASONS]

    for league_url in links:

        # generate file name
        # assumes hrefs look like "<prefix>/<season>/<division>.csv",
        # e.g. a season code "2021" becomes "20-21" -- TODO confirm against the site
        league_info = league_url.split(".csv")[0].split("/")[1:]
        year_str = league_info[0]
        year = year_str[:2] + "-" + year_str[2:]
        # Separate name so the outer loop's `league_name` is not overwritten.
        league_alias = aliases[league_info[1]]
        file_name = league_alias + "_" + year + ".csv"

        # call api and write data to file
        league_data = requests.get(base_url + league_url, timeout=REQUEST_TIMEOUT)
        # Never save an HTTP error body under a .csv name.
        league_data.raise_for_status()
        with open("data/" + file_name, mode='wb') as f:
            f.write(league_data.content)

        # Record the file only once it has actually been written.
        file_names.append(file_name)


In [3]:
# Build one combined CSV per country: read every downloaded season file whose
# name contains the country slug, parse its "Date" column, concatenate the
# seasons, sort by date and home team, and write "data/<country>_data.csv".
for country in leagues.keys():
    country_dfs = [] # league-specific dataframe
    for file in file_names:
        if country not in file:
            continue
        # on_bad_lines="skip" silently drops malformed rows. It replaces
        # error_bad_lines=False / warn_bad_lines=False, which were deprecated
        # in pandas 1.3 and removed in pandas 2.0.
        df = pd.read_csv("data/" + file, on_bad_lines="skip", encoding='latin1')
        # dayfirst=True: parse ambiguous dates as day/month rather than month/day.
        df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
        country_dfs.append(df)

    if not country_dfs:
        # pd.concat raises ValueError on an empty list; skip the country so the
        # remaining ones are still written.
        print("No season files found for " + country + "; skipping.")
        continue

    country_df = pd.concat(country_dfs)
    country_df = country_df.sort_values(by=["Date", "HomeTeam"], ignore_index=True)
    file_name = country + "_data.csv"
    country_df.to_csv("data/" + file_name)

In [4]:
# Build a single CSV covering every downloaded file: read each season file,
# parse its "Date" column, concatenate everything, sort by date and home team,
# and write "data/all_data.csv".
total_dfs = [] # all leagues in one dataframe
for file in file_names:
    # on_bad_lines="skip" silently drops malformed rows. It replaces
    # error_bad_lines=False / warn_bad_lines=False, which were deprecated in
    # pandas 1.3 and removed in pandas 2.0.
    df = pd.read_csv("data/" + file, on_bad_lines="skip", encoding='latin1')
    # dayfirst=True: parse ambiguous dates as day/month rather than month/day.
    df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
    total_dfs.append(df)

total_df = pd.concat(total_dfs)
total_df = total_df.sort_values(by=["Date", "HomeTeam"], ignore_index=True)
file_name = "all_data.csv"
total_df.to_csv("data/" + file_name)