In [25]:
import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Site root; listing pages ("<country>m.php") and CSV paths are appended to it.
base_url = "https://www.football-data.co.uk/"

# Country slug used in the listing-page URL -> league name that the download
# loop looks for among the contents of the tags on that page.
# To experiment with a single league, temporarily narrow this mapping to one
# entry, e.g. {"france": "Le Championnat"}.
leagues = dict(
    england="Premier League",
    germany="Bundesliga 1",
    italy="Serie A",
    spain="La Liga Primera Division",
    france="Le Championnat",
)

# Division code taken from the CSV URL -> prefix of the local file name.
aliases = dict(
    F1="france_division_1",
    E0="england_division_1",
    D1="germany_division_1",
    I1="italy_division_1",
    SP1="spain_division_1",
)

# Local CSV file names; the download loop appends to this list.
file_names = list()

# Download every season CSV for each configured league into "data/".

# The output folder must exist before any file is opened for writing.
os.makedirs("data", exist_ok=True)

for country, league_name in leagues.items():
    # Each country has one listing page: "<base_url><country>m.php".
    # A timeout keeps a stalled connection from hanging the cell forever.
    league_html = requests.get(base_url + country + "m.php", timeout=10)
    # Stop here on an HTTP error instead of parsing an error page for links.
    league_html.raise_for_status()
    soup = BeautifulSoup(league_html.text, 'lxml')
    # Keep tags that carry an href and have the league name among their
    # direct children; requiring the href avoids a KeyError below.
    league_data_links = soup.find_all(
        lambda x: x.has_attr('href') and league_name in x.contents
    )

    # Only the first 22 matching links on the page are downloaded.
    links = [tag['href'] for tag in league_data_links][:22]
    print('links:', links)
    for league_url in links:
        # Build the local file name from the URL,
        # e.g. "mmz4281/2425/E0.csv" -> ["2425", "E0"] -> "england_division_1_24-25.csv".
        league_info = league_url.split(".csv")[0].split("/")[1:]
        year_str = league_info[0]
        year = year_str[:2] + "-" + year_str[2:]
        league_code = league_info[1]
        if league_code not in aliases:
            # A division code without an alias cannot be named; skip it
            # rather than abort the whole run with a KeyError.
            print('skipping unknown league code:', league_code)
            continue
        # A separate name keeps the outer loop's `league_name` intact.
        file_prefix = aliases[league_code]
        file_name = file_prefix + "_" + year + ".csv"

        # Fetch the CSV and write it to disk.
        print('base_url:', base_url)
        print('league_url:', league_url)
        try:
            league_data = requests.get(base_url + league_url, timeout=10)
            # Without this check an HTML error page would be saved as a CSV.
            league_data.raise_for_status()
        except requests.exceptions.RequestException as exc:
            print('download failed:', league_url, exc)
            continue
        with open("data/" + file_name, mode='wb') as f:
            f.write(league_data.content)
        # Record the file only once it is really on disk.
        file_names.append(file_name)


In [26]:
import requests
from bs4 import BeautifulSoup
import time
import os
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Root URL that listing pages and CSV paths are joined onto.
base_url = "https://www.football-data.co.uk/"

# (country slug, league name searched for on that country's listing page)
leagues = {
    country: link_text
    for country, link_text in (
        ("england", "Premier League"),
        ("germany", "Bundesliga 1"),
        ("italy", "Serie A"),
        ("spain", "La Liga Primera Division"),
        ("france", "Le Championnat"),
    )
}

# (division code from the CSV URL, prefix used for the saved file name)
aliases = {
    code: prefix
    for code, prefix in (
        ("F1", "france_division_1"),
        ("E0", "england_division_1"),
        ("D1", "germany_division_1"),
        ("I1", "italy_division_1"),
        ("SP1", "spain_division_1"),
    )
}

# Build a session with automatic retries and a default timeout.
def create_session(retries=5, backoff=1, timeout=10):
    """Return a ``requests.Session`` that retries GETs and applies a default timeout.

    Args:
        retries: Passed to ``Retry(total=...)``.
        backoff: Passed to ``Retry(backoff_factor=...)``.
        timeout: Timeout in seconds used for every request that does not
            pass its own ``timeout`` keyword.

    Returns:
        The configured session, with the retry adapter mounted for both
        ``http://`` and ``https://`` URLs.
    """
    session = requests.Session()
    retry = Retry(
        total=retries,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        # Return the last response once retries are used up instead of
        # raising; callers are expected to check the status themselves.
        raise_on_status=False
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Capture the original bound method before it is replaced below.
    send = session.request

    def request_with_default_timeout(*args, **kwargs):
        # setdefault lets a caller override the timeout; passing it
        # unconditionally raised "got multiple values for keyword argument
        # 'timeout'" whenever a caller supplied one.
        kwargs.setdefault("timeout", timeout)
        return send(*args, **kwargs)

    # session.get()/post()/... all go through session.request, so they
    # pick up the default timeout too.
    session.request = request_with_default_timeout
    return session

# Download every season CSV for each configured league into "data/",
# using the retrying session and pausing between downloads.
session = create_session()

# Names of the CSV files that were saved successfully in this run.
file_names = []

os.makedirs("data", exist_ok=True)

for country, league_name in leagues.items():
    try:
        print(f"\n处理：{country.upper()} - {league_name}")
        league_html = session.get(base_url + country + "m.php")
        # The retry policy returns the last response even when it is an
        # error status; raise here so an error page is not parsed for links
        # (the outer handler reports it for this country).
        league_html.raise_for_status()
        soup = BeautifulSoup(league_html.text, 'lxml')
        # Keep tags that carry an href and have the league name among their
        # direct children; requiring the href prevents a KeyError that
        # would abort the whole country.
        league_data_links = soup.find_all(
            lambda x: x.has_attr('href') and league_name in x.contents
        )

        # Only the first 22 matching links on the page are downloaded.
        links = [tag['href'] for tag in league_data_links][:22]
        print('找到链接:', links)

        for league_url in links:
            # e.g. "mmz4281/2425/E0.csv" -> ["2425", "E0"]
            league_info = league_url.split(".csv")[0].split("/")[1:]
            year_str = league_info[0]
            year = year_str[:2] + "-" + year_str[2:]
            league_code = league_info[1]

            if league_code not in aliases:
                print(f"忽略未知联盟代码：{league_code}")
                continue

            league_name_mapped = aliases[league_code]
            file_name = league_name_mapped + "_" + year + ".csv"

            full_url = base_url + league_url
            print(f"下载中：{full_url}")

            try:
                response = session.get(full_url)
                response.raise_for_status()
                with open("data/" + file_name, mode='wb') as f:
                    f.write(response.content)
                # Record the file only after it was written, so later cells
                # are not told about files that never reached the disk.
                file_names.append(file_name)
                print(f"保存成功：{file_name}")
            except requests.exceptions.RequestException as e:
                print(f"⚠️  下载失败：{full_url}，原因：{e}")

            time.sleep(1)  # throttle requests to avoid being blocked
    except Exception as e:
        # Best effort: report the failure and move on to the next country.
        print(f"❌ 页面处理失败 ({country}): {e}")



处理：ENGLAND - Premier League
找到链接: ['mmz4281/2425/E0.csv', 'mmz4281/2324/E0.csv', 'mmz4281/2223/E0.csv', 'mmz4281/2122/E0.csv', 'mmz4281/2021/E0.csv', 'mmz4281/1920/E0.csv', 'mmz4281/1819/E0.csv', 'mmz4281/1718/E0.csv', 'mmz4281/1617/E0.csv', 'mmz4281/1516/E0.csv', 'mmz4281/1415/E0.csv', 'mmz4281/1314/E0.csv', 'mmz4281/1213/E0.csv', 'mmz4281/1112/E0.csv', 'mmz4281/1011/E0.csv', 'mmz4281/0910/E0.csv', 'mmz4281/0809/E0.csv', 'mmz4281/0708/E0.csv', 'mmz4281/0607/E0.csv', 'mmz4281/0506/E0.csv', 'mmz4281/0405/E0.csv', 'mmz4281/0304/E0.csv']
下载中：https://www.football-data.co.uk/mmz4281/2425/E0.csv
保存成功：england_division_1_24-25.csv
下载中：https://www.football-data.co.uk/mmz4281/2324/E0.csv
保存成功：england_division_1_23-24.csv
下载中：https://www.football-data.co.uk/mmz4281/2223/E0.csv
保存成功：england_division_1_22-23.csv
下载中：https://www.football-data.co.uk/mmz4281/2122/E0.csv
保存成功：england_division_1_21-22.csv
下载中：https://www.football-data.co.uk/mmz4281/2021/E0.csv
保存成功：england_division_1_20-21.csv
下载中：http

# Local CSV names for every league/season combination, grouped by league and
# ordered oldest season first within each league.
#
# The list is generated instead of typed out, so extending it by a season
# only means bumping LAST_SEASON_START.
#
# NOTE(review): this rebinds `file_names`, replacing whatever list the
# download cell collected; files that failed to download are still listed.
LEAGUE_PREFIXES = ["england", "france", "germany", "italy", "spain"]
FIRST_SEASON_START = 3   # first season is written "03-04"
LAST_SEASON_START = 24   # last season is written "24-25"

file_names = [
    # Two-digit start and end year, e.g. 9 -> "09-10"; `% 100` keeps the
    # suffix two digits wide if the range ever reaches 99.
    f"{prefix}_division_1_{start % 100:02d}-{(start + 1) % 100:02d}.csv"
    for prefix in LEAGUE_PREFIXES
    for start in range(FIRST_SEASON_START, LAST_SEASON_START + 1)
]

In [27]:
# Merge all season files of each country into "data/<country>_data.csv".
for country in leagues.keys():
    country_dfs = []  # one DataFrame per season file of this country
    for file in file_names:
        # Match on the "<country>_" prefix rather than anywhere in the name.
        if not file.startswith(country + "_"):
            continue
        path = "data/" + file
        try:
            # NOTE: on_bad_lines='skip' silently drops malformed rows.
            df = pd.read_csv(path, on_bad_lines='skip', encoding='utf-8')
        except UnicodeDecodeError:
            # The file is not valid UTF-8; latin1 accepts any byte sequence.
            df = pd.read_csv(path, on_bad_lines='skip', encoding='latin1')
        except FileNotFoundError:
            # A season whose download failed should not abort the merge.
            print(f"skipping missing file: {path}")
            continue
        df["Date"] = pd.to_datetime(df["Date"], format="mixed", dayfirst=True)
        country_dfs.append(df)

    if not country_dfs:
        # pd.concat raises on an empty list; report and move on instead.
        print(f"no season files found for {country}; nothing written")
        continue

    country_df = pd.concat(country_dfs)
    country_df = country_df.sort_values(by=["Date", "HomeTeam"], ignore_index=True)
    file_name = country + "_data.csv"
    # The index is written as the first, unnamed column (unchanged behavior).
    country_df.to_csv("data/" + file_name)

In [28]:
# Merge every season file of every league into "data/all_data.csv".
total_dfs = []  # one DataFrame per season file, all leagues
for file in file_names:
    path = "data/" + file
    try:
        # NOTE: on_bad_lines='skip' silently drops malformed rows.
        df = pd.read_csv(path, on_bad_lines='skip', encoding='utf-8')
    except UnicodeDecodeError:
        # The file is not valid UTF-8; latin1 accepts any byte sequence.
        df = pd.read_csv(path, on_bad_lines='skip', encoding='latin1')
    except FileNotFoundError:
        # A season whose download failed should not abort the merge.
        print(f"skipping missing file: {path}")
        continue
    df["Date"] = pd.to_datetime(df["Date"], format='mixed', dayfirst=True)
    total_dfs.append(df)

if not total_dfs:
    # Same exception type pd.concat would raise, with an actionable message.
    raise ValueError("no season files could be read from data/; run the download cell first")

total_df = pd.concat(total_dfs)
total_df = total_df.sort_values(by=["Date", "HomeTeam"], ignore_index=True)
file_name = "all_data.csv"
# The index is written as the first, unnamed column (unchanged behavior).
total_df.to_csv("data/" + file_name)