In [9]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import time
import random

def get_soup(url):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;"
            "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        
        # critical: these are what requests is missing
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Dest": "document",

        # also critical: Chrome client hints
        "Sec-Ch-Ua": (
            '"Chromium";v="120", "Not(A:Brand";v="24", "Google Chrome";v="120"'
        ),
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
    }

    session = requests.Session()
    resp = session.get(url, headers=headers)
    
    if resp.status_code == 403:
        print("403 received — retrying after 20–40 seconds...")
        time.sleep(random.uniform(20, 40))
        resp = session.get(url, headers=headers)

    resp.raise_for_status()
    time.sleep(random.uniform(4, 7))  # anti-block delay

    return BeautifulSoup(resp.text, "html.parser")


In [2]:
def extract_table(soup, table_id):
    table = soup.find("table", id=table_id)
    if table:
        return table

    # Search inside HTML comments
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for c in comments:
        comment_soup = BeautifulSoup(c, "html.parser")
        table = comment_soup.find("table", id=table_id)
        if table:
            return table

    return None


In [3]:
def table_to_df(table):
    if table is None:
        return None

    df = pd.read_html(str(table))[0]

    # Flatten MultiIndex columns
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            '_'.join([str(c) for c in col if c not in ["", " "]]).strip()
            for col in df.columns.values
        ]

    # Remove "Unnamed" columns
    df = df.loc[:, ~df.columns.str.contains("Unnamed")]

    return df


In [4]:
def get_premier_league_team_stats():
    url = "https://fbref.com/en/comps/9/Premier-League-Stats"
    soup = get_soup(url)

    tables = {
        "standard": "stats_squads_standard_for",
        "xg": "stats_squads_xg_for",
        "shooting": "stats_squads_shooting_for",
        "passing": "stats_squads_passing_for",
        "passing_types": "stats_squads_passing_types_for",
        "defense": "stats_squads_defense_for",
        "possession": "stats_squads_possession_for",
        "keeper": "stats_squads_keeper_for",
        "misc": "stats_squads_misc_for",
    }

    dfs = {}
    for name, table_id in tables.items():
        table = extract_table(soup, table_id)
        df = table_to_df(table)
        dfs[name] = df

    return dfs


In [28]:
dfs = get_premier_league_team_stats()

  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


In [36]:
dfs['standard']

Unnamed: 0,Playing Time_MP,Playing Time_Starts,Playing Time_Min,Playing Time_90s,Performance_Gls,Performance_Ast,Performance_G+A,Performance_G-PK,Performance_PK,Performance_PKatt,...,Per 90 Minutes_Gls,Per 90 Minutes_Ast,Per 90 Minutes_G+A,Per 90 Minutes_G-PK,Per 90 Minutes_G+A-PK,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG
0,12,132,1080,12.0,24,18,42,22,2,2,...,2.0,1.5,3.5,1.83,3.33,1.73,1.1,2.82,1.59,2.69
1,12,132,1080,12.0,15,11,26,15,0,0,...,1.25,0.92,2.17,1.25,2.17,0.89,0.71,1.6,0.89,1.6
2,12,132,1080,12.0,19,12,31,17,2,3,...,1.58,1.0,2.58,1.42,2.42,1.43,0.82,2.25,1.23,2.05
3,12,132,1080,12.0,18,8,26,14,4,6,...,1.5,0.67,2.17,1.17,1.83,1.52,0.86,2.38,1.13,1.99
4,12,132,1080,12.0,19,12,31,17,2,3,...,1.58,1.0,2.58,1.42,2.42,1.48,1.01,2.5,1.29,2.31
5,12,132,1080,12.0,14,11,25,14,0,0,...,1.17,0.92,2.08,1.17,2.08,0.71,0.56,1.27,0.71,1.27
6,12,132,1080,12.0,23,17,40,22,1,1,...,1.92,1.42,3.33,1.83,3.25,1.84,1.48,3.32,1.77,3.25
7,12,132,1080,12.0,15,8,23,13,2,2,...,1.25,0.67,1.92,1.08,1.75,1.69,1.11,2.8,1.56,2.68
8,11,121,990,11.0,12,9,21,11,1,1,...,1.09,0.82,1.91,1.0,1.82,1.34,1.02,2.36,1.27,2.29
9,12,132,1080,12.0,10,7,17,10,0,0,...,0.83,0.58,1.42,0.83,1.42,1.07,0.81,1.88,1.07,1.88


In [34]:
dfs['passing']

Unnamed: 0,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,Expected_xA,Expected_A-xAG
0,5344,6457,82.8,93343,28674,2347,2602,90.2,2511,2818,89.1,396,776,51.0,14.0,4.9
1,4677,5705,82.0,82817,28252,2071,2269,91.3,2095,2365,88.6,420,808,52.0,8.6,2.5
2,4581,5834,78.5,79533,27187,2075,2339,88.7,1972,2280,86.5,416,882,47.2,9.7,2.2
3,3520,4706,74.8,65377,25511,1478,1733,85.3,1536,1848,83.1,415,885,46.9,8.8,-2.3
4,4665,5744,81.2,81630,27850,2137,2367,90.3,2025,2312,87.6,404,765,52.8,9.5,-0.1
5,3529,4665,75.6,60928,22745,1729,1944,88.9,1345,1615,83.3,332,777,42.7,6.8,4.3
6,5810,6886,84.4,99380,30616,2727,2968,91.9,2571,2851,90.2,415,812,51.1,14.5,-0.7
7,3636,4825,75.4,66840,25750,1527,1762,86.7,1658,1944,85.3,387,867,44.6,11.5,-5.4
8,3601,4676,77.0,65972,23831,1548,1762,87.9,1622,1912,84.8,365,767,47.6,10.8,-2.2
9,4693,5821,80.6,83625,28568,2013,2253,89.3,2139,2426,88.2,450,854,52.7,9.3,-2.7


In [8]:
def get_premier_league_matches():
    url = "https://fbref.com/en/comps/9/2023-2024/schedule/2023-2024-Premier-League-Scores-and-Fixtures"
    soup = get_soup(url)

    # FBref table ID for match logs
    table_id = "sched_2023-2024_9_1"

    table = extract_table(soup, table_id)
    if table is None:
        raise ValueError("Match table not found – FBref may have changed structure.")

    df = pd.read_html(str(table))[0]

    # Remove repeated headers
    df = df[df["Date"] != "Date"]

    # Clean numeric columns
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="ignore")

    return df


In [10]:
matches_df = get_premier_league_matches()

403 received — retrying after 20–40 seconds...


HTTPError: 403 Client Error: Forbidden for url: https://fbref.com/en/comps/9/2023-2024/schedule/2023-2024-Premier-League-Scores-and-Fixtures

In [49]:
matches_df.drop(columns=['Attendance', 'Referee', 'Match Report', 'Notes', 'Venue', 'Time', 'Day'], axis=1, inplace=True)

KeyError: "['Attendance', 'Referee', 'Match Report', 'Notes', 'Venue'] not found in axis"

In [88]:
currmatches = pd.read_csv('../data/premier_league_matches_2025_2026.csv')

In [89]:
currmatches["Date"] = pd.to_datetime(currmatches["Date"])

In [90]:
currmatches = currmatches[currmatches["Date"] < pd.Timestamp.today() - pd.Timedelta(days=1)]

In [91]:
currmatches

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away
0,1.0,Fri,2025-08-15,20:00,Liverpool,2.2,4–2,1.7,Bournemouth
1,1.0,Sat,2025-08-16,12:30,Aston Villa,0.2,0–0,1.4,Newcastle Utd
2,1.0,Sat,2025-08-16,15:00,Sunderland,0.7,3–0,0.6,West Ham
3,1.0,Sat,2025-08-16,15:00,Brighton,1.5,1–1,0.7,Fulham
4,1.0,Sat,2025-08-16,15:00,Tottenham,2.3,3–0,0.9,Burnley
...,...,...,...,...,...,...,...,...,...
125,12.0,Sat,2025-11-22,15:00,Liverpool,1.9,0–3,1.6,Nott'ham Forest
126,12.0,Sat,2025-11-22,15:00,Brighton,1.2,2–1,2.0,Brentford
127,12.0,Sat,2025-11-22,17:30,Newcastle Utd,2.3,2–1,2.5,Manchester City
128,12.0,Sun,2025-11-23,14:00,Leeds United,1.6,1–2,1.6,Aston Villa


In [92]:
import pandas as pd

def clean_match_df(df):
    # Split score "4–2" into home_goals, away_goals
    df["home_goals"] = df["Score"].str.extract(r"(\d+)[–-](\d+)")[0].astype(int)
    df["away_goals"] = df["Score"].str.extract(r"(\d+)[–-](\d+)")[1].astype(int)

    # Rename xG columns
    df = df.rename(columns={
        "Home": "home_team",
        "Away": "away_team",
        "xG": "home_xg",
        "xG.1": "away_xg"
    })

    # Create label: win/draw/loss from home perspective
    df["result"] = df.apply(
        lambda row: 
            "W" if row.home_goals > row.away_goals
            else "L" if row.home_goals < row.away_goals
            else "D",
        axis=1
    )

    return df


In [93]:
clean_matches = clean_match_df(currmatches)
clean_matches

Unnamed: 0,Wk,Day,Date,Time,home_team,home_xg,Score,away_xg,away_team,home_goals,away_goals,result
0,1.0,Fri,2025-08-15,20:00,Liverpool,2.2,4–2,1.7,Bournemouth,4,2,W
1,1.0,Sat,2025-08-16,12:30,Aston Villa,0.2,0–0,1.4,Newcastle Utd,0,0,D
2,1.0,Sat,2025-08-16,15:00,Sunderland,0.7,3–0,0.6,West Ham,3,0,W
3,1.0,Sat,2025-08-16,15:00,Brighton,1.5,1–1,0.7,Fulham,1,1,D
4,1.0,Sat,2025-08-16,15:00,Tottenham,2.3,3–0,0.9,Burnley,3,0,W
...,...,...,...,...,...,...,...,...,...,...,...,...
125,12.0,Sat,2025-11-22,15:00,Liverpool,1.9,0–3,1.6,Nott'ham Forest,0,3,L
126,12.0,Sat,2025-11-22,15:00,Brighton,1.2,2–1,2.0,Brentford,2,1,W
127,12.0,Sat,2025-11-22,17:30,Newcastle Utd,2.3,2–1,2.5,Manchester City,2,1,W
128,12.0,Sun,2025-11-23,14:00,Leeds United,1.6,1–2,1.6,Aston Villa,1,2,L


In [94]:
def add_rolling_features(df, window=5):

    df = df.sort_values("Date")

    # Team-based dictionaries for rolling stats
    team_stats = {}

    home_stats = []
    away_stats = []

    for _, row in df.iterrows():
        home = row.home_team
        away = row.away_team

        # Initialize if needed
        for t in [home, away]:
            if t not in team_stats:
                team_stats[t] = {
                    "gf": [], "ga": [], "pts": []
                }

        # Compute home rolling
        home_form = team_stats[home]
        away_form = team_stats[away]

        home_stats.append({
            "home_avg_gf_5": pd.Series(home_form["gf"][-window:]).mean(),
            "home_avg_ga_5": pd.Series(home_form["ga"][-window:]).mean(),
            "home_avg_pts_5": pd.Series(home_form["pts"][-window:]).mean(),
            "home_form_score": sum(home_form["pts"][-window:])
        })

        # Compute away rolling
        away_stats.append({
            "away_avg_gf_5": pd.Series(away_form["gf"][-window:]).mean(),
            "away_avg_ga_5": pd.Series(away_form["ga"][-window:]).mean(),
            "away_avg_pts_5": pd.Series(away_form["pts"][-window:]).mean(),
            "away_form_score": sum(away_form["pts"][-window:])
        })

        # Update team stats after computing features
        home_pts = 3 if row.home_goals > row.away_goals else 1 if row.home_goals == row.away_goals else 0
        away_pts = 3 if row.away_goals > row.home_goals else 1 if row.home_goals == row.away_goals else 0

        home_form["gf"].append(row.home_goals)
        home_form["ga"].append(row.away_goals)
        home_form["pts"].append(home_pts)

        away_form["gf"].append(row.away_goals)
        away_form["ga"].append(row.home_goals)
        away_form["pts"].append(away_pts)

    home_df = pd.DataFrame(home_stats)
    away_df = pd.DataFrame(away_stats)

    return pd.concat([df.reset_index(drop=True), home_df, away_df], axis=1)


In [95]:
currmatches = add_rolling_features(clean_matches, window=5)
currmatches

Unnamed: 0,Wk,Day,Date,Time,home_team,home_xg,Score,away_xg,away_team,home_goals,away_goals,result,home_avg_gf_5,home_avg_ga_5,home_avg_pts_5,home_form_score,away_avg_gf_5,away_avg_ga_5,away_avg_pts_5,away_form_score
0,1.0,Fri,2025-08-15,20:00,Liverpool,2.2,4–2,1.7,Bournemouth,4,2,W,,,,0,,,,0
1,1.0,Sat,2025-08-16,12:30,Aston Villa,0.2,0–0,1.4,Newcastle Utd,0,0,D,,,,0,,,,0
2,1.0,Sat,2025-08-16,15:00,Sunderland,0.7,3–0,0.6,West Ham,3,0,W,,,,0,,,,0
3,1.0,Sat,2025-08-16,15:00,Brighton,1.5,1–1,0.7,Fulham,1,1,D,,,,0,,,,0
4,1.0,Sat,2025-08-16,15:00,Tottenham,2.3,3–0,0.9,Burnley,3,0,W,,,,0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,12.0,Sat,2025-11-22,15:00,Liverpool,1.9,0–3,1.6,Nott'ham Forest,0,3,L,1.2,2.0,0.6,3,1.0,2.0,0.8,4
115,12.0,Sat,2025-11-22,15:00,Brighton,1.2,2–1,2.0,Brentford,2,1,W,1.6,1.2,1.6,8,1.6,1.2,1.8,9
116,12.0,Sat,2025-11-22,17:30,Newcastle Utd,2.3,2–1,2.5,Manchester City,2,1,W,1.4,1.8,1.2,6,1.8,0.4,2.4,12
117,12.0,Sun,2025-11-23,14:00,Leeds United,1.6,1–2,1.6,Aston Villa,1,2,L,0.8,2.2,0.6,3,1.8,0.8,2.4,12


In [96]:
def update_elo(df, base_elo=1500, k=20):

    elos = {}
    home_elos = []
    away_elos = []

    for _, row in df.iterrows():
        h = row.home_team
        a = row.away_team

        if h not in elos: elos[h] = base_elo
        if a not in elos: elos[a] = base_elo

        home_elos.append(elos[h])
        away_elos.append(elos[a])

        # Determine actual score
        if row.home_goals > row.away_goals:
            score_h, score_a = 1, 0
        elif row.home_goals < row.away_goals:
            score_h, score_a = 0, 1
        else:
            score_h, score_a = 0.5, 0.5

        # Expected score
        expected_h = 1 / (1 + 10 ** ((elos[a] - elos[h]) / 400))
        expected_a = 1 - expected_h

        # Update
        elos[h] += k * (score_h - expected_h)
        elos[a] += k * (score_a - expected_a)

    df["home_elo"] = home_elos
    df["away_elo"] = away_elos
    return df


In [97]:
currmatches = update_elo(currmatches)
currmatches

Unnamed: 0,Wk,Day,Date,Time,home_team,home_xg,Score,away_xg,away_team,home_goals,...,home_avg_gf_5,home_avg_ga_5,home_avg_pts_5,home_form_score,away_avg_gf_5,away_avg_ga_5,away_avg_pts_5,away_form_score,home_elo,away_elo
0,1.0,Fri,2025-08-15,20:00,Liverpool,2.2,4–2,1.7,Bournemouth,4,...,,,,0,,,,0,1500.000000,1500.000000
1,1.0,Sat,2025-08-16,12:30,Aston Villa,0.2,0–0,1.4,Newcastle Utd,0,...,,,,0,,,,0,1500.000000,1500.000000
2,1.0,Sat,2025-08-16,15:00,Sunderland,0.7,3–0,0.6,West Ham,3,...,,,,0,,,,0,1500.000000,1500.000000
3,1.0,Sat,2025-08-16,15:00,Brighton,1.5,1–1,0.7,Fulham,1,...,,,,0,,,,0,1500.000000,1500.000000
4,1.0,Sat,2025-08-16,15:00,Tottenham,2.3,3–0,0.9,Burnley,3,...,,,,0,,,,0,1500.000000,1500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,12.0,Sat,2025-11-22,15:00,Liverpool,1.9,0–3,1.6,Nott'ham Forest,0,...,1.2,2.0,0.6,3,1.0,2.0,0.8,4,1505.360738,1465.526210
115,12.0,Sat,2025-11-22,15:00,Brighton,1.2,2–1,2.0,Brentford,2,...,1.6,1.2,1.6,8,1.6,1.2,1.8,9,1510.250649,1502.713825
116,12.0,Sat,2025-11-22,17:30,Newcastle Utd,2.3,2–1,2.5,Manchester City,2,...,1.4,1.8,1.2,6,1.8,0.4,2.4,12,1479.040137,1538.852471
117,12.0,Sun,2025-11-23,14:00,Leeds United,1.6,1–2,1.6,Aston Villa,1,...,0.8,2.2,0.6,3,1.8,0.8,2.4,12,1467.932302,1522.926536


In [98]:
currmatches.drop(columns=['Score', 'Wk', 'Day', 'Time', 'Score', 'home_goals', 'away_goals', ], axis=1, inplace=True)

In [99]:
currmatches = pd.get_dummies(currmatches, columns=["home_team", "away_team"], drop_first=True)

In [100]:
rolling_cols = [
    "home_avg_gf_5", "home_avg_ga_5", "home_avg_pts_5", "home_form_score",
    "away_avg_gf_5", "away_avg_ga_5", "away_avg_pts_5", "away_form_score"
]
currmatches[rolling_cols] = currmatches[rolling_cols].fillna(currmatches[rolling_cols].mean())

In [101]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
currmatches['result'] = le.fit_transform(currmatches['result']) # W=2, D=0, L=1

In [102]:
currmatches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       119 non-null    datetime64[ns]
 1   home_xg                    119 non-null    float64       
 2   away_xg                    119 non-null    float64       
 3   result                     119 non-null    int64         
 4   home_avg_gf_5              119 non-null    float64       
 5   home_avg_ga_5              119 non-null    float64       
 6   home_avg_pts_5             119 non-null    float64       
 7   home_form_score            119 non-null    int64         
 8   away_avg_gf_5              119 non-null    float64       
 9   away_avg_ga_5              119 non-null    float64       
 10  away_avg_pts_5             119 non-null    float64       
 11  away_form_score            119 non-null    int64         
 12  home_elo

In [103]:
currmatches.to_csv('../data/currmatches.csv', index=False)