In [1]:
# Import libraries
import random
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from sklearn.preprocessing import LabelEncoder

In [2]:
# Current-season Premier League overview page on fbref.com (path comps/9)
base_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

# Ten season labels in 'YYYY-YYYY' form, newest first: 2023-2024 back to 2014-2015
seasons = [f"{start_year}-{start_year + 1}" for start_year in range(2023, 2013, -1)]

In [3]:
def scrape_season_data(season, max_retries=3, timeout=30):
    """Fetch one season's Premier League page from fbref and return its first stats table.

    Args:
        season: Season label such as '2022-2023'; it is inserted into the URL path.
        max_retries: Maximum number of request attempts before giving up.
        timeout: Seconds to wait on the server per attempt. Without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        The first ``<table class="stats_table">`` element parsed by BeautifulSoup,
        or None if the page contains no such table or every attempt failed.
    """
    # fbref serves season pages directly under the competition root:
    #   .../comps/9/<season>/<season>-Premier-League-Stats
    # base_url points at the current-season page (".../comps/9/Premier-League-Stats"),
    # so strip that trailing slug before appending the season path.
    # NOTE(review): the previous URL nested the season under the slug; confirm that
    # earlier runs did not receive the same (current-season) table for every season.
    comp_root = base_url.rstrip('/')
    if comp_root.endswith('-Stats'):
        comp_root = comp_root.rsplit('/', 1)[0]
    url = f"{comp_root}/{season}/{season}-Premier-League-Stats"

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an exception for bad status codes
            print(f"Response status code for {season}: {response.status_code}")
            soup = BeautifulSoup(response.content, 'html.parser')
            # First table carrying the 'stats_table' class; None if the markup changed
            return soup.find('table', {'class': 'stats_table'})
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for {season}: {e}")
            if attempt == max_retries:
                break  # no attempts left, so waiting again would only waste time
            sleep_time = random.uniform(5, 10)  # Random sleep between 5 to 10 seconds
            print(f"Retrying in {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)
    print(f"Failed to fetch data for {season} after {max_retries} retries.")
    return None

In [4]:
# Accumulator for the per-season DataFrames (one entry per scraped season)
all_seasons_data = list()

In [6]:
# Pause range (seconds) between consecutive season requests, to keep the scrape
# gentle on the server. NOTE(review): tune to the site's published rate limits.
REQUEST_PAUSE_RANGE = (4, 7)


def _table_to_dataframe(table, season):
    """Convert a parsed stats table into a DataFrame tagged with its season.

    Every cell of a data row is kept, including empty ones, so each value stays
    under its own header. The "Rk" and "Notes" columns are dropped afterwards,
    by name, instead of being removed from the header list up front.

    Args:
        table: BeautifulSoup element for the ``<table>``.
        season: Season label stored in the added 'Season' column.

    Returns:
        A DataFrame with one row per data row of the table, or None when the
        table has no ``<thead>`` or ``<tbody>``.
    """
    thead = table.find('thead')
    tbody = table.find('tbody')
    if thead is None or tbody is None:
        return None

    # If the header has several rows (grouping rows above the real one),
    # the column names are in the last row.
    header_rows = thead.find_all('tr')
    header_row = header_rows[-1] if header_rows else thead
    headers = [th.text.strip() for th in header_row.find_all('th')]

    records = []
    for row in tbody.find_all('tr'):
        if not row.find_all('td'):
            continue  # rows without data cells (separators, repeated headers)
        # Take row-header (<th>) and data (<td>) cells together, in document order,
        # and keep empty strings so positions still match the header row.
        cells = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
        if len(cells) != len(headers):
            print(f"Warning: Column mismatch for season {season}. Row data length: {len(cells)}, Headers length: {len(headers)}")
            continue  # skip the malformed row rather than misalign or crash
        records.append(cells)

    df = pd.DataFrame(records, columns=headers)
    # Remove unnecessary columns
    df = df.drop(columns=[name for name in ("Rk", "Notes") if name in df.columns])
    df['Season'] = season  # Add the 'Season' column
    return df


# Start from an empty list every time this cell runs, so re-executing it
# cannot append the same seasons twice.
all_seasons_data = []

for position, season in enumerate(seasons):
    if position > 0:
        time.sleep(random.uniform(*REQUEST_PAUSE_RANGE))

    table = scrape_season_data(season)
    df = _table_to_dataframe(table, season) if table is not None else None
    if df is None or df.empty:
        print(f"No table found for season: {season}")
        continue

    all_seasons_data.append(df)
    print(f"Dataframe for season {season} created and appended.")


Response status code for 2023-2024: 200
Dataframe for season 2023-2024 created and appended.
Response status code for 2022-2023: 200
Dataframe for season 2022-2023 created and appended.
Response status code for 2021-2022: 200
Dataframe for season 2021-2022 created and appended.
Response status code for 2020-2021: 200
Dataframe for season 2020-2021 created and appended.
Response status code for 2019-2020: 200
Dataframe for season 2019-2020 created and appended.
Response status code for 2018-2019: 200
Dataframe for season 2018-2019 created and appended.
Response status code for 2017-2018: 200
Dataframe for season 2017-2018 created and appended.
Response status code for 2016-2017: 200
Dataframe for season 2016-2017 created and appended.
Response status code for 2015-2016: 200
Dataframe for season 2015-2016 created and appended.
Response status code for 2014-2015: 200
Dataframe for season 2014-2015 created and appended.


In [7]:
# Concatenate all season DataFrames
if not all_seasons_data:
    # pd.concat would raise a ValueError on an empty list anyway; raise the same
    # exception type with a message that points at the actual cause.
    raise ValueError("No season data was scraped; nothing to encode or save.")
all_seasons_df = pd.concat(all_seasons_data, ignore_index=True)

# Encoding
# Fit a single LabelEncoder on the combined 'Squad' column so a club receives the
# same code in every season (a fresh encoder per season would renumber clubs
# whenever the set of teams changes).
le = LabelEncoder()
all_seasons_df['Squad_Encoded'] = le.fit_transform(all_seasons_df['Squad'])

# Ordinal encoding for 'Season': the starting year, e.g. '2022-2023' -> 2022
all_seasons_df['season_ordinal'] = (
    all_seasons_df['Season'].str.split('-').str[0].astype(int)
)

# Per-season views of the encoded data, in the order the seasons appear.
# The encoded columns are added to the combined frame (which is what gets saved)
# rather than by mutating the frames in all_seasons_data.
all_seasons_encoded = [
    season_df.reset_index(drop=True)
    for _, season_df in all_seasons_df.groupby('Season', sort=False)
]

# Save the concatenated DataFrame (now including the encoded columns) to a CSV file
all_seasons_df.to_csv("premierleague.csv", index=False)
print("Data saved to premierleague.csv.")

Data saved to premierleague.csv.
