In [None]:
import re
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Ignore any warning whose message matches the text below (the pandas
# deprecation notice about passing literal HTML to ``read_html``).
warnings.filterwarnings(
    "ignore", message="Passing literal html to 'read_html' is deprecated"
)
# Switch off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Short prefix put in front of every stat column of a table, keyed by the
# same table names used in COLUMNS.
RENAME_COLUMNS = dict(
    shooting="SHOOT",
    passing="PASS",
    pass_types="PT",
    goal_and_shot_creation="GSC",
    defensive_actions="DA",
    possession="POSS",
    miscellaneous="MISC",
)

# Column labels to keep from each scraped table.
# NOTE: key order matters -- the scraping loop pairs these keys positionally
# with the URLs generated from CATEGORIES, so both must stay in the same order.
COLUMNS = {
    "shooting": ["Squad", "Sh", "Dist"],
    "passing": ["Squad", "TotDist", "PrgDist", "Att", "KP"],
    "pass_types": ["Squad", "Crs", "CK"],
    "goal_and_shot_creation": [
        "Squad", "PassLive", "PassDead", "TO", "Sh", "Fld", "Def",
    ],
    "defensive_actions": ["Squad", "Tkl", "Att", "Blocks", "Int"],
    "possession": ["Squad", "Poss"],
    "miscellaneous": ["Squad", "CrdY", "CrdR", "Fls", "Recov"],
}

# URL path segment for each table, listed in the same order as COLUMNS' keys.
CATEGORIES = [
    "shooting", "passing", "passing_types", "gca",
    "defense", "possession", "misc",
]

In [None]:
def scrape_and_process(url, columns, season, timeout=30):
    """Download a page, parse its first HTML table and keep selected columns.

    Parameters
    ----------
    url : str
        Page to fetch.
    columns : list of str
        Column labels to keep. Labels are matched after the top header level
        has been dropped, so a label that appears more than once in the table
        selects every matching column.
    season : str
        Value written into the added ``SEASON`` column of every row.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        A new frame holding ``columns`` plus a trailing ``SEASON`` column.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a non-2xx HTTP status.
    ValueError
        If the page contains no ``<table>`` element.
    KeyError
        If any requested column is missing from the parsed table.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them as data.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table")
    if table is None:
        raise ValueError(f"No <table> element found at {url}")

    # read_html expects a file-like object; passing literal HTML is deprecated.
    table_data = pd.read_html(StringIO(str(table)))[0]

    # Drop the top header level only when the header actually has two levels.
    if isinstance(table_data.columns, pd.MultiIndex):
        table_data.columns = table_data.columns.droplevel(0)

    # assign() returns a fresh frame, so no write goes through a slice.
    return table_data[columns].assign(SEASON=season)

In [None]:
def tidy_columns(df, col):
    """Normalise the column labels of ``df`` in place and return it.

    The row index is reset, every column label is upper-cased, and each label
    other than ``SQUAD`` / ``SEASON`` is prefixed with ``RENAME_COLUMNS[col]``
    followed by an underscore.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to tidy; it is modified in place.
    col : str
        Key into ``RENAME_COLUMNS`` selecting the prefix.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    df.reset_index(drop=True, inplace=True)
    df.columns = df.columns.str.upper()

    def _with_prefix(label):
        # Identifier columns keep their plain upper-cased name.
        if label in ("SQUAD", "SEASON"):
            return label
        return f"{RENAME_COLUMNS[col]}_{label}"

    df.rename(columns=_with_prefix, inplace=True)
    return df

In [None]:
# df = scrape_and_process("https://fbref.com/en/comps/230/2022-2023/shooting/2022-2023-Liga-F-Stats", COLUMNS["shooting"], "2022-2023 Liga")
# tidy_columns(df, "shooting")

In [None]:
# One inner list per league-season; each inner list holds one URL per entry
# of CATEGORIES, in CATEGORIES order.
urls = [
    [template.format(category=category) for category in CATEGORIES]
    for template in (
        "https://fbref.com/en/comps/230/2022-2023/{category}/2022-2023-Liga-F-Stats",
        "https://fbref.com/en/comps/182/2022/{category}/2022-NWSL-Stats",
        "https://fbref.com/en/comps/189/2022-2023/{category}/2022-2023-Womens-Super-League-Stats",
        "https://fbref.com/en/comps/189/2021-2022/{category}/2021-2022-Womens-Super-League-Stats",
    )
]

In [None]:
# Season label -> that season's list of URLs; labels are paired with the
# entries of `urls` by position.
url_dict = dict(
    zip(["2022-2023 Liga", "2022 NWSL", "2022-2023 WSL", "2021-2022 WSL"], urls)
)

In [None]:
# Scrape every (season, category) page and build one wide frame per season.
data = []
# The inner list is named `season_urls` so the module-level `urls` list is not
# overwritten; re-running the cell that builds `url_dict` therefore stays safe.
for season, season_urls in url_dict.items():
    print("===============================================================")
    dataframes = []  # per-season accumulator, reset before each season
    # Iterating COLUMNS yields its keys in insertion order, which pairs them
    # positionally with this season's URLs.
    for url, col in zip(season_urls, COLUMNS):
        try:
            print(f"Scraping data for {season} - {col}")
            print(url, COLUMNS[col])
            df = scrape_and_process(url, COLUMNS[col], season)
            df = tidy_columns(df, col)
            dataframes.append(df)
        except Exception as e:
            # Best-effort: report the failure and carry on with the next table.
            print(f"Error scraping data for {season} - {col}")
            print(e)
    if not dataframes:
        # pd.concat raises on an empty list; skip a season that produced nothing.
        print(f"No data scraped for {season} - skipping")
        continue
    # Place this season's category tables side by side.
    data.append(pd.concat(dataframes, axis=1))

In [None]:
# Stack the per-season frames, then drop columns that duplicate an earlier
# column: transpose so drop_duplicates compares columns as rows, transpose
# back, and renumber the rows.
df_merged = (
    pd.concat(data)
    .T.drop_duplicates()
    .T.reset_index(drop=True)
)
display(df_merged.head())

In [None]:
# Save the merged table covering all seasons; `df` only holds the last
# single-category frame left over from the scraping loop.
df_merged.to_csv("data/pro_leagues.csv")

In [None]:
# Show the merged table for all seasons rather than the last scraped frame.
display(df_merged)