In [None]:
import re
import requests
import warnings

import pandas as pd
from bs4 import BeautifulSoup
warnings.filterwarnings("ignore", message="Passing literal html to 'read_html' is deprecated")

In [None]:
def scrape_and_process(url, columns, season):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    table_data = pd.read_html(str(table))[0]
    table_data.columns = table_data.columns.droplevel([0])
    selected_data = table_data[columns]
    selected_data['SEASON'] = season
    return selected_data

In [None]:
columns = {
    "shooting": ["Squad", "Sh", "Dist"],
    "passing": ["Squad", "TotDist", "PrgDist", "Att", "KP"],
    "pass_types": ["Squad", "Crs", "CK"],
    "goal_and_shot_creation": ["Squad", "PassLive", "PassDead", "TO", "Sh", "Fld", "Def"],
    "defensive_actions": ["Squad", "Tkl", "Att", "Blocks", "Int"],
    "possession": ["Squad", "Poss"],
    "miscellaneous": ["Squad", "CrdY", "CrdR", "Fls", "Recov"]
}

In [None]:
categories = ["shooting", "passing", "passing_types", "gca", "defense", "possession", "misc"]
urls = [
    [f"https://fbref.com/en/comps/230/2022-2023/{category}/2022-2023-Liga-F-Stats" for category in categories],
    [f"https://fbref.com/en/comps/182/2022/{category}/2022-NWSL-Stats" for category in categories],
    [f"https://fbref.com/en/comps/189/2022-2023/{category}/2022-2023-Womens-Super-League-Stats" for category in categories],
    [f"https://fbref.com/en/comps/189/2021-2022/{category}/2021-2022-Womens-Super-League-Stats" for category in categories]
]

In [None]:
url_dict = {}
for season, url_list in zip(["2022-2023 Liga", "2022 NWSL", "2022-2023 WSL", "2021-2022 WSL"], urls):
    url_dict[season] = url_list

In [None]:
dataframes= []
for season, urls in url_dict.items():
    print("===============================================================")
    for url, category in zip(urls, columns):
        try:
            print(f"Scraping data for {season} - {category}")
            print(url, columns[category])
            dataframes.append(scrape_and_process(url, columns[category], season)) 
        except Exception as e:
            print(f"Error scraping data for {season} - {category}")
            print(e)
        

In [87]:
df_merged = pd.concat(dataframes, keys=["Shooting", "Passing", "Pass Types", "Goal and Shot Creation", "Defensive Actions", "Possession", "Miscellaneous"], axis=1)
df = df_merged.transpose().drop_duplicates().transpose()
simple_columns = {
    "Shooting": "SHOOT",
    "Passing": "PASS",
    "Pass Types": "PT",
    "Goal and Shot Creation": "GSC",
    "Defensive Actions": "DA",
    "Possession": "POSS",
    "Miscellaneous": "MISC"
}

# new_level_0 = [simple_columns.get(col, col) for col in df.columns.levels[0]]
# df.columns = df.columns.set_levels(new_level_0, level=0)
# new_columns = [f"{level1}_{level0}" for level0, level1 in df.columns]
# df.columns = new_columns
# df.columns = df.columns.str.upper()
# df.rename(columns={'SQUAD_SHOOT': 'SQUAD'}, inplace=True)

  df_merged = pd.concat(dataframes, keys=["Shooting", "Passing", "Pass Types", "Goal and Shot Creation", "Defensive Actions", "Possession", "Miscellaneous"], axis=1)


In [88]:
df

Unnamed: 0_level_0,Shooting,Shooting,Shooting,Shooting,Passing,Passing,Passing,Passing,Passing,Passing,...,Defensive Actions,Defensive Actions,Defensive Actions,Defensive Actions,Defensive Actions,Possession,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous
Unnamed: 0_level_1,Squad,Sh,Dist,SEASON,TotDist,PrgDist,Att,Att.1,Att.2,Att.3,...,Tkl,Tkl.1,Att,Blocks,Int,Poss,CrdY,CrdR,Fls,Recov
0,Alavés,286,18.4,2022-2023 Liga,141213,66482,10702,3176,4315,2619,...,491,198,381,322,330,44.6,59,1,311,1575
1,Alhama,304,20.8,2022-2023 Liga,99966,56452,9076,2552,3199,2796,...,746,313,630,420,461,39.6,61,4,383,1782
2,Athletic Club,325,20.4,2022-2023 Liga,167473,71668,12319,3779,5017,2881,...,573,225,500,331,375,47.4,37,1,279,1908
3,Atlético Madrid,387,17.5,2022-2023 Liga,223772,88661,15037,5270,6566,2505,...,506,162,320,304,371,56.3,44,0,282,1807
4,Barcelona,728,15.3,2022-2023 Liga,291975,107567,20582,9281,8124,2144,...,472,159,269,244,280,70.4,31,0,258,1921
5,Levante,479,17.1,2022-2023 Liga,225390,92602,15561,5639,6647,2508,...,589,233,395,343,323,60.4,34,0,291,1822
6,Levante Planas,301,21.5,2022-2023 Liga,139825,66079,10611,3297,4309,2495,...,579,211,411,308,358,44.1,64,2,361,1707
7,Madrid CFF,501,19.2,2022-2023 Liga,184363,80771,13827,5400,5365,2287,...,589,224,415,330,351,52.4,60,1,346,1946
8,Real Betis,288,20.2,2022-2023 Liga,143745,63925,11382,3890,4364,2353,...,586,283,517,295,359,46.3,51,2,307,1792
9,Real Madrid,460,17.3,2022-2023 Liga,247980,90907,16558,6102,7100,2593,...,572,212,388,323,358,61.2,35,0,283,1828


In [None]:
df.to_csv("pro_leagues.csv")

In [None]:
display(df)