In [1]:
import re
import requests
import warnings

import pandas as pd
from bs4 import BeautifulSoup
warnings.filterwarnings("ignore", message="Passing literal html to 'read_html' is deprecated")

In [2]:
def scrape_and_process(url, columns):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    table_data = pd.read_html(str(table))[0]
    table_data.columns = table_data.columns.droplevel([0])
    selected_data = table_data[columns]
    return selected_data

In [3]:
columns = {
    "shooting": ["Squad", "Sh", "Dist"],
    "passing": ["Squad", "TotDist", "PrgDist", "Att", "KP"],
    "pass_types": ["Squad", "Crs", "CK"],
    "goal_and_shot_creation": ["Squad", "PassLive", "PassDead", "TO", "Sh", "Fld", "Def"],
    "defensive_actions": ["Squad", "Tkl", "Att", "Blocks", "Int"],
    "possession": ["Squad", "Poss"],
    "miscellaneous": ["Squad", "CrdY", "CrdR", "Fls", "Recov"]
}

In [4]:
categories = ["shooting", "passing", "passing_types", "gca", "defense", "possession", "misc"]
urls = [
    [f"https://fbref.com/en/comps/230/2022-2023/{category}/2022-2023-Liga-F-Stats" for category in categories],
    [f"https://fbref.com/en/comps/182/2023/{category}/2023-NWSL-Stats" for category in categories],
    [f"https://fbref.com/en/comps/182/2022/{category}/2022-NWSL-Stats" for category in categories],
    [f"https://fbref.com/en/comps/189/2022-2023/{category}/2022-2023-Womens-Super-League-Stats" for category in categories],
    [f"https://fbref.com/en/comps/189/2021-2022/{category}/2021-2022-Womens-Super-League-Stats" for category in categories]
]

In [5]:
dataframes = []
for season in urls[:3]:
    for category_link, category_columns in zip(season, columns):
        print(f"{category_link} {category_columns}")
        try:
            dataframes.append(scrape_and_process(category_link, columns[category_columns]))
        except Exception as e:
            print("DID NOT WORK")
            print(e)

https://fbref.com/en/comps/230/2022-2023/shooting/2022-2023-Liga-F-Stats shooting


https://fbref.com/en/comps/230/2022-2023/passing/2022-2023-Liga-F-Stats passing
https://fbref.com/en/comps/230/2022-2023/passing_types/2022-2023-Liga-F-Stats pass_types
https://fbref.com/en/comps/230/2022-2023/gca/2022-2023-Liga-F-Stats goal_and_shot_creation
https://fbref.com/en/comps/230/2022-2023/defense/2022-2023-Liga-F-Stats defensive_actions
https://fbref.com/en/comps/230/2022-2023/possession/2022-2023-Liga-F-Stats possession
https://fbref.com/en/comps/230/2022-2023/misc/2022-2023-Liga-F-Stats miscellaneous
https://fbref.com/en/comps/182/2023/shooting/2023-NWSL-Stats shooting
https://fbref.com/en/comps/182/2023/passing/2023-NWSL-Stats passing
https://fbref.com/en/comps/182/2023/passing_types/2023-NWSL-Stats pass_types
https://fbref.com/en/comps/182/2023/gca/2023-NWSL-Stats goal_and_shot_creation
https://fbref.com/en/comps/182/2023/defense/2023-NWSL-Stats defensive_actions
https://fbref.com/en/comps/182/2023/possession/2023-NWSL-Stats possession
https://fbref.com/en/comps/182/2023

In [6]:
for season in urls[3:]:
    for category_link, category_columns in zip(season, columns):
        print(f"{category_link} {category_columns}")
        try:
            dataframes.append(scrape_and_process(category_link, columns[category_columns]))
        except Exception as e:
            print("DID NOT WORK")
            print(e)

https://fbref.com/en/comps/189/2022-2023/shooting/2022-2023-Womens-Super-League-Stats shooting
https://fbref.com/en/comps/189/2022-2023/passing/2022-2023-Womens-Super-League-Stats passing
https://fbref.com/en/comps/189/2022-2023/passing_types/2022-2023-Womens-Super-League-Stats pass_types
https://fbref.com/en/comps/189/2022-2023/gca/2022-2023-Womens-Super-League-Stats goal_and_shot_creation
https://fbref.com/en/comps/189/2022-2023/defense/2022-2023-Womens-Super-League-Stats defensive_actions
https://fbref.com/en/comps/189/2022-2023/possession/2022-2023-Womens-Super-League-Stats possession
https://fbref.com/en/comps/189/2022-2023/misc/2022-2023-Womens-Super-League-Stats miscellaneous
https://fbref.com/en/comps/189/2021-2022/shooting/2021-2022-Womens-Super-League-Stats shooting
https://fbref.com/en/comps/189/2021-2022/passing/2021-2022-Womens-Super-League-Stats passing
https://fbref.com/en/comps/189/2021-2022/passing_types/2021-2022-Womens-Super-League-Stats pass_types
https://fbref.com/

In [8]:
df_merged = pd.concat(dataframes, keys=["Shooting", "Passing", "Pass Types", "Goal and Shot Creation", "Defensive Actions", "Possession", "Miscellaneous"], axis=1)
df = df_merged.transpose().drop_duplicates().transpose()
simple_columns = {
    "Shooting": "SHOOT",
    "Passing": "PASS",
    "Pass Types": "PT",
    "Goal and Shot Creation": "GSC",
    "Defensive Actions": "DA",
    "Possession": "POSS",
    "Miscellaneous": "MISC"
}

new_level_0 = [simple_columns.get(col, col) for col in df.columns.levels[0]]
df.columns = df.columns.set_levels(new_level_0, level=0)
new_columns = [f"{level1}_{level0}" for level0, level1 in df.columns]
df.columns = new_columns
df.columns = df.columns.str.upper()
df.rename(columns={'SQUAD_SHOOT': 'SQUAD'}, inplace=True)

  df_merged = pd.concat(dataframes, keys=["Shooting", "Passing", "Pass Types", "Goal and Shot Creation", "Defensive Actions", "Possession", "Miscellaneous"], axis=1)


In [10]:
df.to_csv("pro_leagues.csv")

In [9]:
display(df)

Unnamed: 0,SQUAD,SH_SHOOT,DIST_SHOOT,TOTDIST_PASS,PRGDIST_PASS,ATT_PASS,ATT_PASS.1,ATT_PASS.2,ATT_PASS.3,KP_PASS,...,TKL_DA,TKL_DA.1,ATT_DA,BLOCKS_DA,INT_DA,POSS_POSS,CRDY_MISC,CRDR_MISC,FLS_MISC,RECOV_MISC
0,Alavés,286,18.3,140944,66028,10694,3170,4308,2614,186,...,490,198,380,321,328,44.6,59,1,311,1575
1,Alhama,304,20.8,99993,56481,9075,2552,3198,2795,183,...,746,312,629,420,461,39.6,61,4,383,1782
2,Athletic Club,325,20.4,167498,71664,12317,3776,5016,2882,201,...,573,225,500,330,375,47.4,37,1,279,1908
3,Atlético Madrid,387,17.5,223757,88648,15037,5271,6565,2505,276,...,506,162,320,304,371,56.3,44,0,282,1808
4,Barcelona,728,15.3,291968,107567,20580,9280,8124,2144,561,...,473,159,269,244,280,70.4,31,0,258,1921
5,Levante,479,17.1,225390,92602,15561,5639,6647,2508,344,...,589,233,395,343,323,60.4,34,0,291,1822
6,Levante Planas,301,21.5,139805,65995,10611,3297,4309,2494,193,...,579,211,411,308,358,44.1,64,2,361,1707
7,Madrid CFF,501,19.2,184392,80754,13822,5397,5363,2287,358,...,589,224,415,330,351,52.4,60,1,346,1943
8,Real Betis,288,20.2,143748,63925,11383,3890,4364,2353,200,...,586,283,517,295,359,46.3,51,2,306,1792
9,Real Madrid,460,17.3,247980,90907,16558,6102,7100,2593,335,...,572,212,388,323,358,61.2,35,0,283,1828
