## Imports

In [1]:
import warnings
import pandas as pd
import factor

warnings.filterwarnings(
    "ignore", message="Passing literal html to 'read_html' is deprecated"
)
pd.options.mode.chained_assignment = None  # default='warn'

## Provide Links to Scrape

In [2]:
# One list of fbref stat-page URLs per league season; each list has one URL
# per entry of factor.CATEGORIES, in that order.
liga_f_2022_23_urls = [
    f"https://fbref.com/en/comps/230/2022-2023/{category}/2022-2023-Liga-F-Stats"
    for category in factor.CATEGORIES
]
nwsl_2022_urls = [
    f"https://fbref.com/en/comps/182/2022/{category}/2022-NWSL-Stats"
    for category in factor.CATEGORIES
]
wsl_2022_23_urls = [
    f"https://fbref.com/en/comps/189/2022-2023/{category}/2022-2023-Womens-Super-League-Stats"
    for category in factor.CATEGORIES
]
wsl_2021_22_urls = [
    f"https://fbref.com/en/comps/189/2021-2022/{category}/2021-2022-Womens-Super-League-Stats"
    for category in factor.CATEGORIES
]

# The order here must line up with the season labels used when building url_dict.
urls = [
    liga_f_2022_23_urls,
    nwsl_2022_urls,
    wsl_2022_23_urls,
    wsl_2021_22_urls,
]

In [3]:
# Map each season label to its list of category URLs (paired by position with `urls`).
season_labels = ["2022-2023 Liga", "2022 NWSL", "2022-2023 WSL", "2021-2022 WSL"]
url_dict = dict(zip(season_labels, urls))

## Perform Scraping and Tidying

In [4]:
# Scrape every stat category for every season.
# `data` collects one wide DataFrame per season: the column-wise concatenation
# of that season's per-category frames.
dataframes = []
data = []
# The loop variable is intentionally NOT named `urls`: reusing that name would
# overwrite the nested list built in the links cell, so re-running the
# url_dict cell afterwards would silently produce a corrupt mapping.
for season, season_urls in url_dict.items():
    print("===============================================================")
    # Start each season with an empty accumulator so frames never leak across seasons.
    dataframes = []
    # URLs and column groups are paired purely by position -- assumes
    # factor.CATEGORIES and factor.COLUMNS are in the same order (TODO confirm).
    for url, col in zip(season_urls, factor.COLUMNS):
        try:
            print(f"Scraping data for {season} - {col}")
            df = factor.scrape_and_process(url, factor.COLUMNS[col], season)
            df = factor.tidy_columns(df, col)
            dataframes.append(df)
        except Exception as e:
            # Best-effort scraping: report the failure and continue with the next category.
            print(f"Error scraping data for {season} - {col}")
            print(e)
    if dataframes:
        data.append(pd.concat(dataframes, axis=1))
    else:
        # pd.concat raises ValueError on an empty list; skip the season instead
        # of aborting the whole run after every category failed.
        print(f"No data scraped for {season}; skipping season")

Scraping data for 2022-2023 Liga - shooting
Scraping data for 2022-2023 Liga - passing
Scraping data for 2022-2023 Liga - pass_types
Scraping data for 2022-2023 Liga - goal_and_shot_creation
Scraping data for 2022-2023 Liga - defensive_actions
Scraping data for 2022-2023 Liga - possession
Scraping data for 2022-2023 Liga - miscellaneous
Scraping data for 2022 NWSL - shooting
Scraping data for 2022 NWSL - passing
Scraping data for 2022 NWSL - pass_types
Scraping data for 2022 NWSL - goal_and_shot_creation
Scraping data for 2022 NWSL - defensive_actions
Scraping data for 2022 NWSL - possession
Scraping data for 2022 NWSL - miscellaneous
Scraping data for 2022-2023 WSL - shooting
Scraping data for 2022-2023 WSL - passing
Scraping data for 2022-2023 WSL - pass_types
Scraping data for 2022-2023 WSL - goal_and_shot_creation
Scraping data for 2022-2023 WSL - defensive_actions
Scraping data for 2022-2023 WSL - possession
Scraping data for 2022-2023 WSL - miscellaneous
Scraping data for 2021-20

## Join All Seasons Data

In [5]:
# Stack the per-season tables row-wise into a single frame.
df_merged = pd.concat(data)
# Remove columns whose values duplicate an earlier column (presumably the
# identifier columns repeated by each category table -- TODO confirm) by
# de-duplicating the rows of the transposed frame.
# Transposing a frame that mixes text and numbers leaves every column as
# `object`, so infer_objects() restores proper numeric dtypes afterwards.
df_merged = (
    df_merged.transpose()
    .drop_duplicates()
    .transpose()
    .infer_objects()
    .reset_index(drop=True)
)

In [6]:
# df_merged.info()

In [8]:
# Show the first two merged rows as a quick visual check.
preview_rows = df_merged.head(n=2)
display(preview_rows)

Unnamed: 0,SQUAD,SHOOT_SH,SHOOT_DIST,SEASON,PASS_TOTDIST,PASS_PRGDIST,PASS_ATT,PASS_ATT1,PASS_ATT2,PASS_ATT3,...,DA_TKL,DA_TKL1,DA_ATT,DA_BLOCKS,DA_INT,POSS_POSS,MISC_CRDY,MISC_CRDR,MISC_FLS,MISC_RECOV
0,Alavés,286,18.4,2022-2023 Liga,141213,66482,10702,3176,4315,2619,...,491,198,381,322,330,44.6,59,1,311,1575
1,Alhama,304,20.8,2022-2023 Liga,99966,56452,9076,2552,3199,2796,...,746,313,630,420,461,39.6,61,4,383,1782


In [9]:
# Number of rows in the merged table.
len(df_merged.index)

52

In [None]:
# Write the merged table to disk; the row index is written too (to_csv default).
output_csv = "data/pro_leagues.csv"
df_merged.to_csv(output_csv)