## Imports

In [1]:
import warnings
from pathlib import Path

import pandas as pd

import factor

warnings.filterwarnings(
    "ignore", message="Passing literal html to 'read_html' is deprecated"
)
pd.options.mode.chained_assignment = None  # default='warn'

## Provide Links to Scrape

In [2]:
# One inner list per league season, in the order of the templates below.
# Each template's "{category}" slot is filled with every entry of
# factor.CATEGORIES to produce that season's stat-page URLs.
urls = [
    [template.format(category=category) for category in factor.CATEGORIES]
    for template in (
        "https://fbref.com/en/comps/230/2022-2023/{category}/2022-2023-Liga-F-Stats",
        "https://fbref.com/en/comps/182/2022/{category}/2022-NWSL-Stats",
        "https://fbref.com/en/comps/182/2023/{category}/2023-NWSL-Stats",
        "https://fbref.com/en/comps/189/2022-2023/{category}/2022-2023-Womens-Super-League-Stats",
        "https://fbref.com/en/comps/189/2021-2022/{category}/2021-2022-Womens-Super-League-Stats",
    )
]

In [3]:
# Map each season label to its list of URLs; labels and `urls` are paired
# positionally, so the label order must match the order `urls` was built in.
url_dict = dict(
    zip(
        ["2022-2023 Liga", "2022 NWSL", "2023 NWSL", "2022-2023 WSL", "2021-2022 WSL"],
        urls,
    )
)

## Perform Scraping and Tidying

In [4]:
# Scrape every category page for every season and collect one wide frame
# per season in `data` (category frames are placed side by side, axis=1).
data = []
# The loop variable is `season_urls`, not `urls`, so it does not rebind the
# `urls` list built earlier; re-running the `url_dict` cell stays correct.
for season, season_urls in url_dict.items():
    print("===============================================================")
    # Fresh list for each season so frames never leak between seasons.
    dataframes = []
    # URLs are paired positionally with the keys of factor.COLUMNS
    # (assumes factor.CATEGORIES and factor.COLUMNS share the same order — TODO confirm).
    for url, col in zip(season_urls, factor.COLUMNS):
        try:
            print(f"Scraping data for {season} - {col}")
            df = factor.scrape_and_process(url, factor.COLUMNS[col], season)
            df = factor.tidy_columns(df, col)
            dataframes.append(df)
        except Exception as e:
            # Best effort: report the failure and carry on with the next category.
            print(f"Error scraping data for {season} - {col}")
            print(e)
    if dataframes:
        data.append(pd.concat(dataframes, axis=1))
    else:
        # pd.concat raises ValueError on an empty list; skip the season instead.
        print(f"No data scraped for {season}; skipping")

Scraping data for 2022-2023 Liga - shooting
Scraping data for 2022-2023 Liga - passing
Scraping data for 2022-2023 Liga - pass_types
Scraping data for 2022-2023 Liga - goal_and_shot_creation
Scraping data for 2022-2023 Liga - defensive_actions
Scraping data for 2022-2023 Liga - possession
Scraping data for 2022-2023 Liga - miscellaneous
Scraping data for 2022 NWSL - shooting
Scraping data for 2022 NWSL - passing
Scraping data for 2022 NWSL - pass_types
Scraping data for 2022 NWSL - goal_and_shot_creation
Scraping data for 2022 NWSL - defensive_actions
Scraping data for 2022 NWSL - possession
Scraping data for 2022 NWSL - miscellaneous
Scraping data for 2023 NWSL - shooting
Scraping data for 2023 NWSL - passing
Scraping data for 2023 NWSL - pass_types
Scraping data for 2023 NWSL - goal_and_shot_creation
Scraping data for 2023 NWSL - defensive_actions
Scraping data for 2023 NWSL - possession
Scraping data for 2023 NWSL - miscellaneous
Scraping data for 2022-2023 WSL - shooting
Scraping d

## Join All Seasons Data

In [5]:
# Stack the per-season frames on top of each other.
df_merged = pd.concat(data)
# Drop columns whose values repeat an earlier column (compared by content via
# the transpose; presumably the identifier columns each category table
# carries — TODO confirm). Transposing a mixed-dtype frame turns every column
# into object dtype, so infer_objects() restores numeric dtypes afterwards.
df_merged = (
    df_merged.transpose()
    .drop_duplicates()
    .transpose()
    .reset_index(drop=True)
    .infer_objects()
)

In [6]:
# df_merged.info()

In [11]:
# Show only the rows whose SEASON is "2023 NWSL".
display(df_merged[df_merged["SEASON"] == "2023 NWSL"])

Unnamed: 0,SQUAD,SHOOT_SH,SHOOT_DIST,SEASON,PASS_TOTDIST,PASS_PRGDIST,PASS_ATT,PASS_ATT1,PASS_ATT2,PASS_ATT3,...,DA_TKL,DA_TKL1,DA_ATT,DA_BLOCKS,DA_INT,POSS_POSS,MISC_CRDY,MISC_CRDR,MISC_FLS,MISC_RECOV
28,Angel City,305,15.9,2023 NWSL,123133,54137,9769,3648,4071,1429,...,413,184,380,247,191,51.2,30,0,234,1499
29,Courage,248,17.0,2023 NWSL,176039,59416,12443,4901,5485,1496,...,388,146,314,253,213,59.6,23,2,207,1402
30,Current,251,17.0,2023 NWSL,127025,52179,9745,3874,3854,1495,...,351,155,357,280,224,49.3,27,0,207,1454
31,Dash,272,18.1,2023 NWSL,117374,47373,8901,3146,3545,1718,...,364,158,336,267,226,47.2,27,3,236,1375
32,Gotham FC,308,18.4,2023 NWSL,137937,54067,10487,3969,4336,1602,...,403,171,366,297,225,54.1,38,0,239,1531
33,Louisville,315,18.3,2023 NWSL,121556,52336,9305,3711,3674,1471,...,357,157,368,310,245,47.6,34,2,239,1504
34,Pride,287,17.0,2023 NWSL,119319,49294,9014,3504,3569,1422,...,453,204,420,280,199,46.4,25,1,233,1403
35,Red Stars,211,16.2,2023 NWSL,128092,53806,9727,3955,3868,1414,...,374,173,355,286,223,48.0,25,0,230,1368
36,Reign,259,17.7,2023 NWSL,136295,55311,10106,3682,4165,1726,...,399,182,398,273,254,48.5,32,1,229,1449
37,Spirit,276,17.1,2023 NWSL,110875,46724,8386,2777,3599,1560,...,399,175,386,288,235,46.6,31,2,259,1427


In [8]:
# Number of rows in the merged frame.
df_merged.shape[0]

64

In [9]:
# to_csv does not create missing folders, so make sure data/ exists first.
Path("data").mkdir(parents=True, exist_ok=True)
# The index is written as the first (unnamed) column, as before.
df_merged.to_csv("data/pro_leagues.csv")