In [6]:
import warnings

import pandas as pd

import factor 

warnings.filterwarnings(
    "ignore", message="Passing literal html to 'read_html' is deprecated"
)
pd.options.mode.chained_assignment = None  # default='warn'

In [7]:
urls = [
    [
        f"https://fbref.com/en/comps/230/2022-2023/{category}/2022-2023-Liga-F-Stats"
        for category in factor.CATEGORIES
    ],
    [
        f"https://fbref.com/en/comps/182/2022/{category}/2022-NWSL-Stats"
        for category in factor.CATEGORIES
    ],
    [
        f"https://fbref.com/en/comps/189/2022-2023/{category}/2022-2023-Womens-Super-League-Stats"
        for category in factor.CATEGORIES
    ],
    [
        f"https://fbref.com/en/comps/189/2021-2022/{category}/2021-2022-Womens-Super-League-Stats"
        for category in factor.CATEGORIES
    ],
]

In [8]:
url_dict = {}
for season, url_list in zip(
    ["2022-2023 Liga", "2022 NWSL", "2022-2023 WSL", "2021-2022 WSL"], urls
):
    url_dict[season] = url_list

In [9]:
# df = factor.scrape_and_process(
#     "https://fbref.com/en/comps/230/2022-2023/passing/2022-2023-Liga-F-Stats",
#     ["Squad", "TotDist", "PrgDist", "Att", "KP"],
#     "2022-2023 Liga",
# )

# factor.tidy_columns(df, "passing")#.reset_index(drop=True)
# # df.info()

In [10]:
dataframes = []
data = []
for season, urls in url_dict.items():
    print("===============================================================")
    for url, col in zip(urls, factor.COLUMNS):
        try:
            # print(f"Scraping data for {season} - {col}")
            df = factor.scrape_and_process(url, factor.COLUMNS[col], season)
            df = factor.tidy_columns(df, col)
            dataframes.append(df)
        except Exception as e:
            print(f"Error scraping data for {season} - {col}")
            print(e)
    data.append(pd.concat(dataframes, axis=1))
    dataframes = []

Scraping data for 2022-2023 Liga - shooting
https://fbref.com/en/comps/230/2022-2023/shooting/2022-2023-Liga-F-Stats ['Squad', 'Sh', 'Dist']
Scraping data for 2022-2023 Liga - passing
https://fbref.com/en/comps/230/2022-2023/passing/2022-2023-Liga-F-Stats ['Squad', 'TotDist', 'PrgDist', 'Att', 'KP']
Scraping data for 2022-2023 Liga - pass_types
https://fbref.com/en/comps/230/2022-2023/passing_types/2022-2023-Liga-F-Stats ['Squad', 'Crs', 'CK']
Scraping data for 2022-2023 Liga - goal_and_shot_creation
https://fbref.com/en/comps/230/2022-2023/gca/2022-2023-Liga-F-Stats ['Squad', 'PassLive', 'PassDead', 'TO', 'Sh', 'Fld', 'Def']
Scraping data for 2022-2023 Liga - defensive_actions
https://fbref.com/en/comps/230/2022-2023/defense/2022-2023-Liga-F-Stats ['Squad', 'Tkl', 'Att', 'Blocks', 'Int']
Scraping data for 2022-2023 Liga - possession
https://fbref.com/en/comps/230/2022-2023/possession/2022-2023-Liga-F-Stats ['Squad', 'Poss']
Scraping data for 2022-2023 Liga - miscellaneous
https://fbre

In [11]:
df_merged = pd.concat(data)
df_merged = df_merged.transpose().drop_duplicates().transpose().reset_index(drop=True)
display(df_merged.head())

Unnamed: 0,SQUAD,SHOOT_SH,SHOOT_DIST,SEASON,PASS_TOTDIST,PASS_PRGDIST,PASS_ATT,PASS_ATT1,PASS_ATT2,PASS_ATT3,...,DA_TKL,DA_TKL1,DA_ATT,DA_BLOCKS,DA_INT,POSS_POSS,MISC_CRDY,MISC_CRDR,MISC_FLS,MISC_RECOV
0,Alavés,286,18.4,2022-2023 Liga,141213,66482,10702,3176,4315,2619,...,491,198,381,322,330,44.6,59,1,311,1575
1,Alhama,304,20.8,2022-2023 Liga,99966,56452,9076,2552,3199,2796,...,746,313,630,420,461,39.6,61,4,383,1782
2,Athletic Club,325,20.4,2022-2023 Liga,167473,71668,12319,3779,5017,2881,...,573,225,500,331,375,47.4,37,1,279,1908
3,Atlético Madrid,387,17.5,2022-2023 Liga,223772,88661,15037,5270,6566,2505,...,506,162,320,304,371,56.3,44,0,282,1807
4,Barcelona,728,15.3,2022-2023 Liga,291975,107567,20582,9281,8124,2144,...,472,159,269,244,280,70.4,31,0,258,1921


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   SQUAD       12 non-null     object
 1   MISC_CRDY   12 non-null     int64 
 2   MISC_CRDR   12 non-null     int64 
 3   MISC_FLS    12 non-null     int64 
 4   MISC_RECOV  12 non-null     int64 
 5   SEASON      12 non-null     object
dtypes: int64(4), object(2)
memory usage: 708.0+ bytes


In [13]:
df.to_csv("data/pro_leagues.csv")

In [14]:
display(df)

Unnamed: 0,SQUAD,MISC_CRDY,MISC_CRDR,MISC_FLS,MISC_RECOV,SEASON
0,Arsenal,27,1,160,1100,2021-2022 WSL
1,Aston Villa,21,0,206,1018,2021-2022 WSL
2,Birmingham City,19,1,189,978,2021-2022 WSL
3,Brighton,24,0,176,1073,2021-2022 WSL
4,Chelsea,14,1,163,1125,2021-2022 WSL
5,Everton,17,0,190,1149,2021-2022 WSL
6,Leicester City,27,1,224,998,2021-2022 WSL
7,Manchester City,13,1,160,1093,2021-2022 WSL
8,Manchester Utd,20,0,200,1104,2021-2022 WSL
9,Reading,19,0,208,947,2021-2022 WSL
