In [1]:
import warnings

import pandas as pd

import factor 

warnings.filterwarnings(
    "ignore", message="Passing literal html to 'read_html' is deprecated"
)
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
urls = [
    [
        f"https://fbref.com/en/comps/230/2022-2023/{category}/2022-2023-Liga-F-Stats"
        for category in factor.CATEGORIES
    ],
    [
        f"https://fbref.com/en/comps/182/2022/{category}/2022-NWSL-Stats"
        for category in factor.CATEGORIES
    ],
    [
        f"https://fbref.com/en/comps/189/2022-2023/{category}/2022-2023-Womens-Super-League-Stats"
        for category in factor.CATEGORIES
    ],
    [
        f"https://fbref.com/en/comps/189/2021-2022/{category}/2021-2022-Womens-Super-League-Stats"
        for category in factor.CATEGORIES
    ],
]

In [3]:
url_dict = {}
for season, url_list in zip(
    ["2022-2023 Liga", "2022 NWSL", "2022-2023 WSL", "2021-2022 WSL"], urls
):
    url_dict[season] = url_list

In [4]:
# df = factor.scrape_and_process(
#     "https://fbref.com/en/comps/230/2022-2023/passing/2022-2023-Liga-F-Stats",
#     ["Squad", "TotDist", "PrgDist", "Att", "KP"],
#     "2022-2023 Liga",
# )

# factor.tidy_columns(df, "passing")#.reset_index(drop=True)
# # df.info()

In [5]:
dataframes = []
data = []
for season, urls in url_dict.items():
    print("===============================================================")
    for url, col in zip(urls, factor.COLUMNS):
        try:
            # print(f"Scraping data for {season} - {col}")
            df = factor.scrape_and_process(url, factor.COLUMNS[col], season)
            df = factor.tidy_columns(df, col)
            dataframes.append(df)
        except Exception as e:
            print(f"Error scraping data for {season} - {col}")
            print(e)
    data.append(pd.concat(dataframes, axis=1))
    dataframes = []



In [6]:
df_merged = pd.concat(data)
df_merged = df_merged.transpose().drop_duplicates().transpose().reset_index(drop=True)
display(df_merged.head())

Unnamed: 0,SQUAD,SHOOT_SH,SHOOT_DIST,SEASON,PASS_TOTDIST,PASS_PRGDIST,PASS_ATT,PASS_ATT1,PASS_ATT2,PASS_ATT3,...,DA_TKL,DA_TKL1,DA_ATT,DA_BLOCKS,DA_INT,POSS_POSS,MISC_CRDY,MISC_CRDR,MISC_FLS,MISC_RECOV
0,Alavés,286,18.4,2022-2023 Liga,141213,66482,10702,3176,4315,2619,...,491,198,381,322,330,44.6,59,1,311,1575
1,Alhama,304,20.8,2022-2023 Liga,99966,56452,9076,2552,3199,2796,...,746,313,630,420,461,39.6,61,4,383,1782
2,Athletic Club,325,20.4,2022-2023 Liga,167473,71668,12319,3779,5017,2881,...,573,225,500,331,375,47.4,37,1,279,1908
3,Atlético Madrid,387,17.5,2022-2023 Liga,223772,88661,15037,5270,6566,2505,...,506,162,320,304,371,56.3,44,0,282,1807
4,Barcelona,728,15.3,2022-2023 Liga,291975,107567,20582,9281,8124,2144,...,472,159,269,244,280,70.4,31,0,258,1921


In [10]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   SQUAD          52 non-null     object
 1   SHOOT_SH       52 non-null     object
 2   SHOOT_DIST     52 non-null     object
 3   SEASON         52 non-null     object
 4   PASS_TOTDIST   52 non-null     object
 5   PASS_PRGDIST   52 non-null     object
 6   PASS_ATT       52 non-null     object
 7   PASS_ATT1      52 non-null     object
 8   PASS_ATT2      52 non-null     object
 9   PASS_ATT3      52 non-null     object
 10  PASS_KP        52 non-null     object
 11  PT_CRS         52 non-null     object
 12  PT_CK          52 non-null     object
 13  GSC_PASSLIVE   52 non-null     object
 14  GSC_PASSLIVE1  52 non-null     object
 15  GSC_PASSDEAD   52 non-null     object
 16  GSC_PASSDEAD1  52 non-null     object
 17  GSC_TO         52 non-null     object
 18  GSC_TO1        52 non-null     o

In [11]:
df_merged.to_csv("data/pro_leagues.csv")

In [12]:
display(df_merged)

Unnamed: 0,SQUAD,SHOOT_SH,SHOOT_DIST,SEASON,PASS_TOTDIST,PASS_PRGDIST,PASS_ATT,PASS_ATT1,PASS_ATT2,PASS_ATT3,...,DA_TKL,DA_TKL1,DA_ATT,DA_BLOCKS,DA_INT,POSS_POSS,MISC_CRDY,MISC_CRDR,MISC_FLS,MISC_RECOV
0,Alavés,286,18.4,2022-2023 Liga,141213,66482,10702,3176,4315,2619,...,491,198,381,322,330,44.6,59,1,311,1575
1,Alhama,304,20.8,2022-2023 Liga,99966,56452,9076,2552,3199,2796,...,746,313,630,420,461,39.6,61,4,383,1782
2,Athletic Club,325,20.4,2022-2023 Liga,167473,71668,12319,3779,5017,2881,...,573,225,500,331,375,47.4,37,1,279,1908
3,Atlético Madrid,387,17.5,2022-2023 Liga,223772,88661,15037,5270,6566,2505,...,506,162,320,304,371,56.3,44,0,282,1807
4,Barcelona,728,15.3,2022-2023 Liga,291975,107567,20582,9281,8124,2144,...,472,159,269,244,280,70.4,31,0,258,1921
5,Levante,479,17.1,2022-2023 Liga,225390,92602,15561,5639,6647,2508,...,589,233,395,343,323,60.4,34,0,291,1822
6,Levante Planas,301,21.5,2022-2023 Liga,139825,66079,10611,3297,4309,2495,...,579,211,411,308,358,44.1,64,2,361,1707
7,Madrid CFF,501,19.2,2022-2023 Liga,184363,80771,13827,5400,5365,2287,...,589,224,415,330,351,52.4,60,1,346,1946
8,Real Betis,288,20.2,2022-2023 Liga,143745,63925,11382,3890,4364,2353,...,586,283,517,295,359,46.3,51,2,307,1792
9,Real Madrid,460,17.3,2022-2023 Liga,247980,90907,16558,6102,7100,2593,...,572,212,388,323,358,61.2,35,0,283,1828
