In [11]:
import warnings


import pandas as pd

import factor


warnings.filterwarnings(
    "ignore", message="Passing literal html to 'read_html' is deprecated"
)

In [12]:
# One URL per entry of factor.CATEGORIES, for each of the two competition pages.
# NOTE(review): the first pattern carries no year in its path — presumably it
# resolves to whichever edition the site treats as current; confirm it still
# matches the season label it is paired with later.
undated_urls = [
    f"https://fbref.com/en/comps/106/{category}/Womens-World-Cup-Stats"
    for category in factor.CATEGORIES
]
dated_2019_urls = [
    f"https://fbref.com/en/comps/106/2019/{category}/2019-Womens-World-Cup-Stats"
    for category in factor.CATEGORIES
]

# Order matters: the sub-lists are later paired with season labels by position.
urls = [undated_urls, dated_2019_urls]

In [13]:
# Map each season label to its list of category URLs. Labels are paired with
# the sub-lists of `urls` by position, so the two orderings must agree.
url_dict = dict(zip(["2023 WWC", "2019 WWC"], urls))

In [14]:
# Scrape every stat category for each season, then join that season's category
# frames side by side (axis=1). `data` ends up with one wide frame per season.
dataframes = []
data = []
# The inner list is bound to `season_urls` rather than `urls` so the nested
# `urls` list defined in the earlier cell is not overwritten; otherwise
# re-running the cell that builds `url_dict` would pair season labels with
# individual URL strings.
for season, season_urls in url_dict.items():
    print("===============================================================")
    # URLs are matched by position with the keys yielded by factor.COLUMNS.
    for url, col in zip(season_urls, factor.COLUMNS):
        try:
            df = factor.scrape_and_process(url, factor.COLUMNS[col], season)
            df = factor.tidy_columns(df, col)
            dataframes.append(df)
        except Exception as e:
            # Best effort: report the failing category and keep going.
            print(f"Error scraping data for {season} - {col}")
            print(e)
    if dataframes:
        data.append(pd.concat(dataframes, axis=1))
    else:
        # pd.concat raises "No objects to concatenate" on an empty list;
        # report the real cause instead and move on to the next season.
        print(f"No data scraped for {season}; skipping this season")
    # Start the next season with an empty accumulator.
    dataframes = []



In [15]:
# Stack the per-season frames on top of each other.
df_merged = pd.concat(data)

# Drop every column whose values repeat an earlier column, keeping the first
# occurrence. The duplicate mask is computed on the transpose but applied to
# the original frame: transposing a mixed-type frame there and back would cast
# every column to `object` dtype. `.to_numpy()` makes the mask positional, so
# repeated column labels cannot interfere with label alignment.
is_repeated_column = df_merged.transpose().duplicated().to_numpy()
df_merged = df_merged.loc[:, ~is_repeated_column].reset_index(drop=True)

display(df_merged.head())

Unnamed: 0,SQUAD,SHOOT_SH,SHOOT_DIST,SEASON,PASS_TOTDIST,PASS_PRGDIST,PASS_ATT,PASS_ATT1,PASS_ATT2,PASS_ATT3,...,DA_TKL,DA_TKL1,DA_ATT,DA_BLOCKS,DA_INT,POSS_POSS,MISC_CRDY,MISC_CRDR,MISC_FLS,MISC_RECOV
0,ar Argentina,24,21.8,2023 WWC,14741,6386,1284,577,431,190,...,73,47,94,38,42,50.0,6,0,32,204
1,au Australia,97,16.9,2023 WWC,38125,16311,3123,1254,1163,509,...,122,66,127,77,48,49.9,5,0,54,447
2,br Brazil,61,17.5,2023 WWC,23036,8223,1719,653,777,214,...,62,20,43,49,23,63.3,1,0,20,247
3,ca Canada,43,17.2,2023 WWC,21262,7750,1622,659,671,213,...,70,24,52,45,42,62.0,4,0,17,220
4,cn China PR,22,21.6,2023 WWC,12812,5548,1217,540,389,174,...,64,35,61,40,28,41.3,0,1,24,217


In [16]:
# Persist the merged frame. `index` is left at its default, so the row index
# is written as the first column of the file.
df_merged.to_csv(path_or_buf="data/world_cups.csv")

In [17]:
# Bare last expression: renders the merged frame as this cell's output.
df_merged

Unnamed: 0,SQUAD,SHOOT_SH,SHOOT_DIST,SEASON,PASS_TOTDIST,PASS_PRGDIST,PASS_ATT,PASS_ATT1,PASS_ATT2,PASS_ATT3,...,DA_TKL,DA_TKL1,DA_ATT,DA_BLOCKS,DA_INT,POSS_POSS,MISC_CRDY,MISC_CRDR,MISC_FLS,MISC_RECOV
0,ar Argentina,24,21.8,2023 WWC,14741,6386,1284,577,431,190,...,73,47,94,38,42,50.0,6,0,32,204
1,au Australia,97,16.9,2023 WWC,38125,16311,3123,1254,1163,509,...,122,66,127,77,48,49.9,5,0,54,447
2,br Brazil,61,17.5,2023 WWC,23036,8223,1719,653,777,214,...,62,20,43,49,23,63.3,1,0,20,247
3,ca Canada,43,17.2,2023 WWC,21262,7750,1622,659,671,213,...,70,24,52,45,42,62.0,4,0,17,220
4,cn China PR,22,21.6,2023 WWC,12812,5548,1217,540,389,174,...,64,35,61,40,28,41.3,0,1,24,217
5,co Colombia,62,20.3,2023 WWC,23138,10630,2007,861,638,371,...,133,71,131,76,51,48.2,7,0,50,378
6,cr Costa Rica,23,14.5,2023 WWC,13574,5200,1075,370,446,198,...,41,21,71,58,32,40.0,4,0,34,161
7,dk Denmark,40,17.8,2023 WWC,22768,8548,1899,726,788,258,...,90,47,102,62,41,50.5,3,0,41,286
8,eng England,95,15.2,2023 WWC,60988,22148,4321,1761,1740,591,...,152,64,133,104,67,62.0,6,1,70,471
9,fr France,93,16.8,2023 WWC,37374,14134,2808,1104,1061,458,...,130,65,110,84,53,63.8,4,0,67,406
