In [1]:
import re
import requests
import warnings

import pandas as pd
from bs4 import BeautifulSoup
warnings.filterwarnings("ignore", message="Passing literal html to 'read_html' is deprecated")


In [2]:
def scrape_and_process(url, columns):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    table_data = pd.read_html(str(table))[0]
    table_data.columns = table_data.columns.droplevel([0])
    selected_data = table_data[columns]
    return selected_data

In [3]:
columns = {
    "shooting": ["Squad", "Sh", "Dist"],
    "passing": ["Squad", "TotDist", "PrgDist", "Att", "KP"],
    "pass_types": ["Squad", "Crs", "CK"],
    "goal_and_shot_creation": ["Squad", "PassLive", "PassDead", "TO", "Sh", "Fld", "Def"],
    "defensive_actions": ["Squad", "Tkl", "Att", "Blocks", "Int"],
    "possession": ["Squad", "Poss"],
    "miscellaneous": ["Squad", "CrdY", "CrdR", "Fls", "Recov"]
}

In [4]:
categories = ["shooting", "passing", "passing_types", "gca", "defense", "possession", "misc"]
urls = [
    [f"https://fbref.com/en/comps/106/{category}/Womens-World-Cup-Stats" for category in categories],
    [f"https://fbref.com/en/comps/106/2019/{category}/2019-Womens-World-Cup-Stats" for category in categories],
]

In [9]:
dataframes = []
for season in urls[:1]:
    for category_link, category_columns in zip(season, columns):
        print(f"{category_link} {category_columns}")
        try:
            dataframes.append(scrape_and_process(category_link, columns[category_columns]))
        except Exception as e:
            print("DID NOT WORK")
            print(e)

https://fbref.com/en/comps/106/shooting/Womens-World-Cup-Stats shooting


https://fbref.com/en/comps/106/passing/Womens-World-Cup-Stats passing
https://fbref.com/en/comps/106/passing_types/Womens-World-Cup-Stats pass_types
https://fbref.com/en/comps/106/gca/Womens-World-Cup-Stats goal_and_shot_creation
https://fbref.com/en/comps/106/defense/Womens-World-Cup-Stats defensive_actions
https://fbref.com/en/comps/106/possession/Womens-World-Cup-Stats possession
https://fbref.com/en/comps/106/misc/Womens-World-Cup-Stats miscellaneous


In [10]:
for season in urls[1:]:
    for category_link, category_columns in zip(season, columns):
        print(f"{category_link} {category_columns}")
        try:
            dataframes.append(scrape_and_process(category_link, columns[category_columns]))
        except Exception as e:
            print("DID NOT WORK")
            print(e)

https://fbref.com/en/comps/106/2019/shooting/2019-Womens-World-Cup-Stats shooting
https://fbref.com/en/comps/106/2019/passing/2019-Womens-World-Cup-Stats passing
https://fbref.com/en/comps/106/2019/passing_types/2019-Womens-World-Cup-Stats pass_types
https://fbref.com/en/comps/106/2019/gca/2019-Womens-World-Cup-Stats goal_and_shot_creation
https://fbref.com/en/comps/106/2019/defense/2019-Womens-World-Cup-Stats defensive_actions
https://fbref.com/en/comps/106/2019/possession/2019-Womens-World-Cup-Stats possession
https://fbref.com/en/comps/106/2019/misc/2019-Womens-World-Cup-Stats miscellaneous


In [11]:
df_merged = pd.concat(dataframes, keys=["Shooting", "Passing", "Pass Types", "Goal and Shot Creation", "Defensive Actions", "Possession", "Miscellaneous"], axis=1)
df = df_merged.transpose().drop_duplicates().transpose()

  df_merged = pd.concat(dataframes, keys=["Shooting", "Passing", "Pass Types", "Goal and Shot Creation", "Defensive Actions", "Possession", "Miscellaneous"], axis=1)


In [12]:
simple_columns = {
    "Shooting": "SHOOT",
    "Passing": "PASS",
    "Pass Types": "PT",
    "Goal and Shot Creation": "GSC",
    "Defensive Actions": "DA",
    "Possession": "POSS",
    "Miscellaneous": "MISC"
}

In [13]:
new_level_0 = [simple_columns.get(col, col) for col in df.columns.levels[0]]
df.columns = df.columns.set_levels(new_level_0, level=0)
new_columns = [f"{level1}_{level0}" for level0, level1 in df.columns]
df.columns = new_columns
df.columns = df.columns.str.upper()
df.rename(columns={'SQUAD_SHOOT': 'SQUAD'}, inplace=True)

In [14]:
df.to_csv("world_cups.csv")

In [15]:
display(df)

Unnamed: 0,SQUAD,SH_SHOOT,DIST_SHOOT,TOTDIST_PASS,PRGDIST_PASS,ATT_PASS,ATT_PASS.1,ATT_PASS.2,ATT_PASS.3,KP_PASS,...,TKL_DA,TKL_DA.1,ATT_DA,BLOCKS_DA,INT_DA,POSS_POSS,CRDY_MISC,CRDR_MISC,FLS_MISC,RECOV_MISC
0,ar Argentina,24,21.8,14741,6386,1284,577,431,190,16,...,73,47,94,38,42,50.0,6,0,32,204
1,au Australia,97,16.9,38125,16311,3123,1254,1163,509,59,...,122,66,127,77,48,49.9,5,0,54,447
2,br Brazil,61,17.5,23036,8223,1719,653,777,214,41,...,62,20,43,49,23,63.3,1,0,20,247
3,ca Canada,43,17.2,21262,7750,1622,659,671,213,33,...,70,24,52,45,42,62.0,4,0,17,220
4,cn China PR,22,21.6,12812,5548,1217,540,389,174,11,...,64,35,61,40,28,41.3,0,1,24,217
5,co Colombia,62,20.3,23138,10630,2007,861,638,371,43,...,133,71,131,76,51,48.2,7,0,50,378
6,cr Costa Rica,23,14.5,13574,5200,1075,370,446,198,18,...,41,21,71,58,32,40.0,4,0,34,161
7,dk Denmark,40,17.8,22768,8548,1899,726,788,258,26,...,90,47,102,62,41,50.5,3,0,41,286
8,eng England,95,15.2,60988,22148,4321,1761,1740,591,70,...,152,64,133,104,67,62.0,6,1,70,471
9,fr France,93,16.8,37374,14134,2808,1104,1061,458,66,...,130,65,110,84,53,63.8,4,0,67,406
