In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment

# Scrape the 2023-2024 Premier League "standard stats" player table from FBref
# and write it to a CSV file in the current working directory.

# Imported here so this cell is self-contained; StringIO lets pandas parse the
# table markup as a file-like object (newer pandas deprecates literal HTML strings).
from io import StringIO

# URL and headers
url = "https://fbref.com/en/comps/9/2023-2024/stats/2023-2024-Premier-League-Stats"
headers = {"User-Agent": "Mozilla/5.0"}

# Get page. Fail loudly on an HTTP error (blocked / rate-limited request) instead
# of parsing an error page, and never hang forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Extract commented HTML tables: scan every HTML comment that mentions the table
# id and keep the first one that actually contains the <table> element.
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
table = None
for comment in comments:
    if "stats_standard" in comment:
        table_soup = BeautifulSoup(comment, "html.parser")
        table = table_soup.find("table", {"id": "stats_standard"})
        if table is not None:
            break

# Without this guard a missing table surfaces as a NameError, or worse, a stale
# `table_soup` left over from an earlier kernel run is silently reused.
if table is None:
    raise RuntimeError("Could not find standard stats table in page comments")

# Read the actual table
df = pd.read_html(StringIO(str(table)))[0]

# Flatten MultiIndex columns if present. The leaf label of the first column is
# remembered first: a header row repeated inside the table body carries only the
# leaf text, so it can never equal the joined "<upper level> <leaf>" name.
if isinstance(df.columns, pd.MultiIndex):
    first_col_label = df.columns.get_level_values(-1)[0]
    df.columns = [' '.join(col).strip() for col in df.columns.values]
else:
    first_col_label = df.columns[0]

# Drop any repeated header rows, then renumber the remaining rows from 0.
# Positional access (iloc) stays a Series even if column names are not unique.
is_header_row = df.iloc[:, 0].astype(str) == str(first_col_label)
df = df[~is_header_row].reset_index(drop=True)

# Save to CSV
df.to_csv("premier_league_per90_2023_2024.csv", index=False)

print("✅ CSV saved as 'premier_league_per90_2023_2024.csv'")


✅ CSV saved as 'premier_league_per90_2023_2024.csv'


In [10]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment

# Scrape the 2023-2024 Premier League defensive-actions player table from FBref,
# keep only the players named in `target_players`, and write them to a CSV file
# ordered like that list.

# Imported here so this cell is self-contained; StringIO lets pandas parse the
# table markup as a file-like object (newer pandas deprecates literal HTML strings).
from io import StringIO

# List of target midfielders
target_players = [
    "Bruno Guimarães", "Declan Rice", "Tyrick Mitchell", "Morgan Gibbs-White", "Conor Gallagher",
    "Bruno Fernandes", "Pascal Groß", "Martin Ødegaard", "Nélson Semedo", "Vitaly Janelt",
    "Sander Berge", "James Garner", "James Ward-Prowse", "John McGinn", "Douglas Luiz",
    "Mario Lemina", "Rodri", "Gustavo Hamer", "Ryan Christie", "Dwight McNeil",
    "Tomáš Souček", "Moisés Caicedo", "Carlton Morris", "Phil Foden", "Lewis Cook",
    "Dejan Kulusevski", "Sean Longstaff", "João Palhinha", "Vinicius Souza", "João Gomes",
    "Julián Álvarez", "Kai Havertz", "Abdoulaye Doucouré", "Lucas Paquetá", "Ross Barkley"
]

# URL of the defensive stats page
url = "https://fbref.com/en/comps/9/2023-2024/defense/2023-2024-Premier-League-Stats"
headers = {"User-Agent": "Mozilla/5.0"}

# Fetch the page. Fail loudly on an HTTP error (blocked / rate-limited request)
# instead of parsing an error page, and never hang on a stalled connection.
resp = requests.get(url, headers=headers, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")

# FBref hides tables in HTML comments—find them all
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

# Locate the comment that contains our defense table
table_html = None
for c in comments:
    if "id=\"stats_defense\"" in c:
        table_html = c
        break

if table_html is None:
    raise RuntimeError("Could not find defense table in page comments")

# Parse that commented-out HTML
table_soup = BeautifulSoup(table_html, "html.parser")
table = table_soup.find("table", id="stats_defense")
if table is None:
    # The id string was present but no matching <table> element could be parsed.
    raise RuntimeError("Could not parse defense table from page comments")

# Read into pandas (this will create a MultiIndex columns)
df = pd.read_html(StringIO(str(table)))[0]

# Flatten the MultiIndex column names if present
if isinstance(df.columns, pd.MultiIndex):
    # take the second level (the actual stat names)
    # NOTE(review): dropping the top level can leave duplicate column names when
    # the same stat name occurs under more than one group — confirm downstream
    # readers of the CSV can cope with that.
    df.columns = df.columns.get_level_values(1)

# Now drop any repeated header rows that snuck in as data. The explicit copy
# makes the column assignment below operate on an independent frame rather than
# on a filtered slice (which triggers SettingWithCopyWarning).
df = df[df["Player"] != "Player"].copy()

# Strip whitespace
df["Player"] = df["Player"].str.strip()

# Filter down to your list
df_sel = df[df["Player"].isin(target_players)].copy()

# Surface requested names that matched nothing (e.g. a spelling or accent
# mismatch) instead of silently omitting them from the output.
found_players = set(df_sel["Player"])
missing_players = [name for name in target_players if name not in found_players]
if missing_players:
    print("Warning: no rows found for:", ", ".join(missing_players))

# (Optional) reorder rows to match your target list
df_sel["Player"] = pd.Categorical(df_sel["Player"], categories=target_players, ordered=True)
df_sel = df_sel.sort_values("Player")

# Save to CSV
df_sel.to_csv("defensive_stats_selected_players.csv", index=False)
print("Written defensive_stats_selected_players.csv with", len(df_sel), "rows")


Written defensive_stats_selected_players.csv with 35 rows
