In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment

url = 'https://fbref.com/en/comps/40/playingtime/Scottish-Premiership-Stats'
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        " AppleWebKit/537.36 (KHTML, like Gecko)"
        " Chrome/115.0.0.0 Safari/537.36"
    )
}

# 1. Get the raw HTML with a browser‐like UA
res = requests.get(url, headers=headers)
res.raise_for_status()

# 2. Parse and pull out all HTML comments
soup = BeautifulSoup(res.text, 'lxml')
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

# 3. Extract our table from the commented snippets
table_html = None
for c in comments:
    if 'id="stats_playing_time"' in c:
        table_soup = BeautifulSoup(c, 'lxml')
        table = table_soup.find('table', id='stats_playing_time')
        table_html = str(table)
        break

if not table_html:
    raise ValueError("Couldn't find the stats_playing_time table in the page comments")

# 4. Read into pandas, capturing both header rows
df = pd.read_html(table_html, header=[0, 1])[0]

# 5. Flatten the MultiIndex cleanly
df.columns = [
    sec if str(sec).strip() else fst
    for fst, sec in df.columns
]

print(df.head(5))


HTTPError: 403 Client Error: Forbidden for url: https://fbref.com/en/comps/40/playingtime/Scottish-Premiership-Stats

In [3]:
df.columns = [col[1].strip() if isinstance(col, tuple) else col for col in df.columns]
df.columns

Index(['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', 'MP',
       'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt',
       'CrdY', 'CrdR', 'xG', 'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR',
       'Gls', 'Ast', 'G+A', 'G-PK', 'G+A-PK', 'xG', 'xAG', 'xG+xAG', 'npxG',
       'npxG+xAG', 'Matches'],
      dtype='object')

In [None]:
# Remove duplicates and filter rows
df = df.loc[:, ~df.columns.duplicated()]
df_filtered = df[df['Player'] != 'Player']

# Clean and convert columns
df_filtered['Min'] = pd.to_numeric(df_filtered['Min'], errors='coerce')
df_filtered = df_filtered[df_filtered['Min'] > 0.0]
#df_filtered = df_filtered[df_filtered['Pos'] != 'FW']


# Select and sort
df_filtered = df_filtered[['Player','Min']]
df_filtered = df_filtered.sort_values(by='Min', ascending=False)

# Print max xG and show data
df_filtered.head(20)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Min'] = pd.to_numeric(df_filtered['Min'], errors='coerce')


Unnamed: 0,Player,xG,xAG,Min
2264,Raphinha,19.2,12.7,2839
2114,Cole Palmer,17.3,10.9,3191
1102,Mason Greenwood,16.3,5.7,2804
1356,Arnaud Kalimuendo,14.9,2.0,2578
2242,Christian Pulisic,12.0,6.4,2478
1466,Andrej Kramarić,11.8,4.3,2767
2754,Deniz Undav,11.7,3.6,1726
2239,Javi Puado,11.6,3.0,2974
299,Jude Bellingham,11.4,3.9,2488
871,Breel Embolo,10.8,5.6,1841
