In [81]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [82]:
# Scrape the player list from worldfootball.net page by page and collect one
# record per player in players_dt.
base_url = "https://www.worldfootball.net/players_list/eng-premier-league-2024-2025/nach-groesse/"

# Collected rows, each of the form [player name, team, date of birth, height, position]
players_dt = []

# The listing is paginated; pages 1 to 13 are requested here
for page_num in range(1, 14):
    url = f"{base_url}{page_num}/"

    try:
        # A timeout stops a stalled connection from hanging the notebook indefinitely
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Network-level failures are reported like HTTP failures and the page is skipped
        print(f'Failed to retrieve {page_num}: {exc}')
        continue

    if response.status_code != 200:     # anything other than 200 is treated as a failed page
        print(f'Failed to retrieve {page_num}')
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # find() returns None when no matching table exists; guard so the loop can carry on
    table = soup.find("table", class_="standard_tabelle")
    if table is None:
        print(f'No player table found on page {page_num}')
        continue

    # The first <tr> is skipped (treated as the header row)
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")

        # Rows with fewer than six cells cannot hold a full player record
        if len(cols) < 6:
            continue

        # Cell 1 is not used; cells 0 and 2-5 hold the fields kept below
        player_name = cols[0].text.strip()
        team = cols[2].text.strip()
        dob = cols[3].text.strip()
        height = cols[4].text.strip()
        position = cols[5].text.strip()

        players_dt.append([player_name, team, dob, height, position])

In [83]:
# Turn the scraped rows held in the players_dt list into a DataFrame,
# naming one column per scraped field
players_df = pd.DataFrame(
    data=players_dt,
    columns=['Player Name', 'Team', 'DOB', 'Height', 'Position'],
)

# Summarise the column dtypes and non-null counts
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604 entries, 0 to 603
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Player Name  604 non-null    object
 1   Team         604 non-null    object
 2   DOB          604 non-null    object
 3   Height       604 non-null    object
 4   Position     604 non-null    object
dtypes: object(5)
memory usage: 23.7+ KB


In [87]:
# Preview the last five rows of the scraped table
players_df.tail(5)

Unnamed: 0,Player Name,Team,DOB,Height,Position
599,Luke Rawlings,Wolverhampton Wanderers,00/00/0000,???,MF
600,Jay Robinson,Southampton FC,15/03/2007,???,FW
601,Triston Rowe,Aston Villa,18/05/2006,???,DF
602,Martin Sherif,Everton FC,10/06/2006,???,FW
603,Tony Yogane,Brentford FC,24/09/2005,???,MF


Some values in the Height column hold the placeholder `???` instead of a measurement, so this inconsistency needs to be handled before the column can be converted to numbers.

In [88]:
# Pick out every row whose Height is the '???' placeholder
missing_height_rows = players_df.loc[players_df['Height'].eq('???')]
missing_height_rows

Unnamed: 0,Player Name,Team,DOB,Height,Position
585,Romeo Akachukwu,Southampton FC,28/07/2006,???,MF
586,Aidan Borland,Aston Villa,25/04/2007,???,MF
587,Kaden Braithwaite,Manchester City,25/03/2008,???,DF
588,Ben Broggio,Aston Villa,29/01/2007,???,MF
589,Justin Devenny,Crystal Palace,11/10/2003,???,MF
590,Roman Dixon,Everton FC,26/12/2004,???,DF
591,Tyrese Hall,Tottenham Hotspur,04/09/2005,???,MF
592,Joshua King,Fulham FC,03/01/2007,???,MF
593,Caleb Kporha,Crystal Palace,15/07/2006,???,DF
594,William Lankshear,Tottenham Hotspur,20/04/2005,???,FW


In [90]:
# Convert the Height column from strings such as "205 cm" to whole-number values.

# Strip the ' cm' suffix, then parse to numbers. errors='coerce' turns the '???'
# placeholder (and any other non-numeric text) into NaN, so no separate
# replacement step is needed.
height_numeric = pd.to_numeric(
    players_df['Height'].str.replace(' cm', '', regex=False),
    errors='coerce',
)

# Missing heights are filled with the median of the known heights
median_height = height_numeric.median()

# Assign the result back to the column instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
# round() is applied before the integer cast so a fractional median is rounded
# rather than truncated.
players_df['Height'] = height_numeric.fillna(median_height).round().astype(int)

In [91]:
# Preview the first five rows after the Height clean-up
players_df.head(5)

Unnamed: 0,Player Name,Team,DOB,Height,Position
0,Lucas Bergström,Chelsea FC,05/09/2002,205,GK
1,Carlos Miguel,Nottingham Forest,09/10/1998,204,GK
2,Dan Burn,Newcastle United,09/05/1992,201,DF
3,Fraser Forster,Tottenham Hotspur,17/03/1988,201,GK
4,Paul Onuachu,Southampton FC,28/05/1994,201,FW


In [92]:
# Put the unit in the column header so it is clear the heights are in centimetres (cm)
players_df = players_df.rename({"Height": "Height (cm)"}, axis="columns")

In [93]:
# Parse the day-first date strings in 'DOB' into datetime values.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
players_df['DOB'] = pd.to_datetime(
    players_df['DOB'],
    errors='coerce',
    dayfirst=True,
)

In [96]:
# Preview the first five rows of the cleaned table
players_df.head(n=5)

Unnamed: 0,Player Name,Team,DOB,Height (cm),Position
0,Lucas Bergström,Chelsea FC,2002-09-05,205,GK
1,Carlos Miguel,Nottingham Forest,1998-10-09,204,GK
2,Dan Burn,Newcastle United,1992-05-09,201,DF
3,Fraser Forster,Tottenham Hotspur,1988-03-17,201,GK
4,Paul Onuachu,Southampton FC,1994-05-28,201,FW


In [97]:
# Write the cleaned table to a CSV file, leaving out the row index
players_df.to_csv(path_or_buf='epl_players_data.csv', index=False)