In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Base URL for Transfermarkt
base_url = 'https://www.transfermarkt.com'

# URL for Premier League overview page for the 2023/2024 season
league_url = f'{base_url}/premier-league/startseite/wettbewerb/GB1/saison_id/2023'

# Headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}

# Request the league page. The timeout stops the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors here
# instead of letting an error page be parsed as if it were the league table.
response = requests.get(league_url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the teams table; fail with a clear message if the page layout is not
# what this scraper expects (find() returns None when nothing matches).
teams_table = soup.find('table', class_='items')
if teams_table is None:
    raise RuntimeError(f"No <table class='items'> found on {league_url}")

# Collect (team name, team page URL) pairs from the data rows of the table
team_links = []
for row in teams_table.find_all('tr', {'class': ['odd', 'even']}):
    # Check the cell before looking for the link inside it: a row without
    # this cell would otherwise raise AttributeError on None.
    team_cell = row.find('td', class_='hauptlink no-border-links')
    team_tag = team_cell.find('a') if team_cell else None
    if team_tag and team_tag.get('href'):
        team_name = team_tag.text.strip()
        team_url = base_url + team_tag['href']
        team_links.append((team_name, team_url))

team_links

[('Manchester City',
  'https://www.transfermarkt.com/manchester-city/startseite/verein/281/saison_id/2023'),
 ('Arsenal FC',
  'https://www.transfermarkt.com/fc-arsenal/startseite/verein/11/saison_id/2023'),
 ('Chelsea FC',
  'https://www.transfermarkt.com/fc-chelsea/startseite/verein/631/saison_id/2023'),
 ('Liverpool FC',
  'https://www.transfermarkt.com/fc-liverpool/startseite/verein/31/saison_id/2023'),
 ('Tottenham Hotspur',
  'https://www.transfermarkt.com/tottenham-hotspur/startseite/verein/148/saison_id/2023'),
 ('Manchester United',
  'https://www.transfermarkt.com/manchester-united/startseite/verein/985/saison_id/2023'),
 ('Aston Villa',
  'https://www.transfermarkt.com/aston-villa/startseite/verein/405/saison_id/2023'),
 ('Newcastle United',
  'https://www.transfermarkt.com/newcastle-united/startseite/verein/762/saison_id/2023'),
 ('Brighton & Hove Albion',
  'https://www.transfermarkt.com/brighton-amp-hove-albion/startseite/verein/1237/saison_id/2023'),
 ('Nottingham Fores

In [8]:
# Seconds to wait for any single HTTP response before giving up
REQUEST_TIMEOUT = 30


def _fetch_soup(url):
    """Download ``url`` and return it parsed with BeautifulSoup.

    Returns None (after printing a short notice) when the request times out,
    fails at the connection level, or comes back with an HTTP error status,
    so that one bad page does not abort the whole scrape.
    """
    try:
        page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as exc:
        print(f"  Skipping {url}: {exc}")
        return None
    return BeautifulSoup(page.content, 'html.parser')


def _cell_text(cell, default='N/A'):
    """Return the stripped text of a table cell, or ``default`` if the cell is missing."""
    return cell.text.strip() if cell else default


# Container for storing injury data (re-created on every run of this cell, so
# re-executing it does not append duplicates to an earlier result)
injury_data = []

# Loop over each team
for team_name, team_url in team_links:
    print(f"Processing team: {team_name}")

    # Fetch the team squad page
    team_soup = _fetch_soup(team_url)
    if team_soup is None:
        # Still pause before moving on to the next team
        time.sleep(2)
        continue

    # Find all player rows
    for player_row in team_soup.select('tr.odd, tr.even'):
        # Extract player name and profile link. Rows without a profile link
        # are skipped: the injury URL is derived from the '/profil/' path
        # segment, so nothing useful can be fetched for them.
        name_tag = player_row.select_one('td.hauptlink a')
        href = name_tag.get('href', '').strip() if name_tag else ''
        if '/profil/' not in href:
            continue
        player_name = name_tag.text.strip()
        player_profile_url = base_url + href

        # Extract player position: text of the last <td> nested inside the
        # 'posrela' cell; 'N/A' when that structure is absent
        position_cell = player_row.find('td', class_='posrela')
        position_parts = position_cell.find_all('td') if position_cell else []
        position = position_parts[-1].text.strip() if position_parts else 'N/A'

        # Extract birthdate and age
        # NOTE(review): this reads the FIRST centered cell of the row; confirm
        # against the live page that it is the birthdate column and not e.g.
        # the shirt number.
        birthdate_age = _cell_text(player_row.select_one('td.zentriert'))

        # Extract nationality (handling multiple nationalities if present);
        # flag images without a title attribute are ignored
        nationality_imgs = player_row.select('td.zentriert img.flaggenrahmen')
        nationalities = [img['title'] for img in nationality_imgs if img.get('title')]
        nationality = ", ".join(nationalities)

        # Extract market value
        market_value = _cell_text(player_row.find('td', class_='rechts hauptlink'))

        # Construct the injury page URL by swapping only the path segment, so
        # a player slug that happens to contain "profil" is left intact
        injury_url = player_profile_url.replace('/profil/', '/verletzungen/', 1)

        # Fetch the injury page and locate the injury history table
        # NOTE(review): only the table on this first page is read; if the site
        # paginates long injury histories, later pages are not collected.
        player_soup = _fetch_soup(injury_url)
        injury_table = player_soup.find('table', class_='items') if player_soup else None

        if injury_table:
            # Loop through each row of the injury table
            for row in injury_table.find_all('tr', class_=['odd', 'even']):
                cols = row.find_all('td')
                # Six cells are read below; skip rows that are shorter
                if len(cols) < 6:
                    continue
                (season, injury_type, start_date,
                 end_date, days_out, games_missed) = (col.text.strip() for col in cols[:6])

                # Append the player's injury data
                injury_data.append({
                    'Team': team_name,
                    'Player': player_name,
                    'Position': position,
                    'Birthdate and Age': birthdate_age,
                    'Nationality': nationality,
                    'Market Value': market_value,
                    'Season': season,
                    'Injury Type': injury_type,
                    'Start Date': start_date,
                    'End Date': end_date,
                    'Days Out': days_out,
                    'Games Missed': games_missed
                })

        # Be polite and pause to avoid overloading the server
        time.sleep(1)

    # Pause between teams
    time.sleep(2)

Processing team: Manchester City
Processing team: Arsenal FC
Processing team: Chelsea FC
Processing team: Liverpool FC
Processing team: Tottenham Hotspur
Processing team: Manchester United
Processing team: Aston Villa
Processing team: Newcastle United
Processing team: Brighton & Hove Albion
Processing team: Nottingham Forest
Processing team: West Ham United
Processing team: Crystal Palace
Processing team: Wolverhampton Wanderers
Processing team: Brentford FC
Processing team: AFC Bournemouth
Processing team: Everton FC
Processing team: Fulham FC
Processing team: Burnley FC
Processing team: Sheffield United
Processing team: Luton Town


In [9]:
# Destination file for the scraped records
csv_path = 'premier_league_injury_history_2023_2024.csv'

# Build a DataFrame from the collected injury records
df = pd.DataFrame(injury_data)

# Write the table to disk without the DataFrame index column, then confirm
df.to_csv(csv_path, index=False)
print(f"Injury history data saved to '{csv_path}'")

Injury history data saved to 'premier_league_injury_history_2023_2024.csv'
