In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [29]:
# Scheme + host of the Transfermarkt site. scrape_player_info() prepends this
# to each value of the "Player Href" column to build the player page URL.
base_url = "https://www.transfermarkt.us"

In [30]:
# Path to the input CSV read by scrape_player_info(); it must contain a
# "Player Href" column holding each player's site-relative URL path.
csv_file = "players_information_2223_season.csv"

In [31]:
# Request headers sent with every page fetch. The User-Agent mimics a desktop
# Chrome browser so the request looks like it comes from a regular visitor.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

In [32]:
def validate_text(text):
    """Return ``text`` without surrounding whitespace, or "NA" if nothing remains.

    Args:
        text: The string to clean. May be ``None`` or empty.

    Returns:
        The stripped string, or the placeholder ``"NA"`` when ``text`` is
        falsy or consists only of whitespace.
    """
    if not text:
        return "NA"
    cleaned = text.strip()
    # A whitespace-only value is truthy before stripping but carries no
    # information, so it is reported as missing rather than as "".
    return cleaned if cleaned else "NA"

In [33]:
# Column order of the DataFrame returned by scrape_player_info().
_OUTPUT_COLUMNS = [
    "Player Href",
    "Full Name",
    "Date of Birth",
    "Age",
    "Height",
    "Citizenship",
    "Preferred Foot",
    "Position",
    "Current International",
    "Current Market Value",
    "Outfitter",
]

# Position of each field within the list of bold info-table spans on a player
# page. Indices 2, 8 and 9 are not extracted.
# NOTE(review): this is purely positional. If a profile omits or adds a row,
# every later field shifts into the wrong column -- confirm against real pages
# or switch to matching on the row labels.
_FACT_POSITIONS = {
    "Full Name": 0,
    "Date of Birth": 1,
    "Age": 3,
    "Height": 4,
    "Citizenship": 5,
    "Position": 6,
    "Preferred Foot": 7,
    "Outfitter": 10,
}


def _fact_at(facts, index):
    """Return the cleaned text of ``facts[index]``, or "NA" if that slot does not exist."""
    if index < len(facts):
        return validate_text(facts[index].text)
    return "NA"


def _parse_player_page(html):
    """Extract the player fields from one page's HTML; returns a column -> value dict."""
    soup = BeautifulSoup(html, "html.parser")

    facts = soup.find_all("span", class_="info-table__content info-table__content--bold")
    parsed = {column: _fact_at(facts, index) for column, index in _FACT_POSITIONS.items()}

    international = soup.find("span", itemprop="nationality")
    parsed["Current International"] = (
        validate_text(international.text) if international is not None else "NA"
    )

    market_value = soup.find("div", class_="tm-player-market-value-development__current-value")
    parsed["Current Market Value"] = (
        validate_text(market_value.text) if market_value is not None else "NA"
    )
    return parsed


def scrape_player_info(base_url, csv_file, request_headers=None, timeout=30):
    """Fetch every player page listed in ``csv_file`` and collect profile fields.

    For each value in the CSV's "Player Href" column the page at
    ``base_url + href`` is downloaded and parsed. Fields that cannot be found
    are filled with the string "NA". A progress line is printed per player.

    Args:
        base_url: Scheme and host that each player href is appended to.
        csv_file: Path to a CSV file with a "Player Href" column.
        request_headers: HTTP headers for each request. Defaults to the
            notebook-level ``headers`` dict when omitted.
        timeout: Seconds to wait for each request before giving up on that
            player. Without a timeout a stalled connection blocks forever.

    Returns:
        A pandas DataFrame with one row per input href and the columns
        Player Href, Full Name, Date of Birth, Age, Height, Citizenship,
        Preferred Foot, Position, Current International,
        Current Market Value and Outfitter (in that order).

    Raises:
        KeyError: If the CSV has no "Player Href" column.
        Any exception raised by ``pd.read_csv`` for an unreadable file.
    """
    player_df = pd.read_csv(csv_file)
    total_players = len(player_df)

    if request_headers is None:
        request_headers = headers

    records = []
    # One session for the whole run so the connection to the host is reused.
    with requests.Session() as session:
        for players_scraped, player_href in enumerate(player_df["Player Href"], start=1):
            # Start from an all-"NA" row so a failed request still yields a row.
            record = dict.fromkeys(_OUTPUT_COLUMNS, "NA")
            record["Player Href"] = player_href

            try:
                response = session.get(
                    base_url + player_href, headers=request_headers, timeout=timeout
                )
            except requests.RequestException as exc:
                # Keep going: one bad request must not discard the rows
                # already collected in a run of thousands of players.
                print(f"Request failed for {player_href}: {exc}")
            else:
                if not response.ok:
                    # An error or block page has none of the expected
                    # elements; surface that instead of silently storing "NA".
                    print(f"HTTP {response.status_code} for {player_href}; fields may be missing")
                record.update(_parse_player_page(response.content))

            records.append(record)
            print(f"Scraping player {players_scraped} of {total_players}")

    return pd.DataFrame(records, columns=_OUTPUT_COLUMNS)

In [34]:
# Scrape every player listed in csv_file and keep the result as a DataFrame.
# This issues one HTTP request per CSV row, so the cell is slow to run.
result_df = scrape_player_info(base_url, csv_file)

Scraping player 1 of 3792
Scraping player 2 of 3792
Scraping player 3 of 3792
Scraping player 4 of 3792
Scraping player 5 of 3792
Scraping player 6 of 3792
Scraping player 7 of 3792
Scraping player 8 of 3792
Scraping player 9 of 3792
Scraping player 10 of 3792
Scraping player 11 of 3792
Scraping player 12 of 3792
Scraping player 13 of 3792
Scraping player 14 of 3792
Scraping player 15 of 3792
Scraping player 16 of 3792
Scraping player 17 of 3792
Scraping player 18 of 3792
Scraping player 19 of 3792
Scraping player 20 of 3792
Scraping player 21 of 3792
Scraping player 22 of 3792
Scraping player 23 of 3792
Scraping player 24 of 3792
Scraping player 25 of 3792
Scraping player 26 of 3792
Scraping player 27 of 3792
Scraping player 28 of 3792
Scraping player 29 of 3792
Scraping player 30 of 3792
Scraping player 31 of 3792
Scraping player 32 of 3792
Scraping player 33 of 3792
Scraping player 34 of 3792
Scraping player 35 of 3792
Scraping player 36 of 3792
Scraping player 37 of 3792
Scraping p

In [35]:
# Persist the scraped player information to CSV; index=False leaves the
# DataFrame's row index out of the file.
result_df.to_csv("players_main_info_2223_season.csv", index=False)

In [36]:
# URL fragments for a player's detailed performance-data page.
# NOTE(review): tail_url_detailed has no leading "/" and contains an empty
# segment ("wettbewerb//pos"); the code that joins these is not visible here,
# so confirm both are intentional.
middle_url_detailed = "/leistungsdatendetails"  # replaces the "profil" segment of the player href
tail_url_detailed = "saison/2022/verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1"