In [28]:
import warnings

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [29]:
# Root URL of the Transfermarkt site; the club hrefs read from the CSV are appended to it
base_url = "https://www.transfermarkt.us"

In [30]:
# Path to the CSV listing the clubs to scrape; scrape_player_info reads its
# CLUB, CLUB_HREF, LEAGUE_NAME and COUNTRY columns
csv_file = "datasets/1_top_5_leagues_clubs_2223_season.csv"

In [31]:
# Request headers sent with every club-page request. The User-Agent string
# mimics a desktop Chrome browser so the request looks like it comes from a
# regular user rather than a script.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

In [32]:
def _fetch_club_soup(club_url, timeout):
    """Download one club page and parse it; return ``None`` on an HTTP error status.

    Request headers are taken from the module-level ``headers`` dict.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(club_url, headers=headers, timeout=timeout)
    if not response.ok:
        # An error page contains no squad table; say so instead of silently
        # producing zero rows for this club.
        warnings.warn(
            f"Skipping {club_url}: HTTP {response.status_code}",
            RuntimeWarning,
            stacklevel=3,
        )
        return None
    return BeautifulSoup(response.content, "html.parser")


def _extract_players(soup):
    """Return a list of ``(name, href, value)`` tuples for the player spans in ``soup``."""
    # Index every anchor by its href once (first occurrence wins, matching
    # what soup.find would return) instead of rescanning the page per player.
    first_anchor_by_href = {}
    for anchor in soup.find_all("a", href=True):
        first_anchor_by_href.setdefault(anchor["href"], anchor)

    players = []
    for span in soup.find_all("span", class_="show-for-small"):
        link = span.find("a")
        if link is None or not link.has_attr("href"):
            # A span with this class but no link cannot be resolved to a
            # player profile; skip it rather than crash the whole scrape.
            continue

        player_href = link["href"]

        # The market value link is looked up under the profile href with
        # "/profil" swapped for "/marktwertverlauf".
        value_href = player_href.replace("/profil", "/marktwertverlauf")
        value_anchor = first_anchor_by_href.get(value_href)
        player_value = value_anchor.text.strip() if value_anchor is not None else "NA"

        players.append((span.text.strip(), player_href, player_value))
    return players


def scrape_player_info(base_url, csv_file, timeout=30):
    """Scrape the players of every club listed in ``csv_file``.

    For each club row the page at ``base_url + CLUB_HREF`` is downloaded and
    one output row is produced per ``<span class="show-for-small">`` element
    that contains a link.

    Parameters
    ----------
    base_url : str
        Site root that each club's ``CLUB_HREF`` is appended to.
    csv_file : str or file-like
        Anything ``pandas.read_csv`` accepts. Must provide the columns
        ``CLUB``, ``CLUB_HREF``, ``LEAGUE_NAME`` and ``COUNTRY``.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get`` for every club page
        (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per player with the columns ``CLUB_NAME``, ``PLAYER_NAME``,
        ``PLAYER_VALUE``, ``PLAYER_HREF``, ``CLUB_HREF``, ``LEAGUE_NAME`` and
        ``LEAGUE_COUNTRY``. ``PLAYER_VALUE`` is the string ``"NA"`` when no
        market value link is found on the page; ``CLUB_HREF`` holds the full
        club URL. The columns are present even when no player was scraped.

    Notes
    -----
    Request headers come from the module-level ``headers`` dict. Clubs whose
    page answers with an HTTP error status are skipped with a
    ``RuntimeWarning``; connection errors and timeouts propagate as
    ``requests`` exceptions.
    """
    club_df = pd.read_csv(csv_file)

    columns = [
        "CLUB_NAME",
        "PLAYER_NAME",
        "PLAYER_VALUE",
        "PLAYER_HREF",
        "CLUB_HREF",
        "LEAGUE_NAME",
        "LEAGUE_COUNTRY",
    ]

    # Collect one record per player and build the DataFrame once at the end.
    records = []
    for _, row in club_df.iterrows():
        club_url = base_url + row["CLUB_HREF"]

        soup = _fetch_club_soup(club_url, timeout)
        if soup is None:
            continue

        for player_name, player_href, player_value in _extract_players(soup):
            records.append(
                {
                    "CLUB_NAME": row["CLUB"],
                    "PLAYER_NAME": player_name,
                    "PLAYER_VALUE": player_value,
                    "PLAYER_HREF": player_href,
                    "CLUB_HREF": club_url,
                    "LEAGUE_NAME": row["LEAGUE_NAME"],
                    "LEAGUE_COUNTRY": row["COUNTRY"],
                }
            )

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns)

In [33]:
# Scrape every club listed in csv_file and collect the players in one DataFrame
# (the scraper issues one HTTP request per club row)
result_df = scrape_player_info(base_url, csv_file)

In [34]:
# Preview the first 10 rows of the scraped data
result_df.head(10)

Unnamed: 0,CLUB_NAME,PLAYER_NAME,PLAYER_VALUE,PLAYER_HREF,CLUB_HREF,LEAGUE_NAME,LEAGUE_COUNTRY
0,Manchester City,Ederson,€40.00m,/ederson/profil/spieler/238223,https://www.transfermarkt.us/manchester-city/s...,Premier League,England
1,Manchester City,S. Ortega,€9.00m,/stefan-ortega/profil/spieler/85941,https://www.transfermarkt.us/manchester-city/s...,Premier League,England
2,Manchester City,Z. Steffen,€3.50m,/zack-steffen/profil/spieler/221624,https://www.transfermarkt.us/manchester-city/s...,Premier League,England
3,Manchester City,S. Carson,€200k,/scott-carson/profil/spieler/14555,https://www.transfermarkt.us/manchester-city/s...,Premier League,England
4,Manchester City,R. Dias,€80.00m,/ruben-dias/profil/spieler/258004,https://www.transfermarkt.us/manchester-city/s...,Premier League,England
5,Manchester City,N. Aké,€42.00m,/nathan-ake/profil/spieler/177476,https://www.transfermarkt.us/manchester-city/s...,Premier League,England
6,Manchester City,J. Stones,€40.00m,/john-stones/profil/spieler/186590,https://www.transfermarkt.us/manchester-city/s...,Premier League,England
7,Manchester City,M. Akanji,€38.00m,/manuel-akanji/profil/spieler/284730,https://www.transfermarkt.us/manchester-city/s...,Premier League,England
8,Manchester City,A. Laporte,€25.00m,/aymeric-laporte/profil/spieler/176553,https://www.transfermarkt.us/manchester-city/s...,Premier League,England
9,Manchester City,L. Mbete,€2.00m,/luke-mbete/profil/spieler/609883,https://www.transfermarkt.us/manchester-city/s...,Premier League,England


In [35]:
# Save the scraped players to CSV; index=False leaves the row index out of the file
result_df.to_csv("datasets/2_players_information_2223_season.csv", index=False)