In [9]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [10]:
# Season to scrape, given as the calendar year in which it starts
# (2022 -> the 2022/23 season). Change it here to retarget every URL below.
SEASON_ID = 2022

# Leagues to scrape as (overview URL, league name, country) tuples.
# Two URL forms appear below: the season as a path segment
# (`/saison_id/<year>`) and as a query parameter (`/plus/?saison_id=<year>`).
leagues = [
    (f"https://www.transfermarkt.us/premier-league/startseite/wettbewerb/GB1/saison_id/{SEASON_ID}", "Premier League", "England"),
    (f"https://www.transfermarkt.us/ligue-1/startseite/wettbewerb/FR1/saison_id/{SEASON_ID}", "Ligue 1", "France"),
    (f"https://www.transfermarkt.us/laliga/startseite/wettbewerb/ES1/plus/?saison_id={SEASON_ID}", "La Liga", "Spain"),
    (f"https://www.transfermarkt.us/serie-a/startseite/wettbewerb/IT1/plus/?saison_id={SEASON_ID}", "Serie A", "Italy"),
    (f"https://www.transfermarkt.us/bundesliga/startseite/wettbewerb/L1/plus/?saison_id={SEASON_ID}", "Bundesliga", "Germany"),
]

In [11]:
# Request headers sent with every page fetch. The User-Agent is a desktop
# Chrome string so that requests look like they come from a regular web browser.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)
headers = {"User-Agent": _BROWSER_USER_AGENT}

In [12]:
def scrape_leagues(leagues, timeout=30):
    """Scrape the clubs listed on each league overview page.

    Parameters
    ----------
    leagues : iterable of (str, str, str)
        ``(url, league_name, country)`` tuples. ``url`` is the page to fetch;
        the other two values are copied onto every club row from that page.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection raises
        instead of blocking the notebook forever. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        One row per club link found, with columns ``LEAGUE_NAME``,
        ``COUNTRY``, ``CLUB`` and ``CLUB_HREF`` (the link's ``href`` exactly
        as it appears in the page). Empty, with the same columns, when
        ``leagues`` is empty or no club cells are found.

    Raises
    ------
    requests.HTTPError
        If a league page responds with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when ``timeout`` is exceeded.
    """
    club_data = []

    for url, league_name, country in leagues:
        # `headers` is the notebook-level dict carrying the browser User-Agent.
        response = requests.get(url, headers=headers, timeout=timeout)

        # Fail loudly on an error page; otherwise a blocked request would
        # silently contribute zero clubs for this league.
        response.raise_for_status()

        # Parse the HTML content of the league page
        soup = BeautifulSoup(response.content, "html.parser")

        # Each club name sits in a <td class="hauptlink no-border-links"> cell
        for club in soup.find_all("td", class_="hauptlink no-border-links"):
            link = club.find("a")

            # A matching cell without a link has no club page to record;
            # skip it instead of crashing on `None["href"]`.
            if link is None or not link.get("href"):
                continue

            club_data.append((league_name, country, club.text.strip(), link["href"]))

    # Build the frame once from the collected records
    return pd.DataFrame(club_data, columns=["LEAGUE_NAME", "COUNTRY", "CLUB", "CLUB_HREF"])

In [13]:
# Fetch every league page in `leagues` (network access required) and collect
# the clubs from all of them into a single DataFrame, one row per club.
result_df = scrape_leagues(leagues)

In [14]:
# Preview the first five rows of the scraped clubs
result_df.head()

Unnamed: 0,LEAGUE_NAME,COUNTRY,CLUB,CLUB_HREF
0,Premier League,England,Manchester City,/manchester-city/startseite/verein/281/saison_...
1,Premier League,England,Arsenal FC,/fc-arsenal/startseite/verein/11/saison_id/2022
2,Premier League,England,Chelsea FC,/fc-chelsea/startseite/verein/631/saison_id/2022
3,Premier League,England,Manchester United,/manchester-united/startseite/verein/985/saiso...
4,Premier League,England,Liverpool FC,/fc-liverpool/startseite/verein/31/saison_id/2022


In [15]:
# (rows, columns) of the scraped table: one row per club, four columns
result_df.shape

(98, 4)

In [16]:
# Save the DataFrame to a CSV file. The output directory is created first so
# the cell also works when `datasets/` does not exist yet.
output_path = Path("datasets/1_top_5_leagues_clubs_2223_season.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(output_path, index=False)