In [1]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import pandas as pd
from random import randint
import logging
from time import sleep


In [2]:
def getSoup(page, timeout=30):
    """Download ``page`` and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    page : str
        Absolute URL to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` waits indefinitely, which can stall the whole scrape.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx response.
    """
    HEADERS = {
        "Accept-Language": "en-US,en;q=0.5",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    }

    response = requests.get(page, headers=HEADERS, timeout=timeout)
    # Fail here with the HTTP status instead of letting an error page reach
    # the CSS selectors downstream, where it would surface as a confusing
    # "NoneType has no attribute ..." error.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Pause for a random 0.5-2.0 s after every request to throttle the crawl.
    time_milliseconds = randint(500, 2000)
    sleep(0.001 * time_milliseconds)
    return soup
    
   


In [3]:
def getft(page):
    """Return the absolute URL behind the first matching link on ``page``.

    The page is fetched with ``getSoup``; the ``href`` of the first element
    matching the ``.hauptlink > .inline-table`` selector is appended to the
    Transfermarkt site root.
    """
    anchor = getSoup(page).select_one('.hauptlink > .inline-table > tr > td:nth-child(2) > a')
    relative_url = anchor['href']
    return "https://www.transfermarkt.de" + relative_url


In [4]:
def get_season_links(page):
    """Return the absolute URL of the fourth footer link found on ``page``.

    Every ``<div class="footer-links">`` contributes the ``href`` of its first
    anchor; the entry at index 3 is appended to the Transfermarkt site root.
    """
    soup = getSoup(page)
    footer_hrefs = [
        footer.a["href"]
        for footer in soup.find_all("div", class_="footer-links")
    ]
    return "https://www.transfermarkt.de" + footer_hrefs[3]



    

In [5]:
def get_season_data(page):
    """Scrape the id, name and start/end dates of a season from ``page``.

    Returns a dict with:
      * ``id``         - last path segment of ``page`` converted to ``int``
      * ``name``       - first ``/segment/`` of ``page`` made of word
                         characters or hyphens, or ``None`` if there is none
      * ``start_date`` - ``datetime.date`` parsed from the left-hand table
      * ``end_date``   - ``datetime.date`` parsed from the right-hand table

    Dates on the page are expected in ``dd.mm.yy`` form.
    """
    soup = getSoup(page)

    def read_date(selector):
        # First cell of the table's second row holds the date as link text.
        raw = soup.select_one(selector).get_text().strip()
        return datetime.strptime(raw, '%d.%m.%y').date()

    season_id = int(page.rsplit('/', 1)[1])
    slug = re.search(r"/([\w-]+)/", page)

    result = {
        'id': season_id,
        'name': slug.group(1) if slug else None,
    }
    result['start_date'] = read_date('#main > main > div:nth-child(5) > div:nth-child(2) > div > table > tbody > tr:nth-child(2) > td:nth-child(1) > a')
    result['end_date'] = read_date('#main > main > div:nth-child(5) > div.large-6.columns.end > div > table > tbody > tr:nth-child(2) > td:nth-child(1) > a')
    return result

In [6]:

def _scrape_value(soup, selector, convert):
    """Return ``convert(text)`` for the first node matching ``selector``.

    ``None`` is returned when nothing matches or when ``convert`` raises
    ``ValueError``, so one missing header field does not abort the scrape.
    """
    node = soup.select_one(selector)
    if node is None:
        return None
    try:
        return convert(node.get_text())
    except ValueError:
        return None


def _parse_market_value(text):
    """Convert a German-formatted amount such as ``'19,48 Mio. €'`` to euros.

    Handles the ``Tsd.`` (thousand), ``Mio.`` (million) and ``Mrd.`` (billion)
    suffixes, ``.`` as thousands separator and ``,`` as decimal separator.
    Raises ``ValueError`` if no number can be read.
    """
    amount = text.replace('€', '')
    # NOTE(review): a value with no unit suffix is treated as millions, which
    # matches what the previous implementation assumed - confirm on the site.
    multiplier = 1_000_000
    for unit, factor in (('Mrd', 1_000_000_000), ('Mio', 1_000_000), ('Tsd', 1_000)):
        if unit in amount:
            amount = amount.replace(unit, '')
            multiplier = factor
            break
    # Drop thousands separators / abbreviation dots first, then turn the
    # decimal comma into a point so any number of decimals parses correctly.
    amount = amount.replace('.', '').replace(',', '.').strip()
    return round(float(amount) * multiplier, 2)


def competiton_table(page):
    """Scrape the header of a competition page into a flat dict.

    Parameters
    ----------
    page : str
        URL of the competition page; fetched with ``getSoup``.

    Returns
    -------
    dict
        Keys, in insertion order:
          * ``id``             - last path segment of ``page`` (kept as ``str``)
          * ``country_id``     - ``int`` from the country link's last path
                                 segment, or ``None``
          * ``name``           - competition headline text
          * ``teams_number``   - ``int`` from the first two characters of the
                                 first info item, falling back to the second
          * ``market_value``   - value in euros as ``float``
          * ``players_number`` - ``int``; always ``None`` for
                                 'UEFA Champions League'
          * ``avg_age``        - ``float`` (decimal comma accepted)
          * ``foreigners``     - ``int`` from the first three characters
        Every field except ``id`` and ``name`` is ``None`` when the element is
        missing or its text cannot be parsed.

    Raises
    ------
    ValueError
        If the competition headline is not present on the page.
    """
    soup = getSoup(page)

    result = {}
    result['id'] = page.rsplit('/', 1)[1]

    try:
        country_href = soup.select_one('#main > main > header > div.data-header__box--big > div > span.data-header__club > a').get('href')
        result['country_id'] = int(country_href.rsplit('/', 1)[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # No country link (or an unexpected href shape): leave the id empty.
        result['country_id'] = None

    heading = soup.select_one('#main > main > header > div.data-header__headline-container > h1')
    if heading is None:
        raise ValueError(f"competition name not found on {page}")
    name = heading.get_text().strip()
    result['name'] = name

    def leading_count(text):
        # The span holds the number followed by a label; keep two characters.
        return int(text.strip()[:2])

    teams_number = _scrape_value(soup, '#main > main > header > div.data-header__info-box > div > ul:nth-child(1) > li:nth-child(1) > span', leading_count)
    if teams_number is None:
        # Some layouts carry the team count in the second list item instead.
        teams_number = _scrape_value(soup, '    #main > main > header > div.data-header__info-box > div > ul:nth-child(1) > li:nth-child(2) > span', leading_count)
    result['teams_number'] = teams_number

    result['market_value'] = _scrape_value(soup, '#main > main > header > div.data-header__info-box > div > ul:nth-child(2) > li:nth-child(1) > span', _parse_market_value)

    if name != 'UEFA Champions League':
        result['players_number'] = _scrape_value(soup, '#main > main > header > div.data-header__info-box > div > ul:nth-child(1) > li:nth-child(2) > span', lambda text: int(text.strip()))
    else:
        result['players_number'] = None

    result['avg_age'] = _scrape_value(soup, ' #main > main > header > div.data-header__info-box > div > ul:nth-child(2) > li:nth-child(2) > span', lambda text: float(text.strip().replace(',', '.')))

    # Text is sliced without stripping, as the original parser did.
    result['foreigners'] = _scrape_value(soup, ' ul:nth-child(1) > li:nth-child(3) > span > a', lambda text: int(text[:3]))

    print(result)
    return result



   

In [7]:
def main(countries_links=None, seasons=None):
    """Scrape competition and season data and write them to CSV.

    For every country URL and season, the country page is resolved to a
    competition page via ``getft``; ``competiton_table`` and
    ``get_season_data`` then produce one row each. Rows are written to
    ``competition.csv`` and ``seasons.csv`` in the working directory.

    Parameters
    ----------
    countries_links : list of str, optional
        Country overview URLs. Defaults to the five URLs originally
        hard-coded here.
    seasons : list of int, optional
        Season ids appended to each country URL. Defaults to 2015-2021.

    Notes
    -----
    If the crawl is interrupted (including ``KeyboardInterrupt``) or fails,
    the rows collected so far are saved before the exception is re-raised,
    so a long run is not lost to a late error.
    """
    if countries_links is None:
        countries_links = ["https://www.transfermarkt.de/wettbewerbe/national/wettbewerbe/189",
                           "https://www.transfermarkt.de/wettbewerbe/national/wettbewerbe/40",
                           "https://www.transfermarkt.de/wettbewerbe/national/wettbewerbe/75",
                           "https://www.transfermarkt.de/wettbewerbe/national/wettbewerbe/50",
                           "https://www.transfermarkt.de/wettbewerbe/national/wettbewerbe/157"]
    if seasons is None:
        seasons = [2015, 2016, 2017, 2018, 2019, 2020, 2021]

    competitions_df = pd.DataFrame(columns = ['id', 'country_id','name','teams_number','market_value','players_number','avg_age','foreigners'])
    seasons_df = pd.DataFrame(columns = ['id', 'name','start_date', 'end_date'])

    def save():
        competitions_df.to_csv('competition.csv')
        seasons_df.to_csv('seasons.csv')

    try:
        for country_link in countries_links:
            for season in seasons:
                country_detailed = country_link + f"/saison_id/{season}/plus/1"
                ft_link = getft(country_detailed)
                competition_row = competiton_table(ft_link)
                season_link = get_season_links(ft_link)
                season_row = get_season_data(season_link)

                # Append only once both rows exist so the frames stay aligned.
                competitions_df.loc[len(competitions_df)] = competition_row
                seasons_df.loc[len(seasons_df)] = season_row
    except BaseException:
        # Persist partial progress, but do not overwrite earlier output files
        # with empty ones when nothing was scraped at all.
        if len(competitions_df) or len(seasons_df):
            save()
        raise

    save()

In [8]:
# Entry point: start the scrape when this code is executed directly (a
# notebook cell also runs with __name__ == "__main__"), but not on import.
if __name__ == "__main__":
    main()

{'id': '2015', 'country_id': 189, 'name': 'Premier League', 'teams_number': 20, 'market_value': 19480000.0, 'players_number': 535, 'avg_age': 27.1, 'foreigners': 367}
{'id': '2016', 'country_id': 189, 'name': 'Premier League', 'teams_number': 20, 'market_value': 19480000.0, 'players_number': 535, 'avg_age': 27.1, 'foreigners': 367}
{'id': '2017', 'country_id': 189, 'name': 'Premier League', 'teams_number': 20, 'market_value': 19480000.0, 'players_number': 535, 'avg_age': 27.1, 'foreigners': 367}
{'id': '2018', 'country_id': 189, 'name': 'Premier League', 'teams_number': 20, 'market_value': 19480000.0, 'players_number': 535, 'avg_age': 27.1, 'foreigners': 367}
{'id': '2019', 'country_id': 189, 'name': 'Premier League', 'teams_number': 20, 'market_value': 19480000.0, 'players_number': 535, 'avg_age': 27.1, 'foreigners': 367}
{'id': '2020', 'country_id': 189, 'name': 'Premier League', 'teams_number': 20, 'market_value': 19480000.0, 'players_number': 535, 'avg_age': 27.1, 'foreigners': 367

KeyboardInterrupt: 