In [53]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import re

### Gathering league links for each of Europe's top 5 leagues (19/20 season)

In [377]:
url = "https://www.transfermarkt.com/wettbewerbe/europa"
headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
tree = requests.get(url, headers = headers)
soup = BeautifulSoup(tree.content, "lxml")

league_names = ["Premier League", "LaLiga", "Serie A", "Bundesliga", "Ligue 1"]
season = "/plus/?saison_id=2019"
http = "https://www.transfermarkt.com"
league_links = []
for league in league_names:
    link = soup.find("a", title=league).get("href")
    league_links.append(http+link+season)



In [378]:
league_links

['https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/plus/?saison_id=2019',
 'https://www.transfermarkt.com/laliga/startseite/wettbewerb/ES1/plus/?saison_id=2019',
 'https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1/plus/?saison_id=2019',
 'https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2019',
 'https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1/plus/?saison_id=2019']

### Gathering respective team links

In [379]:
team_links = []

for page in league_links:
    tree = requests.get(page, headers = headers)
    soup = BeautifulSoup(tree.content, "lxml")
    
    for element in soup.select("td.hide-for-pad > a.vereinprofil_tooltip"):
        link = element.get("href")
        team_links.append(http+link)

In [395]:
print(len(team_links)) # number of teams should be 98
     

98


### Gathering player links  
we need to additonally enter into each players homepage because often their full name is listed in their homepage but not in the team page. We want their full name as this might make the merge to the fbref data go smoother. However the rest of the data will be taken from the team homepage as the player homepage contains age, value, etc all from 2020/2021 which is not the season we are analysing 

In [383]:
player_links = []
for page in team_links:
    tree = requests.get(page, headers=headers)
    soup = BeautifulSoup(tree.content, 'lxml')
    
    for element in soup.select("span.hide-for-small > a.spielprofil_tooltip"):
        link = element.get("href")
        player_links.append(http+link)
    

In [384]:
print(len(player_links))
player_links[:5]

3557


['https://www.transfermarkt.com/ederson/profil/spieler/238223',
 'https://www.transfermarkt.com/arijanet-muric/profil/spieler/371021',
 'https://www.transfermarkt.com/claudio-bravo/profil/spieler/40423',
 'https://www.transfermarkt.com/scott-carson/profil/spieler/14555',
 'https://www.transfermarkt.com/aymeric-laporte/profil/spieler/176553']

### Extracting age, club and value from the team page

In [390]:
ages = []
clubs = []
values = []

for page in team_links:
    tree = requests.get(page, headers=headers)
    soup = BeautifulSoup(tree.content, 'lxml')
    
    #scraping ages
    for element in soup.find_all("td", class_="zentriert", string=re.compile('\(')):
        birthdate = element.get_text()
        ages.append(birthdate.split('(')[1][:2])
    
    #scraping clubs         
    for element in soup.select("div#yw1 a.vereinprofil_tooltip img, div#yw1 a[title^=Without] img"):
        club = element.get("alt")
        clubs.append(club)


    #scraping values   
    for element in soup.select("div#yw1 td.rechts.hauptlink"):
        value = element.get_text()
        values.append(value)
    
                      

In [391]:
print(len(ages))
print(len(clubs))
print(len(values))

3557
3524
3557


### Extracting full name from player page if available, if not take incomplete version


In [340]:
names = []
for page in player_links:
        tree = requests.get(page, headers=headers)
        soup = BeautifulSoup(tree.content, 'lxml')   
        element = soup.select_one("div.spielerdaten table.auflistung td ")
        if element is None:
            names.append(element)
        elif bool(re.search(r'\d', element.get_text())):
            element = soup.find("h1", itemprop="name").get_text()
            names.append(element)
        else:
            names.append(element.get_text())
      

In [341]:
len(names)


3557

### Merging lists into 1 table via:  Lists > Dictionary > Dataframe > Saved CSV file


In [392]:
dic = {"Name":names, "Club":clubs, "Age":ages ,"Values":values}

#follwing two lines is to create a df eventhough club_list has a different lenght
df_market_val = pd.DataFrame.from_dict(dic, orient = 'index')
df_market_val=df_market_val.transpose()

df_market_val.to_csv(r'player_values_Euro_top5.csv', index=False)

In [398]:
df_market_val = pd.DataFrame.from_dict(dic, orient = 'index')
df_market_val.transpose()

Unnamed: 0,Name,Club,Age,Values
0,Ederson Santana de Moraes,Manchester City,25,€56.00m
1,Arijanet Anan Muriqi,Girona FC,20,€1.80m
2,Claudio Andrés Bravo Muñoz,Real Betis Balompié,36,€1.20m
3,Scott Paul Carson,Manchester City,33,€800Th.
4,Aymeric Jean Louis Gerard Alphonse Laporte,Manchester City,25,€60.00m
...,...,...,...,...
3552,Irvin Cardona,,21,€4.80m
3553,GaÃ«tan Charbonnier,,30,€2.00m
3554,Alexandre Mendy,,25,€1.80m
3555,Ulrich Kévin Selom Mayi,,26,€550Th.
