In [1]:
'''
This script collects information about
each player. It collects his basic information,
his transfer history and all the leagues he played for.
'''

'\nThis scripts collects information about\neach player. It collects his basic information,\nhis transfer history and all the leagues he played for.\n'

In [2]:
from bs4 import BeautifulSoup
from io import BytesIO
import glob
import json
import pandas as pd
from pathlib import Path
from pprint import pprint
import requests
import re
from tqdm.notebook import tqdm
import time

In [3]:
# Request headers passed to every requests.get call in this notebook.
# The User-Agent string imitates a desktop Chrome browser.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/47.0.2526.106 Safari/537.36'
    ),
}

In [4]:
def already_saved(player_id):
    '''
    Controls whether the data for this specific player
    was already saved properly. Returns True only when all
    four output files of the player exist, False otherwise.
    
    Params:
    
    player_id -> string with the unique player id at Transfermarkt
    '''
    
    # One JSON file per category is written for every player. Checking
    # the four expected paths directly avoids listing and regex-parsing
    # the whole output folder on every call.
    categories = ("basic-info", "transfer-info", "league-info", "club-info")
    
    return all(
        Path(f"../output/players/{category}-{player_id}.json").is_file()
        for category in categories
    )

In [5]:
def player_page_request(player_url, timeout=30):
    '''
    Makes a request to the page containing
    detailed information about the player
    and returns it as a response object.
    
    Params:
    
    player_url -> string with the url path to the player page at Transfermarkt
    timeout -> seconds to wait for the server before giving up; without it
    a stalled connection would block the whole collection run
    '''
    
    # The HTTP status is not checked here; callers receive the response as is.
    return requests.get(player_url, headers=HEADERS, timeout=timeout)

In [6]:
def scrape_basic_info(player_r):
    '''
    Collects the basic player information.
    
    Params:
    
    player_r -> a response object retrieved from 
    the player page at transfermarkt
    
    Returns a dict keyed by the snake_case version of each label found
    in the info table (e.g. 'date_of_birth'), plus 'current_club_url',
    'current_club_id' and 'citizenship' when they can be extracted.
    
    Raises AttributeError when the page has no 'info-table' div.
    '''
    
    # Labels whose value is stored as plain stripped text
    plain_text_labels = {
        'Name in home country:', 'Full name:', 'Date of birth:',
        'Age:', 'Height:',
        'Position:', 'Foot:', 'Current club:', 'Joined:',
    }
    
    player_info = {}
    
    # NOTE(review): no parser is named, so bs4 uses whichever one is
    # installed. Pin one explicitly if results must match across machines.
    soup = BeautifulSoup(player_r.text)

    player_data = soup.find('div', class_='info-table')
    categories = player_data.find_all('span', class_='info-table__content--regular')
    values = player_data.find_all('span', class_='info-table__content--bold')

    for cat, val in zip(categories, values):
        
        category = cat.text.strip()
        label = cat.text.replace(":", "").strip().lower().replace(" ", "_")
        
        if category in plain_text_labels:
            player_info[label] = val.text.strip()
            
        if category == 'Current club:':
            # The club link may be absent; only store what is really there
            club_link = val.find('a', href=True)
            if club_link is not None:
                player_info['current_club_url'] = club_link['href']
                club_id = re.search(r"/verein/(\d+)", club_link['href'])
                if club_id is not None:
                    player_info['current_club_id'] = club_id.group(1)

        if category == "Citizenship:":
            # In cases of double citizenship, this will get the first one only
            flag = val.find("img", class_="flaggenrahmen", alt=True)
            if flag is not None:
                player_info['citizenship'] = flag['alt']
            
        if category == 'Place of birth:':
            # Add country before saving; fall back to the place alone
            # when the cell carries no flag image
            place = val.text.strip()
            flag = val.find('img', class_='flaggenrahmen', alt=True)
            if flag is not None:
                country = flag['alt']
                player_info[label] = f'{place}, {country}'
            else:
                player_info[label] = place
            
    return player_info

In [7]:
def save_json(data, outpath):
    '''
    Saves a JSON file in the specified output path.
    Missing parent directories are created first, so the
    first run does not fail on a fresh checkout.
    
    Params:
    
    data -> any JSON-serializable object
    outpath -> string or Path with the destination file
    '''
    outpath = Path(outpath)
    outpath.parent.mkdir(parents=True, exist_ok=True)
    
    # Write-only mode is enough; the file is never read back here
    with open(outpath, 'w', encoding='utf-8') as f:
        json.dump(data, f)

In [8]:
def _transfer_club(cell):
    '''
    Extracts (name, url, club_id) from one club cell of a transfer row.
    url and club_id are None when the cell has no link, and club_id is
    None when the link does not follow the verein/<id>/saison_id/ pattern.
    '''
    name = cell.text.strip()
    link = cell.find('a', href=True)
    if link is None:
        return name, None, None
    
    url = link['href']
    club_id = re.search(r'verein/(\d+)/saison_id/', url)
    return name, url, club_id.group(1) if club_id is not None else None


def scrape_transfer_info(player_r):
    '''
    Scrapes all transfers from a player
    
    Params:
    
    player_r -> a response object retrieved from 
    the player page at transfermarkt
    
    Returns a list with one dict per transfer row, holding the season,
    the date and the name, url and id of the club left and the club joined.
    '''
    
    soup = BeautifulSoup(player_r.text)
    transfers = soup.find_all('div', class_='grid tm-player-transfer-history-grid')
    
    rows = []

    for transfer in transfers:
        # Removes the fee and market value entries (the last two cells)
        entries = transfer.find_all("div", class_="grid__cell")[:-2]

        left, left_url, left_club_id = _transfer_club(entries[2])
        joined, joined_url, joined_club_id = _transfer_club(entries[3])

        rows.append({
            'season': entries[0].text.strip(),
            'date': entries[1].text.strip(),
            'left': left,
            'left_url': left_url,
            'left_club_id': left_club_id,
            'joined': joined,
            'joined_url': joined_url,
            'joined_club_id': joined_club_id,
        })
        
    return rows

In [9]:
def scrape_league_info(player_id, timeout=30):
    '''
    Scrapes the league information for each player.

    Params:

    player_id -> string with the unique player id at Transfermarkt
    timeout -> seconds to wait for the server before giving up
    
    Returns a list with one dict per row of the performance table,
    holding the league url, name and id plus the text of the
    matches cell and of the last cell (stored as 'minutes').
    '''
    
    # The player name is not needed on the url, so a dummy slug is used
    league_details_url = f'https://www.transfermarkt.com/dummy/leistungsdaten/spieler/{player_id}/plus/0?saison=ges'
    print(league_details_url)
    
    # Pause before every request to keep the request rate low
    time.sleep(1)

    r = requests.get(league_details_url, headers=HEADERS, timeout=timeout)
    soup = BeautifulSoup(r.text)
    
    league_table_div = soup.find('div', class_='responsive-table')
    league_table = league_table_div.find("table", class_='items')
    tbody = league_table.find("tbody")
    
    data = []
    for row in tbody.find_all("tr"):
        cells = row.find_all("td")

        league = cells[1].find('a')
        league_url = league['href']
        # The id is whatever follows /startseite/<word>/ in the url
        league_id = re.search(r"/startseite/\w+/(.+)", league_url).group(1)

        data.append({
            "league_url": league_url,
            "league_name": league['title'],
            "league_id": league_id,
            "matches": cells[2].text,
            "minutes": cells[-1].text,
        })

    return data


In [10]:
def scrape_club_info(player_id, timeout=30):
    '''
    Scrapes the club information for each player.
    
    Params:

    player_id -> string with the unique player id at Transfermarkt
    timeout -> seconds to wait for the server before giving up
    
    Returns a list with one dict per row of the performance table,
    holding the club url, name and id plus the text of the
    matches cell and of the last cell (stored as 'minutes').
    '''
    
    club_details_url = f'https://www.transfermarkt.com/dummy/leistungsdatenverein/spieler/{player_id}'
    print(club_details_url)
    
    # Pause before every request to keep the request rate low
    time.sleep(1)

    r = requests.get(club_details_url, headers=HEADERS, timeout=timeout)
    soup = BeautifulSoup(r.text)

    club_table_div = soup.find('div', class_='responsive-table')
    club_table = club_table_div.find("table", class_='items')
    tbody = club_table.find("tbody")
    
    data = []
    for row in tbody.find_all("tr"):
        cells = row.find_all("td")

        club = cells[1].find('a')
        club_url = club['href']
        # The id is whatever follows /startseite/<word>/ in the url
        club_id = re.search(r"/startseite/\w+/(.+)", club_url).group(1)

        data.append({
            "club_url": club_url,
            "club_name": club['title'],
            "club_id": club_id,
            "matches": cells[2].text,
            "minutes": cells[-1].text,
        })

    return data

In [11]:
def main():
    '''
    Runs the whole collection.
    
    First removes the player files whose id is not listed in the
    wikipedia-to-transfermarkt CSV. Then, for every row of the CSV
    that is not fully saved yet, fetches the player page and writes
    the basic, transfer, league and club JSON files.
    '''
    
    df = pd.read_csv('../output/2022-wikipedia-to-transfermarkt.csv', sep=',')
    
    all_files = glob.glob("../output/players/*.json")
    # A set makes the membership test below cheap
    right_ids = set(df.transfermarkt_id.astype(str).unique().tolist())
        
    for file in all_files:
        
        # Look for the id in the file name only, and leave files
        # without any id alone instead of crashing on them
        id_match = re.search(r"(\d+)", Path(file).name)
        if id_match is None:
            continue
        
        if id_match.group(1) not in right_ids:
            print('deleting', file)
            Path(file).unlink()
    
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
                
        print(row.wikipedia_page)
        
        # Progress check
        player_url = f"https://www.transfermarkt.com/dummy/profil/spieler/{row.transfermarkt_id}"
        player_id = re.search(r'/spieler/(\d+)', player_url).group(1)

        if already_saved(player_id):
            print("Already collected!")
            print()
            continue

        # If the player was not saved, let's fetch basic data
        print(player_url)
        player_r = player_page_request(player_url)
        
        time.sleep(1)

        player_info = scrape_basic_info(player_r)
        save_json(player_info, f"../output/players/basic-info-{player_id}.json")
        
        # Scrapes transfer data (reuses the player page fetched above)
        transfer_info = scrape_transfer_info(player_r)
        save_json(transfer_info, f"../output/players/transfer-info-{player_id}.json")

        # Collect league data
        league_info = scrape_league_info(player_id)
        save_json(league_info, f"../output/players/league-info-{player_id}.json")

        # Collect club data
        club_info = scrape_club_info(player_id)
        save_json(club_info, f"../output/players/club-info-{player_id}.json")

        print()

In [12]:
# Entry point: start the collection when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

  0%|          | 0/831 [00:00<?, ?it/s]

https://en.wikipedia.org/wiki/Remko_Pasveer
Already collected!

https://en.wikipedia.org/wiki/Jurri%C3%ABn_Timber
Already collected!

https://en.wikipedia.org/wiki/Matthijs_de_Ligt
Already collected!

https://en.wikipedia.org/wiki/Virgil_van_Dijk
Already collected!

https://en.wikipedia.org/wiki/Nathan_Ak%C3%A9
Already collected!

https://en.wikipedia.org/wiki/Stefan_de_Vrij
Already collected!

https://en.wikipedia.org/wiki/Steven_Bergwijn
Already collected!

https://en.wikipedia.org/wiki/Cody_Gakpo
Already collected!

https://en.wikipedia.org/wiki/Luuk_de_Jong
Already collected!

https://en.wikipedia.org/wiki/Memphis_Depay
Already collected!

https://en.wikipedia.org/wiki/Steven_Berghuis
Already collected!

https://en.wikipedia.org/wiki/Noa_Lang
Already collected!

https://en.wikipedia.org/wiki/Justin_Bijlow
Already collected!

https://en.wikipedia.org/wiki/Davy_Klaassen
Already collected!

https://en.wikipedia.org/wiki/Marten_de_Roon
Already collected!

https://en.wikipedia.org/wiki/

Already collected!

https://en.wikipedia.org/wiki/Walker_Zimmerman
Already collected!

https://en.wikipedia.org/wiki/Tyler_Adams
Already collected!

https://en.wikipedia.org/wiki/Antonee_Robinson
Already collected!

https://en.wikipedia.org/wiki/Yunus_Musah
Already collected!

https://en.wikipedia.org/wiki/Giovanni_Reyna
Already collected!

https://en.wikipedia.org/wiki/Weston_McKennie
Already collected!

https://en.wikipedia.org/wiki/Jes%C3%BAs_Ferreira
Already collected!

https://en.wikipedia.org/wiki/Christian_Pulisic
Already collected!

https://en.wikipedia.org/wiki/Brenden_Aaronson
Already collected!

https://en.wikipedia.org/wiki/Ethan_Horvath
Already collected!

https://en.wikipedia.org/wiki/Tim_Ream
Already collected!

https://en.wikipedia.org/wiki/Luca_de_la_Torre
Already collected!

https://en.wikipedia.org/wiki/Aaron_Long_(soccer)
Already collected!

https://en.wikipedia.org/wiki/Jordan_Morris
Already collected!

https://en.wikipedia.org/wiki/Cristian_Roldan
Already collecte

Already collected!

https://en.wikipedia.org/wiki/Sebastian_Szyma%C5%84ski
Already collected!

https://en.wikipedia.org/wiki/Damian_Szyma%C5%84ski
Already collected!

https://en.wikipedia.org/wiki/Szymon_%C5%BBurkowski
Already collected!

https://en.wikipedia.org/wiki/Krystian_Bielik
Already collected!

https://en.wikipedia.org/wiki/Jakub_Kami%C5%84ski
Already collected!

https://en.wikipedia.org/wiki/Micha%C5%82_Sk%C3%B3ra%C5%9B
Already collected!

https://en.wikipedia.org/wiki/Robert_Lewandowski
Already collected!

https://en.wikipedia.org/wiki/Arkadiusz_Milik
Already collected!

https://en.wikipedia.org/wiki/Krzysztof_Pi%C4%85tek
Already collected!

https://en.wikipedia.org/wiki/Karol_%C5%9Awiderski
Already collected!

https://en.wikipedia.org/wiki/Mohammed_Al-Owais
Already collected!

https://en.wikipedia.org/wiki/Mohammed_Al-Rubaie
Already collected!

https://en.wikipedia.org/wiki/Nawaf_Al-Aqidi
Already collected!

https://en.wikipedia.org/wiki/Yasser_Al-Shahrani
Already collected

Already collected!

https://en.wikipedia.org/wiki/A%C3%AFssa_La%C3%AFdouni
Already collected!

https://en.wikipedia.org/wiki/Mohamed_Ali_Ben_Romdhane
Already collected!

https://en.wikipedia.org/wiki/Aymen_Dahmen
Already collected!

https://en.wikipedia.org/wiki/Ellyes_Skhiri
Already collected!

https://en.wikipedia.org/wiki/Ghailene_Chaalali
Already collected!

https://en.wikipedia.org/wiki/Seifeddine_Jaziri
Already collected!

https://en.wikipedia.org/wiki/Mohamed_Dr%C3%A4ger
Already collected!

https://en.wikipedia.org/wiki/Wajdi_Kechrida
Already collected!

https://en.wikipedia.org/wiki/Bechir_Ben_Sa%C3%AFd
Already collected!

https://en.wikipedia.org/wiki/Na%C3%AFm_Sliti
Already collected!

https://en.wikipedia.org/wiki/Ali_Abdi_(footballer)
Already collected!

https://en.wikipedia.org/wiki/Anis_Ben_Slimane
Already collected!

https://en.wikipedia.org/wiki/Mouez_Hassen
Already collected!

https://en.wikipedia.org/wiki/Keylor_Navas
Already collected!

https://en.wikipedia.org/wiki/

Already collected!

https://en.wikipedia.org/wiki/Timothy_Castagne
Already collected!

https://en.wikipedia.org/wiki/Arthur_Theate
Already collected!

https://en.wikipedia.org/wiki/Zeno_Debast
Already collected!

https://en.wikipedia.org/wiki/Wout_Faes
Already collected!

https://en.wikipedia.org/wiki/Axel_Witsel
Already collected!

https://en.wikipedia.org/wiki/Kevin_De_Bruyne
Already collected!

https://en.wikipedia.org/wiki/Yannick_Carrasco
Already collected!

https://en.wikipedia.org/wiki/Youri_Tielemans
Already collected!

https://en.wikipedia.org/wiki/Leander_Dendoncker
Already collected!

https://en.wikipedia.org/wiki/Hans_Vanaken
Already collected!

https://en.wikipedia.org/wiki/Leandro_Trossard
Already collected!

https://en.wikipedia.org/wiki/Charles_De_Ketelaere
Already collected!

https://en.wikipedia.org/wiki/Amadou_Onana
Already collected!

https://en.wikipedia.org/wiki/Eden_Hazard
Already collected!

https://en.wikipedia.org/wiki/Dries_Mertens
Already collected!

https:/

Already collected!

https://en.wikipedia.org/wiki/Collins_Fai
Already collected!

https://en.wikipedia.org/wiki/Nouhou_Tolo
Already collected!

https://en.wikipedia.org/wiki/Jean-Charles_Castelletto
Already collected!

https://en.wikipedia.org/wiki/Olivier_Mbaizo
Already collected!

https://en.wikipedia.org/wiki/Enzo_Ebosse
Already collected!

https://en.wikipedia.org/wiki/Christopher_Wooh
Already collected!

https://en.wikipedia.org/wiki/Andr%C3%A9-Frank_Zambo_Anguissa
Already collected!

https://en.wikipedia.org/wiki/Pierre_Kunde
Already collected!

https://en.wikipedia.org/wiki/Samuel_Gouet
Already collected!

https://en.wikipedia.org/wiki/Martin_Hongla
Already collected!

https://en.wikipedia.org/wiki/Ga%C3%ABl_Ondoua
Already collected!

https://en.wikipedia.org/wiki/Olivier_Ntcham
Already collected!

https://en.wikipedia.org/wiki/Jerome_Ngom_Mbekeli
Already collected!

https://en.wikipedia.org/wiki/Vincent_Aboubakar
Already collected!

https://en.wikipedia.org/wiki/Eric_Maxim_Chou

Already collected!

https://en.wikipedia.org/wiki/Jo%C3%A3o_F%C3%A9lix
Already collected!

https://en.wikipedia.org/wiki/Rafael_Le%C3%A3o
Already collected!

https://en.wikipedia.org/wiki/Ricardo_Horta
Already collected!

https://en.wikipedia.org/wiki/Gon%C3%A7alo_Ramos
Already collected!

https://en.wikipedia.org/wiki/Kim_Seung-gyu
Already collected!

https://en.wikipedia.org/wiki/Jo_Hyeon-woo
Already collected!

https://en.wikipedia.org/wiki/Song_Bum-keun
Already collected!

https://en.wikipedia.org/wiki/Kim_Young-gwon
Already collected!

https://en.wikipedia.org/wiki/Kim_Jin-su
Already collected!

https://en.wikipedia.org/wiki/Hong_Chul
Already collected!

https://en.wikipedia.org/wiki/Kim_Min-jae_(footballer)
Already collected!

https://en.wikipedia.org/wiki/Kwon_Kyung-won
Already collected!

https://en.wikipedia.org/wiki/Kim_Moon-hwan
Already collected!

https://en.wikipedia.org/wiki/Kim_Tae-hwan_(footballer,_born_1989)
Already collected!

https://en.wikipedia.org/wiki/Cho_Yu-min


https://www.transfermarkt.com/dummy/leistungsdaten/spieler/265481/plus/0?saison=ges
https://www.transfermarkt.com/dummy/leistungsdatenverein/spieler/265481

https://en.wikipedia.org/wiki/Romario_Ibarra
https://www.transfermarkt.com/dummy/profil/spieler/263605
https://www.transfermarkt.com/dummy/leistungsdaten/spieler/263605/plus/0?saison=ges
https://www.transfermarkt.com/dummy/leistungsdatenverein/spieler/263605

https://en.wikipedia.org/wiki/Ayrton_Preciado
https://www.transfermarkt.com/dummy/profil/spieler/212310
https://www.transfermarkt.com/dummy/leistungsdaten/spieler/212310/plus/0?saison=ges
https://www.transfermarkt.com/dummy/leistungsdatenverein/spieler/212310

https://en.wikipedia.org/wiki/Djorkaeff_Reasco
https://www.transfermarkt.com/dummy/profil/spieler/473346
https://www.transfermarkt.com/dummy/leistungsdaten/spieler/473346/plus/0?saison=ges
https://www.transfermarkt.com/dummy/leistungsdatenverein/spieler/473346

https://en.wikipedia.org/wiki/Enner_Valencia
https://www.tra