In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from pprint import pprint
import re
import requests
from tqdm.notebook import tqdm
import time

In [2]:
# Mapping written by earlier runs of this notebook (same path the last cell writes to).
# Its `wikipedia_page` column is used further down to skip pages already resolved.
df_old = pd.read_csv(filepath_or_buffer="../output/2022-wikipedia-to-transfermarkt.csv")

In [3]:
# Preview the first five previously saved rows.
df_old.head(n=5)

Unnamed: 0,wikipedia_page,transfermarkt_id,transfermarkt_name
0,https://en.wikipedia.org/wiki/Remko_Pasveer,25520,Remko Pasveer
1,https://en.wikipedia.org/wiki/Jurri%C3%ABn_Timber,420243,Jurrien Timber
2,https://en.wikipedia.org/wiki/Matthijs_de_Ligt,326031,Matthijs de Ligt
3,https://en.wikipedia.org/wiki/Virgil_van_Dijk,139208,Virgil van Dijk
4,https://en.wikipedia.org/wiki/Nathan_Ak%C3%A9,177476,Nathan Aké


In [4]:
# Download the Wikipedia page listing every 2022 World Cup squad and parse it.
url = 'https://en.wikipedia.org/wiki/2022_FIFA_World_Cup_squads'
r = requests.get(url, timeout=30)  # bounded wait so the cell cannot hang forever
r.raise_for_status()  # stop here instead of silently parsing an HTTP error page
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so results could differ between machines.
soup = BeautifulSoup(r.text, "html.parser")

In [5]:
# Every <table> element on the squads page; narrowed to squad tables in the next cell.
tables = soup.find_all(name='table')

In [6]:
# Keep only the tables containing exactly two <abbr> elements
# (per the original note, only player tables have 2 abbr).
tables = list(filter(lambda candidate: len(candidate.find_all("abbr")) == 2, tables))
len(tables)

32

In [7]:
# Walk every squad table and collect, for each player row, the href of the
# first link inside the row's header cell.
player_links = []
for squad_table in tables:
    player_rows = squad_table.find_all("tr", class_='nat-fs-player')
    player_links.extend(row.find('th').find('a')['href'] for row in player_rows)

In [8]:
# After finding all players, retrieve the Wikidata entry for each one of them.
# Each appended dict holds: the article URL ("query"), the Wikidata URL taken
# from the article's "t-wikibase" list item, and the first run of digits found
# in that URL ("wikibase_item").
wikibase_list = []

# Build the lookup once; membership tests on a set are O(1), whereas the
# original rebuilt a list of known pages on every iteration.
known_pages = set(df_old.wikipedia_page)

for player_url in tqdm(player_links):

    query = f"https://en.wikipedia.org{player_url}"

    # Pages already present in the previous CSV do not need to be fetched again.
    if query in known_pages:
        continue

    print(query)

    # Use cell-local names so the squads-page `soup`/`r` from the earlier cell
    # are not overwritten (re-running the table cells would otherwise break).
    response = requests.get(query, timeout=30)
    if not response.ok:
        # e.g. a link to an article that does not exist; skip instead of aborting the run
        print(f"request failed ({response.status_code}), skipping: {query}")
        continue
    page_soup = BeautifulSoup(response.text, "html.parser")

    # The link may be absent; guard each lookup instead of raising AttributeError.
    wikibase_li = page_soup.find("li", {"id": "t-wikibase"})
    wikibase_anchor = wikibase_li.find('a') if wikibase_li is not None else None
    if wikibase_anchor is None:
        print(f"no wikidata link found, skipping: {query}")
        continue

    wikibase_url = wikibase_anchor['href']
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    id_match = re.search(r"\d+", wikibase_url)
    if id_match is None:
        print(f"no numeric id in wikidata url, skipping: {query}")
        continue
    wikibase_item = id_match.group(0)

    datapoint = {
        "query": query,
        "wikibase_url": wikibase_url,
        "wikibase_item": wikibase_item,
    }

    print(datapoint)

    wikibase_list.append(datapoint)

  0%|          | 0/831 [00:00<?, ?it/s]

https://en.wikipedia.org/wiki/Alexander_Dom%C3%ADnguez
{'query': 'https://en.wikipedia.org/wiki/Alexander_Dom%C3%ADnguez', 'wikibase_url': 'https://www.wikidata.org/wiki/Special:EntityPage/Q2667325', 'wikibase_item': '2667325'}
https://en.wikipedia.org/wiki/Hern%C3%A1n_Gal%C3%ADndez
{'query': 'https://en.wikipedia.org/wiki/Hern%C3%A1n_Gal%C3%ADndez', 'wikibase_url': 'https://www.wikidata.org/wiki/Special:EntityPage/Q5742434', 'wikibase_item': '5742434'}
https://en.wikipedia.org/wiki/Mois%C3%A9s_Ram%C3%ADrez
{'query': 'https://en.wikipedia.org/wiki/Mois%C3%A9s_Ram%C3%ADrez', 'wikibase_url': 'https://www.wikidata.org/wiki/Special:EntityPage/Q60842106', 'wikibase_item': '60842106'}
https://en.wikipedia.org/wiki/Robert_Arboleda
{'query': 'https://en.wikipedia.org/wiki/Robert_Arboleda', 'wikibase_url': 'https://www.wikidata.org/wiki/Special:EntityPage/Q22082660', 'wikibase_item': '22082660'}
https://en.wikipedia.org/wiki/Xavier_Arreaga
{'query': 'https://en.wikipedia.org/wiki/Xavier_Arreaga

In [9]:
# Accumulators filled by the scraping loop in the next cell:
#   dataset  -> one row dict per resolved player
#   scrapped -> the Wikidata ids that were processed
dataset, scrapped = [], []

In [10]:
# For each player, get their transfermarkt id from wikidata.
# Reads the entity page and looks inside the block for property P2446.
# Appends one row per player to `dataset` and the Wikidata id to `scrapped`.

# Build the lookup once instead of rebuilding a list on every iteration.
known_pages = set(df_old.wikipedia_page)

for wikibase_entry in tqdm(wikibase_list):

    wikibase_url = wikibase_entry['wikibase_url']
    wikipedia_page = wikibase_entry['query']
    wikibase_id = wikibase_entry['wikibase_item']

    if wikipedia_page in known_pages:
        continue

    # Skip ids already handled in this session, so re-running the cell
    # (e.g. after an interruption) does not append duplicate rows.
    if wikibase_id in scrapped:
        continue

    print(wikibase_id)

    # Cell-local names: avoid clobbering the notebook-level `soup`/`r`.
    response = requests.get(wikibase_url, timeout=30)
    if not response.ok:
        print(f"request failed ({response.status_code}), skipping: {wikipedia_page}")
        continue
    entity_soup = BeautifulSoup(response.text, "html.parser")

    # Either lookup can return None when the entity has no such statement;
    # the original raised AttributeError and aborted the whole loop.
    div = entity_soup.find("div", {"data-property-id": "P2446"})
    transfermarkt_id = (
        div.find("a", class_="wb-external-id external") if div is not None else None
    )
    if transfermarkt_id is None:
        print(f"no transfermarkt id on wikidata, skipping: {wikipedia_page}")
        continue

    # The second value snak inside the property block is used as the name
    # (presumably a qualifier on the statement - TODO confirm); it is optional.
    try:
        transfermarkt_name = div.find_all("div", class_="wikibase-snakview-value wikibase-snakview-variation-valuesnak")[1].text
    except IndexError:
        transfermarkt_name = 'not on wikidata'

    row = {
        'wikipedia_page': wikipedia_page,
        'transfermarkt_id': transfermarkt_id.text,
        'transfermarkt_name': transfermarkt_name
    }

    print(row)

    dataset.append(row)
    scrapped.append(wikibase_id)

  0%|          | 0/26 [00:00<?, ?it/s]

2667325
{'wikipedia_page': 'https://en.wikipedia.org/wiki/Alexander_Dom%C3%ADnguez', 'transfermarkt_id': '84310', 'transfermarkt_name': 'Alexander Domínguez'}
5742434
{'wikipedia_page': 'https://en.wikipedia.org/wiki/Hern%C3%A1n_Gal%C3%ADndez', 'transfermarkt_id': '77127', 'transfermarkt_name': 'not on wikidata'}
60842106
{'wikipedia_page': 'https://en.wikipedia.org/wiki/Mois%C3%A9s_Ram%C3%ADrez', 'transfermarkt_id': '450233', 'transfermarkt_name': 'not on wikidata'}
22082660
{'wikipedia_page': 'https://en.wikipedia.org/wiki/Robert_Arboleda', 'transfermarkt_id': '139867', 'transfermarkt_name': 'Robert Arboleda'}
19406042
{'wikipedia_page': 'https://en.wikipedia.org/wiki/Xavier_Arreaga', 'transfermarkt_id': '323854', 'transfermarkt_name': 'Xavier Arreaga'}
20994068
{'wikipedia_page': 'https://en.wikipedia.org/wiki/Pervis_Estupi%C3%B1%C3%A1n', 'transfermarkt_id': '349599', 'transfermarkt_name': 'Pervis Estupiñán'}
101053044
{'wikipedia_page': 'https://en.wikipedia.org/wiki/Piero_Hincapi%

In [11]:
# Turn the scraped row dicts into a DataFrame (one row per newly resolved player).
df = pd.DataFrame(data=dataset)

In [12]:
# Combine the previously saved rows with the newly scraped ones.
# Built from `df_old` and `dataset` rather than from `df` itself, so re-running
# this cell cannot append the new rows a second time. One row is kept per
# Wikipedia page (the earliest, i.e. the previously saved one, wins) and the
# index is renumbered so it has no repeated labels.
df = (
    pd.concat([df_old, pd.DataFrame(dataset)], ignore_index=True)
    .drop_duplicates(subset="wikipedia_page")
    .reset_index(drop=True)
)

In [13]:
# Persist the combined mapping, overwriting the CSV that was loaded as `df_old`;
# the DataFrame index is not written.
df.to_csv(path_or_buf="../output/2022-wikipedia-to-transfermarkt.csv", index=False)