# Scraping de players por liga
Nesta seção serão obtidos todos os jogadores envolvidos em todas as transferências de 2000 a 2023 para cada liga

As informações dos jogadores de cada liga são salvas no arquivo 'data/players/{liga}_players.csv'

O arquivo 'data/players/all_players.csv' contém a concatenação dos players de todas as ligas, removendo jogadores repetidos

Os dados de jogadores têm o formato abaixo:

In [5]:
# Preview of the combined players file. The file is produced by the cell in
# this notebook that concatenates the per-league player CSVs, so it must
# already exist on disk for this cell to run.
import pandas as pd
pd.read_csv('data/players/all_players.csv')

Unnamed: 0,playerLink,currAge,height,foot,currClub,playerName,playerPos,playerCountry,playerMarketValue
0,/mark-viduka/profil/spieler/3242,47.0,1.88,both,Retired,Mark Viduka,CF,Australia,0
1,/jacob-burns/profil/spieler/4096,45.0,1.78,left,Retired,Jacob Burns,DM,Australia,0
2,/lee-matthews/profil/spieler/20977,44.0,1.88,right,Unknown,Lee Matthews,CM,England,0
3,/david-hopkin/profil/spieler/107198,52.0,1.83,right,Retired,David Hopkin,RM,Scotland,0
4,/ashley-ward/profil/spieler/13548,52.0,1.85,right,Retired,Ashley Ward,CF,England,0
...,...,...,...,...,...,...,...,...,...
8082,/hugo-ekitike/profil/spieler/709726,21.0,1.89,right,Paris Saint-Germain,Hugo Ekitiké,CF,France,20000000
8083,/falaye-sacko/profil/spieler/364857,28.0,1.79,right,Montpellier HSC,Falaye Sacko,RB,Mali,4000000
8084,/ibrahim-cissoko/profil/spieler/575693,20.0,,right,FC Toulouse,Ibrahim Cissoko,LW,Netherlands,2500000
8085,/mahdi-camara/profil/spieler/324795,25.0,1.78,right,Stade Brestois 29,Mahdi Camara,CM,France,3000000


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm

# Browser-like User-Agent header sent with every request made by getSoup.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
# Site root; getPlayerRow prepends it to the relative player links.
prefix = 'https://www.transfermarkt.com'
def getSoup(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout ``requests``
        waits indefinitely, so a single stalled connection would block a
        scrape of thousands of pages. Defaults to 30 seconds.

    Returns
    -------
    BeautifulSoup
        The response content parsed with the built-in ``'html.parser'``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).

    Notes
    -----
    The HTTP status code is not checked: an error page is parsed like any
    other page.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(res.content, 'html.parser')


### Função para realizar o scraping de uma página de jogador

In [2]:
def getPlayerRow(url):
    """Scrape one player profile page and return its basic attributes.

    The page at ``prefix + url`` is fetched with ``getSoup`` and every
    ``<span>`` on it is scanned. When a span's stripped text equals one of
    the labels ``'Age:'``, ``'Height:'``, ``'Foot:'`` or ``'Current club:'``,
    the text of the span that immediately follows it is taken as the value.
    If a label occurs more than once, the last occurrence wins.

    Parameters
    ----------
    url : str
        Player link relative to ``prefix``.

    Returns
    -------
    list
        ``[url, currAge, height, foot, currClub]``, all strings. A field
        whose label is not found keeps the placeholder ``'N/A'``.
    """
    page = getSoup(prefix + url)
    span_list = page.find_all('span')

    currAge = 'N/A'
    height = 'N/A'
    foot = 'N/A'
    currClub = 'N/A'

    labels = ('Age:', 'Height:', 'Foot:', 'Current club:')
    last_index = len(span_list) - 1

    for i, span in enumerate(span_list):
        label = span.text.strip()
        if label not in labels:
            continue
        if i >= last_index:
            # The label is the final span on the page, so there is no value
            # span after it; keep the placeholder instead of raising
            # IndexError.
            break

        value = span_list[i + 1].text
        if label == 'Age:':
            currAge = value.strip()
        elif label == 'Height:':
            # Drop the unit letter and use a dot as the decimal separator so
            # the text can be parsed as a number later (presumably the page
            # shows values like "1,88 m" - confirm against the site).
            height = value.replace('m', '').strip().replace(',', '.')
        elif label == 'Foot:':
            foot = value.strip()
        else:  # 'Current club:'
            currClub = value.strip()

    return [url, currAge, height, foot, currClub]

In [3]:
def getPlayersTable(playerLinks):
    """Scrape every link in ``playerLinks`` and collect the rows in a DataFrame.

    Each link is passed to ``getPlayerRow`` (with a tqdm progress bar), and
    the resulting rows become a frame with the columns ``playerLink``,
    ``currAge``, ``height``, ``foot`` and ``currClub``. ``currAge`` and
    ``height`` are converted to floats; values that cannot be parsed as
    numbers (such as the ``'N/A'`` placeholder) become NaN.
    """
    column_names = ['playerLink', 'currAge', 'height', 'foot', 'currClub']
    numeric_columns = ('currAge', 'height')

    rows = [
        getPlayerRow(player_link)
        for player_link in tqdm(playerLinks, total=len(playerLinks))
    ]

    df = pd.DataFrame(rows, columns=column_names)
    for column in numeric_columns:
        # errors='coerce' turns unparseable text into NaN instead of raising.
        parsed = pd.to_numeric(df[column], errors='coerce')
        df[column] = parsed.astype(float)
    return df

In [4]:
# Load the per-league transfer tables; one CSV per league code
# (pl, ll, bl, sa, l1), read in that order.
pl_transfers, ll_transfers, bl_transfers, sa_transfers, l1_transfers = map(
    pd.read_csv,
    [
        'data/transfers/pl_transfers.csv',
        'data/transfers/ll_transfers.csv',
        'data/transfers/bl_transfers.csv',
        'data/transfers/sa_transfers.csv',
        'data/transfers/l1_transfers.csv',
    ],
)

In [8]:
# Preview of the "pl" players file, i.e. the CSV written by
# getPlayersInfo("pl_players.csv", pl_transfers); it must already exist on disk.
pd.read_csv('data/players/pl_players.csv')

Unnamed: 0,playerLink,currAge,height,foot,currClub,playerName,playerPos,playerCountry,playerMarketValue
0,/olivier-dacourt/profil/spieler/3572,48.0,1.75,both,Retired,Olivier Dacourt,DM,France,0
1,/mark-viduka/profil/spieler/3242,47.0,1.88,both,Retired,Mark Viduka,CF,Australia,0
2,/jacob-burns/profil/spieler/4096,45.0,1.78,left,Retired,Jacob Burns,DM,Australia,0
3,/lee-matthews/profil/spieler/20977,44.0,1.88,right,Unknown,Lee Matthews,CM,England,0
4,/david-hopkin/profil/spieler/107198,52.0,1.83,right,Retired,David Hopkin,RM,Scotland,0
...,...,...,...,...,...,...,...,...,...
2389,/guglielmo-vicario/profil/spieler/286047,26.0,1.94,right,Tottenham Hotspur,Guglielmo Vicario,GK,Italy,16000000
2390,/harry-winks/profil/spieler/249126,27.0,1.78,right,Leicester City,Harry Winks,DM,England,10000000
2391,/boubacar-traore/profil/spieler/649020,21.0,1.83,,Wolverhampton Wanderers,Boubacar Traoré,DM,Mali,5000000
2392,/ruben-neves/profil/spieler/225161,26.0,1.80,right,Al-Hilal SFC,Rúben Neves,DM,Portugal,40000000


In [6]:
def getPlayersInfo(name, df):
    """Scrape all players referenced in ``df`` and save them as a CSV file.

    Parameters
    ----------
    name : str
        File name of the output CSV; it is written to
        ``'data/players/' + name``.
    df : pandas.DataFrame
        Transfers table. Must contain the columns ``playerLink``,
        ``playerName``, ``playerPos``, ``playerCountry`` and
        ``playerMarketValue``.

    Returns
    -------
    None
        The result is only written to disk (without the index column).

    Notes
    -----
    Each unique ``playerLink`` is scraped once through ``getPlayersTable``.
    The scraped attributes are then left-joined with the descriptive columns
    of ``df``; a player appearing in several rows of ``df`` keeps only the
    first joined row.
    """
    import os

    out_path = 'data/players/' + name
    # Create the output directory before the (long) scrape starts, so the run
    # cannot finish and then fail while writing the result.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    player_links = df['playerLink'].unique()
    players = getPlayersTable(player_links)

    info_columns = ['playerLink', 'playerName', 'playerPos', 'playerCountry', 'playerMarketValue']
    merged = pd.merge(players, df[info_columns], on='playerLink', how='left')
    merged = merged.drop_duplicates(subset=['playerLink'])

    merged.to_csv(out_path, index=False)

In [6]:
# Scrape every unique player linked in pl_transfers and write
# data/players/pl_players.csv (one HTTP request per player).
getPlayersInfo("pl_players.csv", pl_transfers)

In [7]:
# Scrape every unique player linked in ll_transfers and write
# data/players/ll_players.csv (one HTTP request per player).
getPlayersInfo("ll_players.csv", ll_transfers)

In [None]:
# Scrape every unique player linked in bl_transfers and write
# data/players/bl_players.csv (one HTTP request per player).
getPlayersInfo("bl_players.csv", bl_transfers)

In [None]:
# Scrape every unique player linked in sa_transfers and write
# data/players/sa_players.csv (one HTTP request per player).
getPlayersInfo("sa_players.csv", sa_transfers)

In [7]:
# Scrape every unique player linked in l1_transfers and write
# data/players/l1_players.csv (one HTTP request per player; the recorded
# progress bar for this cell shows 1706 players taking about 47 minutes).
getPlayersInfo("l1_players.csv", l1_transfers)

100%|██████████| 1706/1706 [47:21<00:00,  1.67s/it]  


In [3]:
# Combine the per-league player files into a single table.
leagues = ["pl", "ll", "bl", "sa", "l1"]
dfs = [pd.read_csv('data/players/' + code + '_players.csv') for code in leagues]

# A player can appear in more than one league file. keep='last' retains the
# row from the league that comes latest in `leagues`.
all_players = pd.concat(dfs, axis=0).drop_duplicates(
    subset=['playerLink'], keep='last'
)

all_players.to_csv('data/players/all_players.csv', index=False)