# Scraping das transferências por liga
Nesta seção serão obtidas todas as transferências de 2000 a 2023 para cada liga

Transferências entre clubes da mesma liga serão consideradas apenas uma transferência, com 'teamTransferType' = 'out'

Transferências são salvas no diretório 'data/transfers/{liga}_transfers.csv'

O arquivo 'data/transfers/all_transfers.csv' contém a concatenação das transferências de todas as ligas, removendo transferências repetidas

Dataframes de transferências têm o tipo abaixo:

In [2]:
import pandas as pd
pd.read_csv('data/transfers/all_transfers.csv')

Unnamed: 0,playerName,playerLink,playerAge,playerPos,playerCountry,playerMarketValue,transferFee,clubLeft,clubLeftUrl,clubJoined,clubJoinedUrl,teamTransferType,season,valuation
0,Mark Viduka,/mark-viduka/profil/spieler/3242,24,CF,Australia,0,9150000,Celtic FC,/celtic-glasgow/startseite/verein/371,Leeds United,/leeds-united/startseite/verein/399,in,2000,-9150000
1,Jacob Burns,/jacob-burns/profil/spieler/4096,22,DM,Australia,0,375000,Parramatta Power,/parramatta-power/startseite/verein/2500,Leeds United,/leeds-united/startseite/verein/399,in,2000,-375000
2,Lee Matthews,/lee-matthews/profil/spieler/20977,22,CM,England,0,150000,Leeds United,/leeds-united/startseite/verein/399,Bristol City,/bristol-city/startseite/verein/698,out,2000,-150000
3,David Hopkin,/david-hopkin/profil/spieler/107198,29,RM,Scotland,0,3750000,Leeds United,/leeds-united/startseite/verein/399,Bradford City,/bradford-city/startseite/verein/1027,in,2000,-3750000
4,Ashley Ward,/ashley-ward/profil/spieler/13548,29,CF,England,0,2100000,Blackburn Rovers,/blackburn-rovers/startseite/verein/164,Bradford City,/bradford-city/startseite/verein/1027,in,2000,-2100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12978,Ibrahim Cissoko,/ibrahim-cissoko/profil/spieler/575693,20,LW,Netherlands,2500000,3000000,NEC Nijmegen,/nec-nijmegen/startseite/verein/467,FC Toulouse,/fc-toulouse/startseite/verein/415,in,2023,-500000
12979,Mahdi Camara,/mahdi-camara/profil/spieler/324795,25,CM,France,3000000,3000000,AS Saint-Étienne,/as-saint-etienne/startseite/verein/618,Stade Brestois 29,/stade-brest-29/startseite/verein/3911,in,2023,0
12980,Mostafa Mohamed,/mostafa-mohamed/profil/spieler/462348,25,CF,Egypt,6000000,5750000,Galatasaray,/galatasaray-istanbul/startseite/verein/141,FC Nantes,/fc-nantes/startseite/verein/995,in,2023,250000
12981,Andy Delort,/andy-delort/profil/spieler/122797,31,CF,Algeria,7000000,5000000,OGC Nice,/ogc-nizza/startseite/verein/417,FC Nantes,/fc-nantes/startseite/verein/995,in,2023,2000000


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
prefix = 'https://www.transfermarkt.com'
def getSoup(url):
    res = requests.get(url, headers = headers)
    return BeautifulSoup(res.content, 'html.parser')


### Função para realizar o web scraping das tableas de transferências no transfermarkt

In [30]:
def getClubUrl(url):
    url = url.replace('/transfers', '')  # Remove a substring '/transfers'
    match = re.search(r'/verein/(\d+)', url)
    if(not match):
        return "invalid";
    verein_id = match.group(1)  # Obtém o número após "/verein/"
    new_url = match.string[:match.start()] + \
        '/startseite' '/verein/' + verein_id
    return new_url


def eurValue(string):
    # Expressão regular para verificar se começa com '€' e termina com 'm' ou 'k'
    padrao = r'^€.*[mk]$'
    if re.match(padrao, string):
        return True
    else:
        return False


def eurToInt(string):
    valor_str = re.sub(r'[€mk]', '', string)  # Remove '€', 'm' e 'k' da string
    # Substitui a vírgula pelo ponto como separador decimal e converte para float
    valor_float = float(valor_str.replace(',', '.'))
    if 'm' in string:
        valor_int = int(valor_float * 1000000)  # Converte para milhões
    elif 'k' in string:
        valor_int = int(valor_float * 1000)  # Converte para milhares
    else:
        valor_int = int(valor_float)
    return valor_int


def getTransfersFromUrl(url, year):
    transferTable = []
    page = getSoup(url.format(year))
    tables = page.find_all('div', {'class': 'box'})
    tables = tables[3:]
    for table in tables:
        currTeamUrl = getClubUrl(table.find('a').get('href'))
        if(currTeamUrl == "invalid"):
            break;
        currTeam = table.find('a').get('title')
        arrivalsAndDepartures = table.find_all('table')

        arrivals = arrivalsAndDepartures[0].find_all('tr', recursive=True)
        arrivals = arrivals[1:]
        for transfer in arrivals:
            transfer = transfer.find_all('td')
            player = transfer[0].find('a')
            if isinstance(player, type(None)):
                continue
            playerName = player.text
            playerLink = player.get('href')
            playerAge = transfer[1].text
            playerPos = transfer[4].text
            if(transfer[2].find('img')):
                playerCountry = transfer[2].find('img').get('alt')
            else:
                playerCountry = "none"
            transferFee = transfer[8].text
            if (not eurValue(transferFee)):
                continue
            if (not eurValue(transfer[5].text)):
                playerMarketValue = 0
            else:
                playerMarketValue = eurToInt(transfer[5].text)
            transferFee = eurToInt(transferFee)
            if (transferFee < 10):
                continue
            clubLeftUrl = getClubUrl(transfer[6].find('a').get('href'))
            clubJoinedUrl = currTeamUrl
            clubLeft = transfer[6].find('a').get('title')
            clubJoined = currTeam
            teamTransferType = "in"

            transferTable.append([playerName, playerLink, playerAge, playerPos, playerCountry,
                                  playerMarketValue, transferFee, clubLeft, clubLeftUrl, clubJoined, clubJoinedUrl, teamTransferType])

        departures = arrivalsAndDepartures[1].find_all(
            'tr', recursive=True)
        departures = departures[1:]
        for transfer in departures:
            transfer = transfer.find_all('td')
            player = transfer[0].find('a')
            if isinstance(player, type(None)):
                continue
            playerName = player.text
            playerLink = player.get('href')
            playerAge = transfer[1].text
            playerPos = transfer[4].text

            if(transfer[2].find('img')):
                playerCountry = transfer[2].find('img').get('alt')
            else:
                playerCountry = "none"
            transferFee = transfer[8].text
            if (not eurValue(transferFee)):
                continue
            if (not eurValue(transfer[5].text)):
                playerMarketValue = 0
            else:
                playerMarketValue = eurToInt(transfer[5].text)
            transferFee = eurToInt(transferFee)
            if (transferFee < 10):
                continue
            clubJoinedUrl = getClubUrl(transfer[6].find('a').get('href'))
            clubLeftUrl = currTeamUrl
            clubLeft = currTeam
            clubJoined = transfer[6].find('a').get('title')
            teamTransferType = "out"

            transferTable.append([playerName, playerLink, playerAge, playerPos, playerCountry,
                                  playerMarketValue, transferFee, clubLeft, clubLeftUrl, clubJoined, clubJoinedUrl, teamTransferType])

    columns = ['playerName', 'playerLink', 'playerAge', 'playerPos', 'playerCountry',
               'playerMarketValue', 'transferFee', 'clubLeft', 'clubLeftUrl', 'clubJoined', 'clubJoinedUrl', 'teamTransferType']
    final_df = pd.DataFrame(transferTable, columns=columns)
    final_df['season'] = year
    final_df['playerAge'] = final_df['playerAge'].apply(
        lambda x: re.sub(r'\D+', '', str(x)))
    final_df['playerAge'] = final_df['playerAge'].astype(int)
    final_df['valuation'] = final_df['playerMarketValue'] - final_df['transferFee']
    return final_df

### Função para fazer o web scraping das tabelas de transferências de um determinado período e concatenar elas

In [31]:
def getTransfersInRange(begin, end, url):
    curr_df = getTransfersFromUrl(url, begin)
    for i in range(begin+1, end+1):
        curr_df = pd.concat(
            [curr_df, getTransfersFromUrl(url, i)]).reset_index(drop=True)
    return curr_df

### Obtendo dados das transferências para um range de temporadas da Premier League (demora 3 a 6 segundos por temporada, depende da internet)

In [26]:
pl = 'https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id={}&s_w=&leihe=0&intern=0'
ll = 'https://www.transfermarkt.com/laliga/transfers/wettbewerb/ES1/plus/?saison_id={}&s_w=&leihe=0&intern=0'
bl = 'https://www.transfermarkt.com/bundesliga/transfers/wettbewerb/L1/plus/?saison_id={}&s_w=&leihe=0&intern=0'
sa = 'https://www.transfermarkt.com/serie-a/transfers/wettbewerb/IT1/plus/?saison_id={}&s_w=&leihe=0&intern=0'
l1 = 'https://www.transfermarkt.com/ligue-1/transfers/wettbewerb/FR1/plus/?saison_id={}&s_w=&leihe=0&intern=0'

In [27]:
pl_transfers = getTransfersInRange(2000, 2023, pl)
ll_transfers = getTransfersInRange(2000, 2023, ll)
bl_transfers = getTransfersInRange(2000, 2023, bl)
sa_transfers = getTransfersInRange(2000, 2023, sa)
l1_transfers = getTransfersInRange(2000, 2023, l1)

In [28]:
pl_transferss = pl_transfers.drop_duplicates(subset=['playerLink', 'playerAge', 'transferFee'], keep='last')
pl_transferss.to_csv('data/transfers/pl_transfers.csv', index=False)

ll_transferss = ll_transfers.drop_duplicates(subset=['playerLink', 'playerAge', 'transferFee'], keep='last')
ll_transferss.to_csv('data/transfers/ll_transfers.csv', index=False)

bl_transferss = bl_transfers.drop_duplicates(subset=['playerLink', 'playerAge', 'transferFee'], keep='last')
bl_transferss.to_csv('data/transfers/bl_transfers.csv', index=False)

sa_transferss = sa_transfers.drop_duplicates(subset=['playerLink', 'playerAge', 'transferFee'], keep='last')
sa_transferss.to_csv('data/transfers/sa_transfers.csv', index=False)

l1_transferss = l1_transfers.drop_duplicates(subset=['playerLink', 'playerAge', 'transferFee'], keep='last')
l1_transferss.to_csv('data/transfers/l1_transfers.csv', index=False)

In [3]:
leagues = ["pl", "ll", "bl", "sa", "l1"]
dfs = []
for league in leagues:
    dfs.append(pd.read_csv('data/transfers/' +league + '_transfers.csv'))
all_transfers = pd.concat(dfs, axis=0)

all_transfers = all_transfers.drop_duplicates(subset=['playerLink', 'playerAge', 'transferFee', 'clubLeft', 'clubJoined'], keep='last')

all_transfers.to_csv('data/transfers/all_transfers.csv', index=False)