# Precificando Jogadores de Futebol

# Coleta de dados
Nesta seção iremos coletar os dados por meio de web scraping e de APIs de outros sites.

In [140]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Browser-like User-Agent header; getSoup sends it with every request.
# NOTE(review): presumably needed because the site rejects the default client
# User-Agent - confirm.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
# Root URL of the scraped site. Not referenced by the cells shown here;
# presumably meant to be prepended to the site-relative links that are
# collected - TODO confirm.
prefix = 'https://www.transfermarkt.com'
def getSoup(url, timeout=30):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang a multi-season scrape.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail here with the real HTTP status instead of parsing an error page
    # and crashing later on a missing element.
    res.raise_for_status()
    return BeautifulSoup(res.content, 'html.parser')

### Função para realizar o web scraping das tabelas de transferências no Transfermarkt

In [150]:
import re

def getClubUrl(url):
    """Rewrite a club link into its ``<slug>/startseite/verein/<id>`` form.

    Everything after the numeric club id (for example a trailing
    ``/saison_id/2000``) is dropped. Links that are already in the
    ``/startseite/verein/<id>`` form are returned unchanged rather than
    getting a second ``/startseite`` segment.

    Args:
        url: Club link, e.g. ``/leeds-united/transfers/verein/399/saison_id/2000``.

    Returns:
        The normalized link, e.g. ``/leeds-united/startseite/verein/399``.

    Raises:
        ValueError: If *url* contains no ``/verein/<digits>`` segment.
    """
    # Strip both page-type segments so the function is idempotent.
    stripped = url.replace('/transfers', '').replace('/startseite', '')
    match = re.search(r'/verein/(\d+)', stripped)
    if match is None:
        raise ValueError("no '/verein/<id>' segment in club URL: {!r}".format(url))
    verein_id = match.group(1)  # numeric club id that follows "/verein/"
    return stripped[:match.start()] + '/startseite/verein/' + verein_id
    

def eurValue(string):
    """Report whether *string* looks like an abbreviated euro amount.

    A match must start with '€' and end with the suffix 'm' or 'k', as in
    '€1.5m' or '€500k'. Anything else, including a bare '€500', gives False.

    Args:
        string: Cell text to check.

    Returns:
        True when the text matches the pattern, False otherwise.
    """
    return re.match(r'^€.*[mk]$', string) is not None

def eurToInt(string):
    """Convert an abbreviated euro amount such as '€2.01m' into whole euros.

    The characters '€', 'm' and 'k' are removed, a comma is accepted as the
    decimal separator, and the number is scaled by 1,000,000 for 'm' or by
    1,000 for 'k'. Text without a suffix is taken as a plain euro amount.

    Args:
        string: Amount text, e.g. '€26.00m', '€375k' or '€1,5m'.

    Returns:
        The amount as an ``int`` number of euros.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    digits = re.sub(r'[€mk]', '', string)
    amount = float(digits.replace(',', '.'))
    # Round the scaled value: binary floats make 2.01 * 1000000 come out as
    # 2009999.9999999998, which a bare int() would truncate to 2009999.
    if 'm' in string:
        return int(round(amount * 1000000))
    if 'k' in string:
        return int(round(amount * 1000))
    return int(amount)

def _parseTransferRows(rows, currTeam, isArrival):
    """Convert the <tr> rows of one arrivals or departures table into records.

    Rows are skipped when they have fewer than nine <td> cells, no player
    link in the first cell, or a fee cell that ``eurValue`` rejects.

    Args:
        rows: Iterable of <tr> tags (header row already removed).
        currTeam: Normalized URL of the club that owns the table.
        isArrival: True when the table lists players joining *currTeam*,
            False when it lists players leaving it.

    Returns:
        A list of records ``[playerName, playerLink, playerAge, playerPos,
        playerCountry, playerMarketValue, transferFee, clubLeft, clubJoined]``.
    """
    records = []
    for row in rows:
        cells = row.find_all('td')
        # Rows without the full set of cells cannot be indexed below.
        if len(cells) < 9:
            continue
        player = cells[0].find('a')
        if player is None:
            continue
        feeText = cells[8].text
        if not eurValue(feeText):
            continue

        flag = cells[2].find('img')
        playerCountry = flag.get('alt') if flag is not None else None

        valueText = cells[5].text
        # A market value that is not a euro amount is stored as 0.
        playerMarketValue = eurToInt(valueText) if eurValue(valueText) else 0

        otherClubLink = cells[6].find('a')
        otherClub = getClubUrl(otherClubLink.get('href')) if otherClubLink is not None else None
        if isArrival:
            clubLeft, clubJoined = otherClub, currTeam
        else:
            clubLeft, clubJoined = currTeam, otherClub

        records.append([player.text, player.get('href'), cells[1].text, cells[4].text,
                        playerCountry, playerMarketValue, eurToInt(feeText),
                        clubLeft, clubJoined])
    return records


def getTransfersFromUrl(url, year):
    """Scrape one season of a league transfer page into a DataFrame.

    Only transfers whose fee is an abbreviated euro amount (see ``eurValue``)
    are kept.

    NOTE(review): a move between two clubs that both appear on the page is
    presumably listed once as an arrival and once as a departure, so the
    result may contain duplicate rows - confirm and deduplicate downstream
    if needed.

    Args:
        url: URL template with one ``{}`` placeholder for the season.
        year: Season value substituted into *url*; also stored in the
            ``season`` column.

    Returns:
        A DataFrame with columns playerName, playerLink, playerAge,
        playerPos, playerCountry, playerMarketValue, transferFee, clubLeft,
        clubJoined and season.

    Raises:
        ValueError: If the page has no "Number of teams:" list item.
    """
    page = getSoup(url.format(year))
    teamCountItem = page.find(lambda tag: tag.name == "li" and "Number of teams:" in tag.text)
    if teamCountItem is None:
        raise ValueError('"Number of teams:" not found on page for season {}'.format(year))
    nTeams = int(teamCountItem.find('span').text.split()[0])

    # The club boxes are taken to start at the fourth 'box' div.
    # NOTE(review): presumably the first three are page chrome - confirm.
    boxes = page.find_all('div', {'class': 'box'})[3:nTeams+3]

    transferTable = []
    for box in boxes:
        currTeam = getClubUrl(box.find('a').get('href'))
        clubTables = box.find_all('table')
        # First table holds arrivals, second holds departures; [1:] drops
        # each table's header row.
        transferTable.extend(_parseTransferRows(clubTables[0].find_all('tr')[1:], currTeam, True))
        transferTable.extend(_parseTransferRows(clubTables[1].find_all('tr')[1:], currTeam, False))

    columns = ['playerName', 'playerLink', 'playerAge', 'playerPos', 'playerCountry',
               'playerMarketValue', 'transferFee', 'clubLeft', 'clubJoined']
    final_df = pd.DataFrame(transferTable, columns=columns)
    final_df['season'] = year
    return final_df

### Função para fazer o web scraping das tabelas de transferências de um determinado período e concatená-las

In [149]:
def getTransfersInRange(begin, end, url):
    """Scrape every season from *begin* through *end* (inclusive) into one DataFrame.

    Args:
        begin: First season to fetch. It is always fetched, even when
            *end* is smaller than *begin*.
        end: Last season to fetch.
        url: URL template passed unchanged to ``getTransfersFromUrl``.

    Returns:
        The per-season DataFrames stacked in season order with a fresh
        0..n-1 index.
    """
    frames = [getTransfersFromUrl(url, begin)]
    frames.extend(getTransfersFromUrl(url, season) for season in range(begin + 1, end + 1))
    # Concatenate once: concatenating inside the loop re-copies every
    # previously collected row on each iteration.
    return pd.concat(frames, ignore_index=True)

### Obtendo dados das transferências para um intervalo de temporadas da Premier League (demora de 3 a 6 segundos por temporada, dependendo da conexão com a internet)

In [151]:
# Transfer-page URL template; the `{}` after `saison_id=` is filled with the
# season by `url.format(year)` inside getTransfersFromUrl.
pl = r'https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id={}&s_w=&leihe=0&intern=0'
# Scrape seasons 2000 through 2023 inclusive (one page request per season)
# and stack them into a single DataFrame.
pl_transfers = getTransfersInRange(2000, 2023, pl)
# Bare expression so the notebook displays the resulting DataFrame.
pl_transfers

Unnamed: 0,playerName,playerLink,playerAge,playerPos,playerCountry,playerMarketValue,transferFee,clubLeft,clubJoined,season
0,Rio Ferdinand,/rio-ferdinand/profil/spieler/3235,22,CB,England,0,26000000,/west-ham-united/startseite/verein/379,/leeds-united/startseite/verein/399,2000
1,Olivier Dacourt,/olivier-dacourt/profil/spieler/3572,25,DM,France,0,10500000,/rc-lens/startseite/verein/826,/leeds-united/startseite/verein/399,2000
2,Mark Viduka,/mark-viduka/profil/spieler/3242,24,CF,Australia,0,9150000,/celtic-glasgow/startseite/verein/371,/leeds-united/startseite/verein/399,2000
3,Dominic Matteo,/dominic-matteo/profil/spieler/3765,26,LB,Scotland,0,7130000,/fc-liverpool/startseite/verein/31,/leeds-united/startseite/verein/399,2000
4,Jacob Burns,/jacob-burns/profil/spieler/4096,22,DM,Australia,0,375000,/parramatta-power/startseite/verein/2500,/leeds-united/startseite/verein/399,2000
...,...,...,...,...,...,...,...,...,...,...
4052,Chris Wood,/chris-wood/profil/spieler/108725,31,CF,New Zealand,10000000,17000000,/newcastle-united/startseite/verein/762,/nottingham-forest/startseite/verein/703,2023
4053,Chris Wood,/chris-wood/profil/spieler/108725,31,CF,New Zealand,10000000,17000000,/newcastle-united/startseite/verein/762,/nottingham-forest/startseite/verein/703,2023
4054,Pedro Porro,/pedro-porro/profil/spieler/553875,23,RB,Spain,35000000,45000000,/sporting-lissabon/startseite/verein/336,/tottenham-hotspur/startseite/verein/148,2023
4055,Arthur Masuaku,/arthur-masuaku/profil/spieler/181380,29,LB,DR Congo,6000000,2000000,/west-ham-united/startseite/verein/379,/besiktas-istanbul/startseite/verein/114,2023


# Análise exploratória dos dados
Nesta seção iremos apresentar análises e visualizações dos dados para entender melhor as relações entre variáveis, padrões relevantes e bons insights

# Pré-processamento dos dados
Nesta seção realizaremos o tratamento dos dados, removendo outliers, normalizando, padronizando e transformando os dados

# Construção do modelo
Nesta seção abordaremos a construção de um modelo de precificação dos jogadores

## Divisão dos dados
Nessa subseção iremos separar os dados em um conjunto de treinamento e um conjunto de teste