# Coleta de dados
Nesta seção, iremos coletar os dados por meio de web scraping e de APIs de outros sites.

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# User-Agent header passed to requests.get by getSoup.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
}

# Site root that getPlayerRow prepends to each relative player link.
prefix = 'https://www.transfermarkt.com'
def getSoup(url, timeout=30):
    """Fetch ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Absolute URL to request.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` so that
            a stalled connection raises instead of blocking the notebook
            indefinitely. Defaults to 30.

    Returns:
        A ``BeautifulSoup`` object built from the raw response bytes using
        the ``'html.parser'`` backend.

    Note:
        The HTTP status code is not checked here; an error page is parsed
        like any other response.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(res.content, 'html.parser')


### Função para realizar o scraping de uma página de jogador

In [12]:
def getPlayerRow(url):
    """Scrape one player page into a flat row of profile fields.

    The page at ``prefix + url`` is fetched with ``getSoup`` and all of its
    ``<span>`` elements are scanned. Whenever a span's stripped text equals
    one of the labels ``'Age:'``, ``'Height:'``, ``'Foot:'`` or
    ``'Current club:'``, the text of the span that follows it is taken as
    that field's value.

    Args:
        url: Player link relative to ``prefix``.

    Returns:
        ``[url, currAge, height, foot, currClub]``, all strings. A field
        whose label is not found stays ``'N/A'``. The height text has every
        ``'m'`` removed and ``','`` replaced by ``'.'``.
    """
    page = getSoup(prefix + url)
    span_texts = [span.text for span in page.find_all('span')]

    # Every field defaults to 'N/A' when its label is absent from the page.
    values = {
        'Age:': 'N/A',
        'Height:': 'N/A',
        'Foot:': 'N/A',
        'Current club:': 'N/A',
    }

    # Pair each span with the one right after it. zip() stops one short of
    # the end, so a label that happens to be the last span on the page is
    # skipped instead of raising IndexError on the missing value span.
    for label_text, value_text in zip(span_texts, span_texts[1:]):
        label = label_text.strip()
        if label == 'Height:':
            # Drop the 'm' characters and switch to a dot decimal separator.
            values[label] = value_text.replace('m', '').strip().replace(',', '.')
        elif label in values:
            # If a label occurs more than once, the last occurrence wins.
            values[label] = value_text.strip()

    return [
        url,
        values['Age:'],
        values['Height:'],
        values['Foot:'],
        values['Current club:'],
    ]

In [13]:
def getPlayersTable(playerLinks):
    """Build a DataFrame with one scraped row per player link.

    Args:
        playerLinks: Iterable of player links; each one is handed to
            ``getPlayerRow``.

    Returns:
        A DataFrame with columns ``playerLink``, ``currAge``, ``height``,
        ``foot`` and ``currClub``. ``currAge`` and ``height`` are converted
        to float; values that cannot be parsed as numbers become NaN.
    """
    columnNames = ['playerLink', 'currAge', 'height', 'foot', 'currClub']
    rows = [getPlayerRow(playerLink) for playerLink in playerLinks]
    players = pd.DataFrame(rows, columns=columnNames)

    # errors='coerce' turns unparseable values into NaN rather than raising.
    for numericColumn in ('currAge', 'height'):
        parsed = pd.to_numeric(players[numericColumn], errors='coerce')
        players[numericColumn] = parsed.astype(float)

    return players

In [14]:
# Load the transfer records from the local CSV file.
pl_transfers = pd.read_csv('data/pl_transfers.csv')
# Bare last expression: the notebook renders the DataFrame as the cell output.
pl_transfers

Unnamed: 0,playerName,playerLink,playerAge,playerPos,playerCountry,playerMarketValue,transferFee,clubLeft,clubLeftUrl,clubJoined,clubJoinedUrl,teamTransferType,season,valuation
0,Joe Murphy,/joe-murphy/profil/spieler/3655,20,GK,Ireland,50000,250000,Tranmere Rovers,/tranmere-rovers/startseite/verein/1074,West Bromwich Albion,/west-bromwich-albion/startseite/verein/984,out,2002,-200000
1,Cristiano Ronaldo,/cristiano-ronaldo/profil/spieler/8198,18,CF,Portugal,15000000,19000000,Sporting CP,/sporting-lissabon/startseite/verein/336,Manchester United,/manchester-united/startseite/verein/985,out,2003,-4000000
2,Gaël Clichy,/gael-clichy/profil/spieler/7449,18,LB,France,100000,375000,AS Cannes,/as-cannes/startseite/verein/895,Arsenal FC,/fc-arsenal/startseite/verein/11,out,2003,-275000
3,Scott Carson,/scott-carson/profil/spieler/14555,19,GK,England,200000,1400000,Leeds United,/leeds-united/startseite/verein/399,Liverpool FC,/fc-liverpool/startseite/verein/31,out,2004,-1200000
4,James Milner,/james-milner/profil/spieler/3333,18,CM,England,1500000,7400000,Leeds United,/leeds-united/startseite/verein/399,Newcastle United,/newcastle-united/startseite/verein/762,out,2004,-5900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1887,Pedro Porro,/pedro-porro/profil/spieler/553875,23,RB,Spain,40000000,45000000,Sporting CP,/sporting-lissabon/startseite/verein/336,Tottenham Hotspur,/tottenham-hotspur/startseite/verein/148,out,2023,-5000000
1888,Dejan Kulusevski,/dejan-kulusevski/profil/spieler/431755,23,RW,Sweden,50000000,30000000,Juventus FC,/juventus-turin/startseite/verein/506,Tottenham Hotspur,/tottenham-hotspur/startseite/verein/148,out,2023,20000000
1889,Arthur Masuaku,/arthur-masuaku/profil/spieler/181380,29,LB,DR Congo,6000000,2000000,West Ham United,/west-ham-united/startseite/verein/379,Besiktas JK,/besiktas-istanbul/startseite/verein/114,in,2023,4000000
1890,Boubacar Traoré,/boubacar-traore/profil/spieler/649020,21,DM,Mali,5000000,11000000,FC Metz,/fc-metz/startseite/verein/347,Wolverhampton Wanderers,/wolverhampton-wanderers/startseite/verein/543,out,2023,-6000000


In [18]:
# Deduplicate the links so each player's page is requested only once, even
# when the same link appears in several transfer rows.
player_links = pl_transfers['playerLink'].unique()

# getPlayersTable fetches and parses one profile page per link.
pl_players = getPlayersTable(player_links)
# Bare last expression: the notebook renders the resulting table.
pl_players

Unnamed: 0,playerLink,currAge,height,foot,currClub
0,/joe-murphy/profil/spieler/3655,41.0,1.88,,Tranmere Rovers
1,/cristiano-ronaldo/profil/spieler/8198,38.0,1.87,right,Al-Nassr FC
2,/gael-clichy/profil/spieler/7449,37.0,1.76,left,Servette FC
3,/scott-carson/profil/spieler/14555,37.0,1.88,right,Manchester City
4,/james-milner/profil/spieler/3333,37.0,1.75,right,Liverpool FC
...,...,...,...,...,...
1182,/christopher-nkunku/profil/spieler/344381,25.0,1.78,right,RB Leipzig
1183,/yankuba-minteh/profil/spieler/1012534,18.0,,left,Odense Boldklub
1184,/dejan-kulusevski/profil/spieler/431755,23.0,1.86,left,Tottenham Hotspur
1185,/boubacar-traore/profil/spieler/649020,21.0,1.83,,Wolverhampton Wanderers


In [23]:
# Attach position, country and market value from the transfer records to each
# scraped player row.
scraped_cols = ['playerLink', 'currAge', 'height', 'foot', 'currClub']
transfer_cols = ['playerLink', 'playerPos', 'playerCountry', 'playerMarketValue']

# Selecting only the scraped columns makes this cell safe to re-run: because
# pl_players is overwritten below, merging the already-enriched frame again
# would otherwise produce suffixed duplicates (playerPos_x, playerPos_y, ...).
merged = pd.merge(
    pl_players[scraped_cols],
    pl_transfers[transfer_cols],
    on='playerLink',
    how='left',
)

# A link that occurs in several transfer rows yields several merged rows;
# keep only the first one per player.
merged = merged.drop_duplicates(subset=['playerLink'])
pl_players = merged

In [25]:
# Write the player table to CSV; index=False leaves the DataFrame index out of the file.
pl_players.to_csv('data/pl_players.csv', index=False)