# Coleta de dados
Nesta seção iremos coletar os dados por meio de web scraping e de APIs de outros sites.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm

# Base URL that the site-relative links are appended to.
prefix = 'https://www.transfermarkt.com'

# HTTP headers sent with each request: a desktop-browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
}
def getSoup(url, timeout=30):
    """Download ``url`` and return its body parsed as HTML.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely, so a single stalled
            connection would freeze a scraping loop.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``'html.parser'`` backend.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.

    Note:
        The HTTP status code is not checked; an error page is parsed like any
        other response.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(res.content, 'html.parser')


### Função para realizar o scraping de uma página de jogador

In [2]:
def getPlayerRow(url):
    """Scrape one player page and return its basic attributes.

    The page is scanned as a flat list of ``<span>`` elements: whenever a span's
    stripped text equals one of the known labels, the text of the span that
    follows it is taken as the value. If a label occurs more than once, the
    last occurrence wins.

    Args:
        url: Site-relative player link; it is appended to ``prefix``.

    Returns:
        A list ``[url, currAge, height, foot, currClub]``. All scraped fields
        are strings; a field whose label is not found (or has no following
        span) is ``'N/A'``. ``height`` has every ``'m'`` removed and ``','``
        replaced by ``'.'``.
    """
    page = getSoup(prefix + url)
    span_texts = [span.text for span in page.find_all('span')]

    def _strip(text):
        return text.strip()

    def _clean_height(text):
        # Drop the unit letter, then turn the decimal comma into a dot.
        return text.replace('m', '').strip().replace(',', '.')

    # Label text -> function that cleans the value found in the next span.
    cleaners = {
        'Age:': _strip,
        'Height:': _clean_height,
        'Foot:': _strip,
        'Current club:': _strip,
    }
    values = dict.fromkeys(cleaners, 'N/A')

    # zip() pairs every span with its successor and stops before the final
    # span, so a label that is the last span on the page simply keeps 'N/A'
    # instead of raising IndexError.
    for label_text, value_text in zip(span_texts, span_texts[1:]):
        label = label_text.strip()
        if label in cleaners:
            values[label] = cleaners[label](value_text)

    return [url, values['Age:'], values['Height:'], values['Foot:'], values['Current club:']]

In [3]:
def getPlayersTable(playerLinks):
    """Scrape every link in ``playerLinks`` and gather the rows in a DataFrame.

    Args:
        playerLinks: Sized collection of player links, each passed to
            ``getPlayerRow``. Progress is reported with ``tqdm``.

    Returns:
        DataFrame with columns ``playerLink``, ``currAge``, ``height``,
        ``foot`` and ``currClub``. ``currAge`` and ``height`` are floats;
        values that cannot be parsed as numbers become NaN.
    """
    column_names = ['playerLink', 'currAge', 'height', 'foot', 'currClub']
    scraped_rows = [
        getPlayerRow(player_link)
        for player_link in tqdm(playerLinks, total=len(playerLinks))
    ]
    players_df = pd.DataFrame(scraped_rows, columns=column_names)

    # Non-numeric entries are coerced to NaN rather than raising.
    for numeric_col in ('currAge', 'height'):
        players_df[numeric_col] = pd.to_numeric(
            players_df[numeric_col], errors='coerce'
        ).astype(float)
    return players_df

In [4]:
# Load the five transfer CSVs; each becomes its own DataFrame.
# NOTE(review): the pl/ll/bl/sa/l1 prefixes presumably stand for leagues — confirm.
pl_transfers, ll_transfers, bl_transfers, sa_transfers, l1_transfers = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/transfers/pl_transfers.csv',
        'data/transfers/ll_transfers.csv',
        'data/transfers/bl_transfers.csv',
        'data/transfers/sa_transfers.csv',
        'data/transfers/l1_transfers.csv',
    )
]

In [6]:
def getPlayersInfo(name, df):
    """Scrape every unique player in ``df`` and save the result as a CSV.

    The scraped attributes are joined with the player columns already present
    in ``df`` and written to ``'data/players/' + name`` with one row per
    ``playerLink``.

    Args:
        name: File name of the CSV to write inside ``data/players/``.
        df: DataFrame containing at least the columns ``playerLink``,
            ``playerName``, ``playerPos``, ``playerCountry`` and
            ``playerMarketValue``.

    Returns:
        None. The result is only written to disk.
    """
    import os  # local import so this cell does not depend on an edited import cell

    out_path = 'data/players/' + name
    # Create the output folder *before* scraping: to_csv fails on a missing
    # directory, and discovering that after the whole scrape has finished
    # would throw the results away.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    player_links = df['playerLink'].unique()
    scraped = getPlayersTable(player_links)

    info_cols = ['playerLink', 'playerName', 'playerPos', 'playerCountry', 'playerMarketValue']
    players = (
        scraped
        .merge(df[info_cols], on='playerLink', how='left')
        # A player appears once per row of df; keep the first match only.
        .drop_duplicates(subset=['playerLink'])
    )
    players.to_csv(out_path, index=False)

In [6]:
# Scrape the players linked in pl_transfers and save them to data/players/pl_players.csv.
getPlayersInfo(name="pl_players.csv", df=pl_transfers)

In [7]:
# Scrape the players linked in ll_transfers and save them to data/players/ll_players.csv.
getPlayersInfo(name="ll_players.csv", df=ll_transfers)

In [None]:
# Scrape the players linked in bl_transfers and save them to data/players/bl_players.csv.
getPlayersInfo(name="bl_players.csv", df=bl_transfers)

In [None]:
# Scrape the players linked in sa_transfers and save them to data/players/sa_players.csv.
getPlayersInfo(name="sa_players.csv", df=sa_transfers)

In [7]:
# Scrape the players linked in l1_transfers and save them to data/players/l1_players.csv.
# Slow cell: the recorded progress bar below shows 1706 pages taking about 47 minutes.
getPlayersInfo(name="l1_players.csv", df=l1_transfers)

100%|██████████| 1706/1706 [47:21<00:00,  1.67s/it]  
