In [1]:
import hashlib
import time
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Silence FutureWarning messages. Note that the filter is installed for the rest of
# the session: nothing in the code shown here restores the previous warning settings.
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Configuration constants
DELAY = 3  # seconds slept after each team-page request
SEASON_RANGE = [*range(2023, 2021, -1)]  # one entry per season to scrape, newest first: [2023, 2022]
COMP = "Premier-League"  # IMPORTANT: this value is interpolated into URL below
# Base competition URL used as the starting point of the scrape.
# NOTE(review): the "9" path segment is hard-coded independently of COMP; it looks like
# the site's competition id, so switching league presumably means changing both - confirm.
URL = "https://fbref.com/en/comps/9/" + COMP + "-Stats"

In [3]:
url = URL # non-constant working copy of URL, meant to be reassigned as the flow progresses

In [4]:
def create_player_id(row):
    """Return a deterministic identifier for a player row.

    Args:
        row: Any mapping-like object (for example a DataFrame row) that can be
            indexed with the keys 'Player', 'Age' and 'Nation'.

    Returns:
        The MD5 hex digest (32 characters) of the three values joined with
        underscores, in the order Player, Age, Nation. Equal values always
        produce the same identifier.
    """
    identity = "_".join(f"{row[field]}" for field in ('Player', 'Age', 'Nation'))
    return hashlib.md5(identity.encode()).hexdigest()

def create_team_id(row):
    """Return a deterministic identifier for a team row.

    Args:
        row: Any mapping-like object (for example a DataFrame row) that can be
            indexed with the key 'Team'.

    Returns:
        The MD5 hex digest (32 characters) of the team name, so the same name
        always maps to the same identifier.
    """
    return hashlib.md5(f"{row['Team']}".encode()).hexdigest()

**VAMOS CAPTURAR AS TABELAS GERAIS DE JOGADORES E TIMES** 

*1 - CAPTURANDO A TABELA DE INFORMAÇÕES GERAIS DOS JOGADORES (Standard Stats)*

In [5]:
# Accumulators: one DataFrame is appended per team page that parses successfully.
teams_players_data_table = []  # full "Standard Stats" table plus the columns added below
players_table = []             # identity columns only: player_id, Player, Nation, Age
players_performance = []       # every column except Player, Nation and Age

# Columns discarded from each scraped table when they are present.
colunas_para_dropar = ['xG', 'npxG', 'xAG', 'npxG+xAG', 'G+A-PK', 'Matches', 'xG+xAG']

# Abbreviated table headers -> descriptive column names.
column_rename = {
    "MP": "Matches Played",
    "Starts": "Games started",
    "Min": "Minutes played",
    "90s": "Games without leave",
    "Gls": "Goals",
    "Ast": "Assists",
    "G+A": "Goals + Assists",
    "G-PK": "Non-Penalty Goals",
    "PK": "Penalty kicks made",
    "PKatt": "Penalty kicks attempted",
    "CrdY": "Yellow cards",
    "CrdR": "Red cards",
    "PrgC": "Progressive Carries",
    "PrgP": "Progressive Passes",
    "PrgR": "Progressive passes rec",
}

# Cell-local cursor over the season pages. It always starts at the base URL, so
# re-running this cell restarts from the first season instead of continuing from
# wherever a previous run left a shared variable.
season_url = URL

for year in SEASON_RANGE:
    # Fetch the competition page of the current season. An HTTP error is raised here
    # rather than surfacing later as an unrelated IndexError on the missing table.
    data = requests.get(season_url, timeout=30)
    data.raise_for_status()
    soup = BeautifulSoup(data.text, "html.parser")  # explicit parser instead of letting bs4 guess one
    table = soup.select('table.stats_table')[0]  # first stats table on the page

    # Collect the squad links of that table. href=True skips anchors that have no
    # href, which would otherwise make the '/squads/' test fail on None.
    links = [a["href"] for a in table.find_all('a', href=True)]
    team_urls = [f"https://fbref.com{l}" for l in links if '/squads/' in l]

    # Link to the previous season's page; it becomes the page fetched by the next iteration.
    previous_season = soup.select("#meta > div:nth-child(2) > div")[0].find('a').get("href")
    season_url = f"https://fbref.com{previous_season}"

    # Scrape the page of every team of this season.
    for team_url in team_urls:
        response = None
        try:
            # The request sits inside the try so that a timeout or connection error on
            # one team is reported and skipped instead of aborting the whole scrape.
            response = requests.get(team_url, timeout=30)
            response.raise_for_status()

            # Read the "Standard Stats" table. The HTML is wrapped in StringIO because
            # passing a literal HTML string to read_html is deprecated.
            team_player_data = pd.read_html(StringIO(response.text), match="Standard Stats")[0]

            # Drop the first level of the two-level column header.
            team_player_data.columns = team_player_data.columns.droplevel()
            # Keep the first occurrence of each duplicated column name, then remove
            # the unwanted columns that exist (missing ones are ignored).
            team_player_data = (
                team_player_data
                .loc[:, ~team_player_data.columns.duplicated(keep='first')]
                .drop(columns=colunas_para_dropar, errors="ignore")
            )

            # Team name taken from the last URL segment.
            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

            # Season label: use the second-to-last URL segment when it has the form
            # YYYY or YYYY-YYYY. Otherwise (the segment is something else, e.g. an id)
            # fall back to a label built from `year`.
            # NOTE(review): the fallback assumes `year` is the season's ending year,
            # matching the original 2023 -> '2022-2023' mapping - confirm.
            season_parts = team_url.split("/")[-2].split("-")
            if len(season_parts) in (1, 2) and all(len(p) == 4 and p.isdigit() for p in season_parts):
                season = "-".join(season_parts)
            else:
                season = f"{year - 1}-{year}"

            # Added feature columns.
            team_player_data["Season"] = season
            team_player_data["Team"] = team_name
            team_player_data["Comp"] = COMP
            team_player_data["player_id"] = team_player_data.apply(create_player_id, axis=1)

            team_player_data = team_player_data.rename(columns=column_rename)

            # Store the full table and its two projections.
            teams_players_data_table.append(team_player_data)
            players_performance.append(team_player_data.loc[:, ~team_player_data.columns.isin(['Player', 'Nation', 'Age'])])
            players_table.append(team_player_data[['player_id','Player','Nation','Age']])

        except Exception as e:
            # Best effort: report the failing team and carry on with the next one.
            # status_code is None when the request itself never produced a response.
            status_code = getattr(response, "status_code", None)
            print(f"""A solicitação para {team_url} falhou.
                  Erro: {e} \n
                  código de status: {status_code}""")

        # Pause after every team request, successful or not.
        time.sleep(DELAY)

In [6]:
url = URL # reset the working URL to the base URL, in case the scraping cell above modified it

*2 - CAPTURANDO A TABELA DE INFORMAÇÕES GERAIS DOS TIMES (Scores & Fixtures)*

In [None]:
# Accumulators: one DataFrame is appended per team page that parses successfully.
teams_data_table = []  # full "Scores & Fixtures" table plus the columns added below
matches_table = []     # same table without a few columns and with GF/GA renamed

# Cell-local cursor over the season pages. It always starts at the base URL, so this
# cell does not depend on a shared variable having been reset by an earlier cell.
season_url = URL

for year in SEASON_RANGE:
    # Fetch the competition page of the current season. An HTTP error is raised here
    # rather than surfacing later as an unrelated IndexError on the missing table.
    data = requests.get(season_url, timeout=30)
    data.raise_for_status()
    soup = BeautifulSoup(data.text, "html.parser")  # explicit parser instead of letting bs4 guess one
    table = soup.select('table.stats_table')[0]  # first stats table on the page

    # Collect the squad links of that table. href=True skips anchors that have no
    # href, which would otherwise make the '/squads/' test fail on None.
    links = [a["href"] for a in table.find_all('a', href=True)]
    team_urls = [f"https://fbref.com{l}" for l in links if '/squads/' in l]

    # Link to the previous season's page; it becomes the page fetched by the next iteration.
    previous_season = soup.select("#meta > div:nth-child(2) > div")[0].find('a').get("href")
    season_url = f"https://fbref.com{previous_season}"

    # Scrape the page of every team of this season.
    for team_url in team_urls:
        response = None
        try:
            # The request sits inside the try so that a timeout or connection error on
            # one team is reported and skipped instead of aborting the whole scrape.
            response = requests.get(team_url, timeout=30)
            response.raise_for_status()

            # Read the "Scores & Fixtures" table. The HTML is wrapped in StringIO
            # because passing a literal HTML string to read_html is deprecated.
            team_data = pd.read_html(StringIO(response.text), match="Scores & Fixtures")[0]

            # Team name taken from the last URL segment.
            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

            # Season label: use the second-to-last URL segment when it has the form
            # YYYY or YYYY-YYYY. Otherwise (the segment is something else, e.g. an id)
            # fall back to a label built from `year`.
            # NOTE(review): the fallback assumes `year` is the season's ending year,
            # matching the original 2023 -> '2022-2023' mapping - confirm.
            season_parts = team_url.split("/")[-2].split("-")
            if len(season_parts) in (1, 2) and all(len(p) == 4 and p.isdigit() for p in season_parts):
                season = "-".join(season_parts)
            else:
                season = f"{year - 1}-{year}"

            # Added feature columns. Every row of this table belongs to the same team,
            # so the id is hashed once and broadcast instead of being recomputed per row.
            team_data["Season"] = season
            team_data["Team"] = team_name
            team_data["team_id"] = create_team_id({"Team": team_name})

            # Store the full table and its trimmed, renamed projection.
            teams_data_table.append(team_data)
            matches_table.append(
                team_data
                .drop(columns=['xG', 'xGA', 'Age', 'Notes', 'Match Report'], errors="ignore")
                .rename(columns={"GF":"Goals for", "GA":"Goals against"})
            )

        except Exception as e:
            # Best effort: report the failing team and carry on with the next one.
            # status_code is None when the request itself never produced a response.
            status_code = getattr(response, "status_code", None)
            print(f"""A solicitação para {team_url} falhou.
                  Erro: {e} \n
                  código de status: {status_code}""")

        # Pause after every team request, successful or not.
        time.sleep(DELAY)

***PARA VISUALIZAR AS TABELAS***

In [None]:
# Remove the limit on how many columns pandas shows when displaying a DataFrame.
pd.set_option('display.max_columns', None)

In [None]:
# Preview the first rows of the first collected player-identity table.
players_table[0].head()

In [None]:
# Preview the first rows of the first collected player-performance table.
players_performance[0].head()

In [None]:
# Preview the first rows of the first collected matches table.
matches_table[0].head()

In [8]:
# Turn every collected DataFrame into a list of row dictionaries (one dict per row).
# Each resulting list keeps one entry per source DataFrame, in the same order.
dict_players_performance = [frame.to_dict(orient="records") for frame in players_performance]

dict_players_table = [frame.to_dict(orient="records") for frame in players_table]

dict_matchs_table = [frame.to_dict(orient="records") for frame in matches_table]

In [10]:
from database import create_documents

In [11]:
# Hand every batch of records to create_documents under its target name.
# Each batch is the list of row dicts that came from one scraped DataFrame.
# NOTE(review): create_documents comes from the project's `database` module and is not
# visible here - presumably it writes the records to the named collection; confirm.
for collection_name, record_batches in (
    ('players_performance', dict_players_performance),
    ('players', dict_players_table),
    ('matches', dict_matchs_table),
):
    for records in record_batches:
        documents = create_documents(collection_name, records)