# Webscraping Brazilian National Football League

## Set up the environment

First, uncomment the line in the cell below to install the required libraries. That cell only needs to be executed once.

In [7]:
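# Uncomment the line below to install the dependencies (only needed once).
# The `bs4` module is distributed on PyPI as `beautifulsoup4`.
# %pip install beautifulsoup4 requests pandas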

Now import all the libraries needed for the project.

In [36]:
import pandas as pd
import json
from bs4 import BeautifulSoup, Tag, NavigableString
import requests
from typing import List, Tuple
import re
from datetime import datetime

## Declare functions

First, we need a function that does the scraping for us.

In [21]:
def scrap(urls: List[str], timeout: float = 30.0) -> List[BeautifulSoup]:
    """Download every URL and parse each response body as HTML.

    Args:
        urls: Addresses to fetch, in the order the parsed pages should be returned.
        timeout: Seconds to wait for the server before giving up on a request.
            Without a timeout ``requests`` may block forever on a stalled connection.

    Returns:
        One ``BeautifulSoup`` document per URL, in the same order as ``urls``.

    Raises:
        requests.HTTPError: If a response has a 4xx/5xx status code.
        requests.RequestException: On connection errors or timeouts.
    """
    soups = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of parsing an error page as if it
        # were a results page.
        response.raise_for_status()
        soups.append(BeautifulSoup(response.text, 'html.parser'))
    return soups

The dates come in a format such as `01.abr.2024` (day, Portuguese month abbreviation, year),
so we need a function that converts such a string to `dd/mm/yyyy`.

In [42]:
def format_date(date_str: str) -> str:
    """Convert a ``dd.mmm.yyyy`` date with a Portuguese month abbreviation to ``dd/mm/yyyy``.

    Example: ``"01.abr.2024"`` becomes ``"01/04/2024"``.

    Args:
        date_str: Date made of three dot-separated parts: day, month
            abbreviation (case-insensitive, only the first three letters are
            used) and year.

    Returns:
        The date as ``day/month/year`` with the month as a two-digit number.
        The day and year parts are passed through unchanged.

    Raises:
        ValueError: If ``date_str`` does not have exactly three dot-separated
            parts, or if the month abbreviation is not recognised.
    """
    months = {
        'jan': '01', 'fev': '02', 'mar': '03', 'abr': '04',
        'mai': '05', 'jun': '06', 'jul': '07', 'ago': '08',
        'set': '09', 'out': '10', 'nov': '11', 'dez': '12',
        # English spelling kept as an alias for inputs that already relied on it.
        'feb': '02',
    }
    day, month_abbr, year = date_str.strip().split('.')
    month = months.get(month_abbr[:3].lower())
    if month is None:
        # Refuse to build a string such as "01/None/2024".
        raise ValueError(f"Unknown month abbreviation {month_abbr!r} in date {date_str!r}")
    return f"{day}/{month}/{year}"

Each match is in a `tr` row, so we need to extract all of its `td` cells.

In [46]:
def extract_data_table(table: Tag | NavigableString, competition: str, season: int | str) -> Tuple[List[str | None], List[str | None], List[str], List[str], List[str], List[str], List[str], List[int | str]]:
    """Collect the match data of one results table as eight parallel lists.

    Only ``tr`` elements that carry an ``id`` attribute and have at least six
    ``td`` cells are read. Cell 1 holds the date/time text, cell 2 the host,
    cell 3 the score, cell 4 the visitor and cell 5 the round.

    Args:
        table: Parsed table element whose rows are scanned.
        competition: Value repeated for every row in the competitions list.
        season: Value repeated for every row in the seasons list; it is stored
            as received, without conversion.

    Returns:
        ``(dates, times, hosts, visitors, scores, rounds, competitions, seasons)``,
        all of the same length. A date is formatted by ``format_date``; a date
        or time is ``None`` when its pattern is not found in the cell text.
    """
    # Compiled once because the same two patterns are applied to every row.
    date_pattern = re.compile(r"(\d{2}\.\w{3}\.\d{4})")
    time_pattern = re.compile(r"(\d{2}:\d{2})")

    dates, times, hosts, visitors = [], [], [], []
    scores, rounds, competitions, seasons = [], [], [], []

    for row in table.find_all('tr', id=True):
        cells = row.find_all('td')
        if len(cells) < 6:
            # Not a full match row; nothing to extract.
            continue

        kickoff_text = cells[1].get_text(strip=True)
        date_match = date_pattern.search(kickoff_text)
        time_match = time_pattern.search(kickoff_text)

        dates.append(format_date(date_match.group(0)) if date_match else None)
        times.append(time_match.group(0) if time_match else None)
        hosts.append(cells[2].get_text(strip=True))
        scores.append(cells[3].get_text(strip=True))
        visitors.append(cells[4].get_text(strip=True))
        rounds.append(cells[5].get_text(strip=True))
        competitions.append(competition)
        seasons.append(season)

    return dates, times, hosts, visitors, scores, rounds, competitions, seasons

This is the main function: it gathers all the data and turns it into a DataFrame.

In [52]:
def extract_table(soups: List[BeautifulSoup]) -> pd.DataFrame:
    """Build one DataFrame with the matches found in every parsed page.

    For each page, the competition and season are read from the ``title``
    attribute of the links in the third and fourth breadcrumb items, and the
    rows come from the table with class
    ``competition-rounds competition-half-padding``. Pages without that table
    contribute no rows.

    Args:
        soups: Parsed pages to read.

    Returns:
        A DataFrame with the columns ``Date``, ``Time``, ``Host``, ``Visitor``,
        ``Score``, ``Round``, ``Competition`` and ``Season``. It is empty, but
        still has those columns, when no page yields rows.

    Raises:
        ValueError: If a page has a results table but fewer than four
            breadcrumb items, so competition and season cannot be determined.
    """
    # Insertion order matches the order of the lists returned by extract_data_table.
    columns = {
        "Date": [],
        "Time": [],
        "Host": [],
        "Visitor": [],
        "Score": [],
        "Round": [],
        "Competition": [],
        "Season": [],
    }

    for soup in soups:
        table = soup.find('table', class_='competition-rounds competition-half-padding')
        if not table:
            # Check the table first so a page without results is skipped
            # without needing breadcrumbs at all.
            continue

        breadcrumbs = soup.find('div', class_='breadcrumbs')
        bread_ul_lis = breadcrumbs.find_all('li') if breadcrumbs else []
        if len(bread_ul_lis) < 4:
            raise ValueError(
                "Results table found, but the page breadcrumbs do not contain "
                "the competition and season entries"
            )

        competition = bread_ul_lis[2].a['title']
        season = bread_ul_lis[3].a['title']

        page_lists = extract_data_table(table=table,
                                        competition=competition,
                                        season=season)
        for values, page_values in zip(columns.values(), page_lists):
            values.extend(page_values)

    return pd.DataFrame(columns)

In [61]:
def get_competitions_data(id_competition: str):
    """Scrape the two "all-games" pages of one competition.

    Args:
        id_competition: Identifier inserted into the competition URL.

    Returns:
        Whatever ``extract_table`` builds from the two downloaded pages.
    """
    first_page = f"https://www.academiadasapostasbrasil.com/stats/competition/brasil/26/{id_competition}/all-games"
    # The second page is the first page's address followed by "/page/2".
    second_page = f"{first_page}/page/2"

    pages = scrap(urls=(first_page, second_page))
    return extract_table(soups=pages)

## Extract data

In [55]:
# Competition identifiers to scrape; each one is substituted into the URL
# built by get_competitions_data.
# NOTE(review): presumably one id per season of the league — confirm on the site.
ID_COMPETITIONS = ('MPvoQox4VYlOy', 'NR2zmkLnBQ08o', 'gEyAmA0DWZ2po', '91BVQJ6p7Qwa6', '18427', '16888')



In [12]:
# The two "all-games" pages of a single competition (id NR2zmkLnBQ08o), used
# to try the scraper on one competition.
URLS = ("https://www.academiadasapostasbrasil.com/stats/competition/brasil/26/NR2zmkLnBQ08o/all-games",
        "https://www.academiadasapostasbrasil.com/stats/competition/brasil/26/NR2zmkLnBQ08o/all-games/page/2")

In [51]:
# Download and parse the pages listed in URLS, then build the DataFrame from them.
soups = scrap(urls=URLS)
df = extract_table(soups=soups)

# Last expression of the cell, so the notebook renders the DataFrame.
df

Unnamed: 0,Date,Time,Host,Visitor,Score,Round,Competition,Season
0,15/04/2023,16:00,Palmeiras,Cuiabá,2-1,J1,Brasileirão Série A Brasil,2023
1,15/04/2023,16:00,América Mineiro,Fluminense,0-3,J1,Brasileirão Série A Brasil,2023
2,15/04/2023,18:30,Botafogo,São Paulo,2-1,J1,Brasileirão Série A Brasil,2023
3,15/04/2023,18:30,Fortaleza,Internacional,1-1,J1,Brasileirão Série A Brasil,2023
4,15/04/2023,18:30,Athletico PR,Goiás,2-0,J1,Brasileirão Série A Brasil,2023
...,...,...,...,...,...,...,...,...
375,06/12/2023,21:30,Cuiabá,Athletico PR,3-0,J38,Brasileirão Série A Brasil,2023
376,06/12/2023,21:30,Santos,Fortaleza,1-2,J38,Brasileirão Série A Brasil,2023
377,06/12/2023,21:30,Internacional,Botafogo,3-1,J38,Brasileirão Série A Brasil,2023
378,06/12/2023,21:30,Fluminense,Grêmio,2-3,J38,Brasileirão Série A Brasil,2023


Extract the data for every competition id.

In [62]:
# One DataFrame per competition id, in the order given by ID_COMPETITIONS.
dfs = list(map(get_competitions_data, ID_COMPETITIONS))

In [63]:
# Stack the per-competition frames into a single one; ignore_index=True
# renumbers the rows from 0 instead of keeping each frame's own index.
df_final = pd.concat(dfs, ignore_index=True)

In [64]:
# Display the combined DataFrame.
df_final

Unnamed: 0,Date,Time,Host,Visitor,Score,Round,Competition,Season
0,13/04/2024,18:30,Internacional,Bahia,2-1,J1,Brasileirão Série A Brasil,2024
1,13/04/2024,18:30,Criciúma,Juventude,1-1,J1,Brasileirão Série A Brasil,2024
2,13/04/2024,21:00,Fluminense,Bragantino,2-2,J1,Brasileirão Série A Brasil,2024
3,13/04/2024,21:00,São Paulo,Fortaleza,1-2,J1,Brasileirão Série A Brasil,2024
4,14/04/2024,16:00,Athletico PR,Cuiabá,4-0,J1,Brasileirão Série A Brasil,2024
...,...,...,...,...,...,...,...,...
2275,08/12/2019,16:00,Vasco da Gama,Chapecoense,1-1,J38,Brasileirão Série A Brasil,2019
2276,08/12/2019,16:00,Botafogo,Ceará,1-1,J38,Brasileirão Série A Brasil,2019
2277,08/12/2019,16:00,Avaí,Athletico PR,0-0,J38,Brasileirão Série A Brasil,2019
2278,08/12/2019,16:00,Goiás,Grêmio,3-2,J38,Brasileirão Série A Brasil,2019
