## Cartola Web Crawler
Esse notebook tem como objetivo capturar dados dos jogos do Campeonato Brasileiro.
A CBF disponibiliza um site com uma interface HTML muito simples, o que facilita o trabalho de web crawling.
Desse site extrairemos os dados do campeonato de 2012 a 2020.
Os dados em que estamos interessados:

* ano do jogo;
* número da rodada;
* número da partida;
* data do jogo;
* local do jogo;
* horário do jogo;
* time da casa (ID conforme tabela da Globo);
* gols do time da casa;
* time visitante (ID conforme tabela da Globo);
* gols do time visitante;
* quais jogadores fizeram gols;
* árbitro principal da partida (e sua categoria).

In [2]:
%pip install bs4
%pip install requests

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
import re
import math
import json
import pandas as pd
from bs4 import BeautifulSoup

In [4]:
class Crawler:
    """Pull the data of one match out of two already-parsed pages.

    Everything is scraped eagerly in the constructor and kept as plain
    attributes; ``to_dict`` only repackages those attributes.

    Args:
        game: parsed match page; must offer ``find_all(class_=...)`` whose
            results expose ``get_text()``.
        referee: parsed referee page; must offer ``find_all("td")`` whose
            results expose ``get_text()``.
        year: season the match belongs to (stored unchanged).
        match: match number inside the season; the turn is derived from it.
        cartola_clubs: mapping from the club name shown on the page to a dict
            holding at least a ``"globo_id"`` entry.

    Raises:
        IndexError: when an expected element is absent from a page.
        KeyError: when a club name is not a key of ``cartola_clubs``.
    """

    # Translation table that deletes single and double quotes from a string.
    _QUOTE_STRIPPER = str.maketrans("", "", "'\"")

    def __init__(self, game, referee, year, match, cartola_clubs):
        self.game_soup = game
        self.referee_soup = referee
        self.cartola_clubs = cartola_clubs

        self.year = year
        self.match = match
        # Matches 1-10 belong to turn 1, 11-20 to turn 2, and so on.
        self.turn = int(math.ceil(match / 10))

        # The same CSS class marks venue, date and kick-off time, in that order.
        info_class = 'text-2 p-r-20'
        self.local = self.game_soup_element_value(info_class, 0)
        self.date = self.game_soup_element_value(info_class, 1)
        self.hour = self.game_soup_element_value(info_class, 2)

        (self.home_team,
         self.home_team_id,
         self.home_team_goals,
         self.home_players_goals) = self._scrape_side(0, 'text-left')

        (self.visitor_team,
         self.visitor_team_id,
         self.visitor_team_goals,
         self.visitor_players_goals) = self._scrape_side(1, 'text-right')

        # First two table cells: referee name, then referee category.
        self.referee = self.referee_soup_element_value(0)
        self.referee_category = self.referee_soup_element_value(1)

    def _scrape_side(self, slot, scorers_class):
        """Return ``(name, globo_id, goals, scorers)`` for one of the two teams.

        ``slot`` is 0 for the home team and 1 for the visitor;
        ``scorers_class`` selects that team's scorers column.
        """
        team_name = self.game_soup_element_value('time-nome', slot)
        globo_id = self.team_id(team_name)
        goals = self.game_soup_element_value('time-gols block', slot)
        scorers = self.players_goals(scorers_class)
        return team_name, globo_id, goals, scorers

    def game_soup_element_value(self, css_class, position):
        """Return the stripped text of the ``position``-th match-page element with ``css_class``."""
        matching = self.game_soup.find_all(class_=css_class)
        return matching[position].get_text().strip()

    def referee_soup_element_value(self, position):
        """Return the stripped text of the ``position``-th ``td`` of the referee page."""
        cells = self.referee_soup.find_all("td")
        return cells[position].get_text().strip()

    def players_goals(self, css_class):
        """Return the scorer lines of one team with all quote characters removed.

        The column text is stripped and split on newlines; the first line is
        discarded and every remaining line is kept (inner whitespace intact).
        """
        column_class = f'col-xs-3 col-sm-3 {css_class} hidden-xs'
        column = self.game_soup.find_all(class_=column_class)[0]
        _first_line, *scorer_lines = column.get_text().strip().split("\n")
        return [line.translate(self._QUOTE_STRIPPER) for line in scorer_lines]

    def team_id(self, team_name):
        """Return the ``globo_id`` registered for ``team_name`` in ``cartola_clubs``."""
        club = self.cartola_clubs[team_name]
        return club["globo_id"]

    def to_dict(self):
        """Return the scraped match as a flat dict (commas removed from the date)."""
        return dict(
            match=self.match,
            turn=self.turn,
            year=self.year,
            date=self.date.replace(',', ''),
            hour=self.hour,
            local=self.local,
            home_id=self.home_team_id,
            home=self.home_team,
            home_goal=self.home_team_goals,
            home_players_goals=self.home_players_goals,
            visitor_players_goals=self.visitor_players_goals,
            visitor_goal=self.visitor_team_goals,
            visitor=self.visitor_team,
            visitor_id=self.visitor_team_id,
            referee=self.referee,
            referee_category=self.referee_category,
        )

In [5]:
# Standardization: download the club list from the Cartola API (parsed in the
# next cell) -- presumably the reference for the club ids used further down.
# The timeout keeps the notebook from hanging forever if the API never answers.
clubs = requests.get('https://api.cartolafc.globo.com/clubes', timeout=30)

In [None]:
# Decode the JSON body of the clubs response; as the last expression of the
# cell, the parsed result is what the notebook displays.
json.loads(clubs.content)

In [6]:
# Lookup table from a club name to its Globo/Cartola id.
#
# Keys are the names that Crawler.team_id() receives, i.e. the team-name text
# read from the match page, so they must match that text exactly. The same club
# appears under several spellings (e.g. "Atlético Paranaense - PR",
# "Atletico - PR", "Atlético - PR" and "Athletico Paranaense - PR" all map to
# globo_id 293); each spelling needs its own entry.
#
# Only "globo_id" is read by the code in this notebook. "year" presumably lists
# the seasons in which that exact spelling was seen (TODO confirm), and "count"
# equals len(year) for every entry.
cartola_clubs_2021 = {
    "Fluminense - RJ": { "globo_id": 266, "count": 10, "year": [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021] },
    "Atlético - MG": { "globo_id": 282, "count": 9, "year": [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020] },
    "Grêmio - RS": { "globo_id": 284, "count": 10, "year": [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021] },
    "São Paulo - SP": { "globo_id": 276, "count": 10, "year": [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021] },
    "Vasco da Gama - RJ": { "globo_id": 267, "count": 7, "year": [2012, 2013, 2015, 2017, 2018, 2019, 2020] },
    "Corinthians - SP": { "globo_id": 264, "count": 10, "year": [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021] },
    "Botafogo - RJ": { "globo_id": 296, "count": 8, "year": [2012, 2013, 2014, 2016, 2017, 2018, 2019, 2020] },
    "Santos - SP": { "globo_id": 277, "count": 10, "year": [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021] },
    "Cruzeiro - MG": { "globo_id": 283, "count": 8, "year": [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019] },
    "Internacional - RS": { "globo_id": 285, "count": 9, "year": [2012, 2013, 2014, 2015, 2016, 2018, 2019, 2020, 2021] },
    "Flamengo - RJ": { "globo_id": 262, "count": 10, "year": [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021] },
    "Náutico - PE": { "globo_id": 343, "count": 2, "year": [2012, 2013] },
    "Coritiba - PR": { "globo_id": 294, "count": 7, "year": [2012, 2013, 2014, 2015, 2016, 2017, 2020] },
    "Ponte Preta - SP": { "globo_id": 303, "count": 5, "year": [2012, 2013, 2015, 2016, 2017] },
    "Bahia - BA": { "globo_id": 265, "count": 8, "year": [2012, 2013, 2014, 2017, 2018, 2019, 2020, 2021] },
    "Portuguesa - SP": { "globo_id": 278, "count": 2, "year": [2012, 2013] },
    "Sport - PE": { "globo_id": 292, "count": 8, "year": [2012, 2014, 2015, 2016, 2017, 2018, 2020, 2021] },
    "Palmeiras - SP": { "globo_id": 275, "count": 9, "year": [2012, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021] },
    "Atlético - GO": { "globo_id": 373, "count": 4, "year": [2012, 2017, 2020, 2021] },
    "Figueirense - SC": { "globo_id": 316, "count": 4, "year": [2012, 2014, 2015, 2016] },
    "Atlético Paranaense - PR": { "globo_id": 293, "count": 1, "year": [2013] },
    "Vitória - BA": { "globo_id": 287, "count": 5, "year": [2013, 2014, 2016, 2017, 2018] },
    "Goiás - GO": { "globo_id": 290, "count": 5, "year": [2013, 2014, 2015, 2019, 2020] },
    "Criciuma - SC": { "globo_id": 288, "count": 2, "year": [2013, 2014] },
    "Atletico - PR": { "globo_id": 293, "count": 1, "year": [2014] },
    "Chapecoense - SC": { "globo_id": 315, "count": 7, "year": [2014, 2015, 2016, 2017, 2018, 2019, 2021] },
    "Atlético - PR": { "globo_id": 293, "count": 3, "year": [2015, 2016, 2017] },
    "Avaí - SC": { "globo_id": 314, "count": 3, "year": [2015, 2017, 2019] },
    "Joinville - SC": { "globo_id": 317, "count": 1, "year": [2015] },
    "Santa Cruz - PE": { "globo_id": 344, "count": 1, "year": [2016] },
    "América - MG": { "globo_id": 327, "count": 1, "year": [2016] },
    "Athletico Paranaense - PR": { "globo_id": 293, "count": 4, "year": [2018, 2019, 2020, 2021] },
    "Ceará - CE": { "globo_id": 354, "count": 4, "year": [2018, 2019, 2020, 2021] },
    "America Fc - MG": { "globo_id": 327, "count": 1, "year": [2018] },
    "Paraná - PR": { "globo_id": 289, "count": 1, "year": [2018] },
    "Fortaleza - CE": { "globo_id": 356, "count": 3, "year": [2019, 2020, 2021] },
    "Csa - AL": { "globo_id": 341, "count": 1, "year": [2019] },
    "Red Bull Bragantino - SP": { "globo_id": 280, "count": 2, "year": [2020, 2021] },
    "Atlético Mineiro - MG": { "globo_id": 282, "count": 1, "year": [2021] },
    "Juventude - RS": { "globo_id": 286, "count": 1, "year": [2021] },
    "Cuiabá - MT": { "globo_id": 1371, "count": 1, "year": [2021] },
    "America - MG": { "globo_id": 327, "count": 1, "year": [2021] }
}

In [10]:
# Crawl every match page of the 2012-2020 seasons (380 matches per season) and
# collect one dict per match that could be parsed.
matches = []

# A single Session lets requests reuse the connection across the many page
# fetches instead of opening a new one per request.
with requests.Session() as session:
    for year in range(2012, 2021):
        for match in range(1, 381):
            try:
                # The download sits inside the try so that one network failure
                # skips a single match instead of aborting the whole crawl.
                page_game = session.get(
                    f'https://www.cbf.com.br/futebol-brasileiro/competicoes/campeonato-brasileiro-serie-a/{year}/{match}',
                    timeout=30,
                )
                game = BeautifulSoup(page_game.content, 'html.parser')
                # The referee data was previously fetched from the same URL plus
                # "#arbitros". A URL fragment is never sent to the server, so
                # that second request returned this very page; reuse the soup.
                referee = game
                craw = Crawler(game, referee, year, match, cartola_clubs_2021)
                matches.append(craw.to_dict())
            except Exception as e:
                # Best effort: report the failing match together with the cause
                # and carry on with the next one.
                print(f'Error on year {year} and match {match}: {e!r}')
            

Error on year 2016 and match 378


In [11]:
# One row per successfully scraped match; the columns come from the keys of
# the dicts produced by Crawler.to_dict().
df = pd.DataFrame(matches)
# Preview the last rows as a quick check of the crawl result.
df.tail()

Unnamed: 0,match,turn,year,date,hour,local,home_id,home,home_goal,home_players_goals,visitor_players_goals,visitor_goal,visitor,visitor_id,referee,referee_category
3414,376,38,2020,Quinta 25 de Fevereiro de 2021,21:30,Beira-Rio - Porto Alegre - RS,285,Internacional - RS,0,[],[],0,Corinthians - SP,264,Wilton Pereira Sampaio,FIFA
3415,377,38,2020,Quinta 25 de Fevereiro de 2021,21:30,Arena Fonte Nova - Salvador - BA,265,Bahia - BA,2,"[ Rossi 14 (1ºT), Alesson 45+1 (2ºT)]",[],0,Santos - SP,277,Paulo Roberto Alves Junior,AB
3416,378,38,2020,Quinta 25 de Fevereiro de 2021,21:30,Arena da Baixada - Curitiba - PR,293,Athletico Paranaense - PR,2,"[ Nikao 7 (1ºT), Cittadini 40 (2ºT)]",[],0,Sport - PE,292,Flavio Rodrigues de Souza,FIFA
3417,379,38,2020,Quinta 25 de Fevereiro de 2021,21:30,Arena Castelão - Fortaleza - CE,354,Ceará - CE,2,"[ Pedrinho 17 (1ºT), Saulo 45+3 (2ºT)]",[Matheus Babi 11 (2ºT)],1,Botafogo - RJ,296,Raphael Claus,FIFA
3418,380,38,2020,Quinta 25 de Fevereiro de 2021,21:30,Antônio Accioly - Goiania - GO,373,Atlético - GO,3,"[ Gilvan 3 (1ºT), Wellington Rato 33 (1ºT), ...",[Ricardo Oliveira 17 (2ºT)],1,Coritiba - PR,294,Edina Alves Batista,FIFA


In [12]:
# Save the scraped matches as UTF-8 CSV; index=False keeps the DataFrame index
# out of the file.
df.to_csv("cbf-2012-2020_matches.csv", encoding="utf-8", index=False)