# Carga completa
Esse notebook tem como objetivo realizar uma carga completa das partidas ocorridas no ano informado como parâmetro.

#### Atenção!!!
Esse notebook recriará todos os dados da tabela match

## Setup
Instalação de dependências

In [1]:
%pip install bs4
%pip install requests
%pip install sqlalchemy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Setup
Imports necessários

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import math
import pandas
from sqlalchemy import create_engine
import timeit

## Iniciando
Precisamos descobrir qual é a rodada atual do campeonato para saber quantas partidas já podemos ter disponíveis.

In [3]:
# Record the wall-clock start time of the run.
start = timeit.default_timer()

In [4]:
# Download the Serie A competition page and locate the slide marked as active.
raw_turn_page = requests.get(
    'https://www.cbf.com.br/futebol-brasileiro/competicoes/campeonato-brasileiro-serie-a',
    timeout=30,  # do not hang forever if the site stops responding
)
turn_page = BeautifulSoup(raw_turn_page.content, 'html.parser')
active_turn = turn_page.find_all("div", class_="swiper-slide active")
if not active_turn:
    raise Exception("Tente novamente por favor, as rodadas precisam ser capturadas!")

# Raw string: "\d" inside a plain literal is an invalid escape sequence.
# A single search with a capture group replaces the two chained findall calls.
slide_attribute = re.search(r'data-slide-index="(\d+)"', str(active_turn[0]))
if slide_attribute is None:
    # The active slide exists but carries no index: same recovery advice applies.
    raise Exception("Tente novamente por favor, as rodadas precisam ser capturadas!")

data_slide = slide_attribute.group(0)
# The round number is the slide index plus one (index presumably zero-based — TODO confirm).
turn = int(slide_attribute.group(1)) + 1

In [5]:
# Show how many rounds were detected.
print(f'Já ocorreram {turn} rodadas.')

Já ocorreram 27 rodadas.


In [6]:
# Number of matches considered available: the code assumes 10 matches per round.
rounds = turn * 10

In [7]:
# Show the computed number of available matches.
print(f'Temos {rounds} partidas disponíveis.')

Temos 270 partidas disponíveis.


## Times no campeonato corrente
Temos que armazenar esses dados e buscar do banco...

In [8]:
# Maps each club name (written as "Name - UF") to its metadata.
# The keys are the names accepted by Validator.validate_team and the "globo_id"
# value is what Crawler.team_id returns. Several spellings of the same club are
# listed on purpose and share one globo_id (e.g. the three América-MG variants).
# NOTE(review): only "Ceará - CE" carries a "count" key and nothing visible in
# this notebook reads it — confirm whether it is still needed.
cartola_clubs = {
    "Fluminense - RJ": { "globo_id": 266 },
    "Atlético - MG": { "globo_id": 282 },
    "Grêmio - RS": { "globo_id": 284 },
    "São Paulo - SP": { "globo_id": 276 },
    "Vasco da Gama - RJ": { "globo_id": 267 },
    "Corinthians - SP": { "globo_id": 264 },
    "Botafogo - RJ": { "globo_id": 296 },
    "Santos - SP": { "globo_id": 277 },
    "Cruzeiro - MG": { "globo_id": 283 },
    "Internacional - RS": { "globo_id": 285 },
    "Flamengo - RJ": { "globo_id": 262 },
    "Náutico - PE": { "globo_id": 343 },
    "Coritiba - PR": { "globo_id": 294 },
    "Ponte Preta - SP": { "globo_id": 303 },
    "Bahia - BA": { "globo_id": 265 },
    "Portuguesa - SP": { "globo_id": 278 },
    "Sport - PE": { "globo_id": 292 },
    "Palmeiras - SP": { "globo_id": 275 },
    "Atlético - GO": { "globo_id": 373 },
    "Figueirense - SC": { "globo_id": 316 },
    "Atlético Paranaense - PR": { "globo_id": 293 },
    "Vitória - BA": { "globo_id": 287 },
    "Goiás - GO": { "globo_id": 290 },
    "Criciuma - SC": { "globo_id": 288 },
    "Atletico - PR": { "globo_id": 293 },
    "Chapecoense - SC": { "globo_id": 315 },
    "Atlético - PR": { "globo_id": 293 },
    "Avaí - SC": { "globo_id": 314 },
    "Joinville - SC": { "globo_id": 317 },
    "Santa Cruz - PE": { "globo_id": 344 },
    "América - MG": { "globo_id": 327 },
    "Athletico Paranaense - PR": { "globo_id": 293 },
    "Ceará - CE": { "globo_id": 354, "count": 4 },
    "America Fc - MG": { "globo_id": 327 },
    "Paraná - PR": { "globo_id": 289 },
    "Fortaleza - CE": { "globo_id": 356 },
    "Csa - AL": { "globo_id": 341 },
    "Red Bull Bragantino - SP": { "globo_id": 280 },
    "Atlético Mineiro - MG": { "globo_id": 282 },
    "Juventude - RS": { "globo_id": 286 },
    "Cuiabá - MT": { "globo_id": 1371 },
    "America - MG": { "globo_id": 327 }
}

## Validator
Essa classe tem o papel de validar alguns dos atributos recebidos.

In [9]:
class Validator:
    """Checks raw strings scraped from a match page before they are stored.

    Every ``validate_*`` method returns its argument unchanged when it is
    acceptable and raises ``Exception(field_name, message)`` otherwise.
    """

    def __init__(self, cartola_clubs):
        """Keep the mapping whose keys are the accepted club names."""
        self.clubs = cartola_clubs

    def validate_hour(self, hour):
        """Return ``hour`` if it is exactly ``hh:mm``; raise otherwise."""
        # fullmatch: the whole value must be hh:mm. A start-anchored match
        # alone would also accept values such as "16:000".
        if not re.fullmatch(r"\d{2}:\d{2}", hour):
            raise Exception('hour', f'The value {hour} dont match with the hour pattern hh:mm')
        return hour

    def validate_date(self, date):
        """Return ``date`` if it starts with '<word>, dd de <word> de yyyy'; raise otherwise."""
        # Only the beginning of the value is checked; text after the year is tolerated.
        if not re.match(r"\w+, \d{2} de \w+ de \d{4}", date):
            raise Exception('date', f'The value {date} dont match with the date pattern EEE, dd de MMMM de yyyy')
        return date

    def validate_team(self, club):
        """Return ``club`` if it is one of the known club names; raise otherwise."""
        if club not in self.clubs:
            raise Exception('club', f'The value {club} dont match with the name of the clubs')
        return club

    def validate_goal(self, goal):
        """Return ``goal`` if it consists only of digits; raise otherwise."""
        # fullmatch: reject values that merely start with a digit (e.g. "1a"),
        # which a start-anchored match would let through as "a number".
        if not re.fullmatch(r"\d+", goal):
            raise Exception('goal', f'The value {goal} must be a number')
        return goal

## Crawler
Classe responsável por construir um registro para cada partida.

In [10]:
class Crawler:
    """Turns the parsed HTML of one match page into a flat match record."""

    def __init__(self, game, year, match, cartola_clubs):
        """Extract and validate every field of the match.

        Args:
            game: parsed page for one match; must expose ``find_all``.
            year: season of the match, stored as given.
            match: match number; the round is computed as ceil(match / 10).
            cartola_clubs: mapping of club name -> dict holding ``globo_id``.

        Raises:
            Exception: raised by Validator when a field has an unexpected format.
            IndexError: when an expected element is not present in the page.
        """
        self.game_soup = game
        self.cartola_clubs = cartola_clubs
        checks = Validator(cartola_clubs)

        self.year = year
        self.match = match
        self.turn = int(math.ceil(match / 10))

        # The 'text-2 p-r-20' elements are read as venue, date and hour, in that order.
        info_class = 'text-2 p-r-20'
        self.local = self.game_soup_element_value(info_class, 0)
        raw_date = self.game_soup_element_value(info_class, 1)
        self.date = checks.validate_date(raw_date)
        raw_hour = self.game_soup_element_value(info_class, 2)
        self.hour = checks.validate_hour(raw_hour)

        name_class = 'time-nome'
        goals_class = 'time-gols block'

        # Home side: first occurrence of the name/goals elements, scorers in the left column.
        home_name = self.game_soup_element_value(name_class, 0)
        self.home_team = checks.validate_team(home_name)
        self.home_team_id = self.team_id(self.home_team)
        home_goals = self.game_soup_element_value(goals_class, 0)
        self.home_team_goals = checks.validate_goal(home_goals)
        self.home_players_goals = self.players_goals('text-left')

        # Visitor side: second occurrence, scorers in the right column.
        visitor_name = self.game_soup_element_value(name_class, 1)
        self.visitor_team = checks.validate_team(visitor_name)
        self.visitor_team_id = self.team_id(self.visitor_team)
        visitor_goals = self.game_soup_element_value(goals_class, 1)
        self.visitor_team_goals = checks.validate_goal(visitor_goals)
        self.visitor_players_goals = self.players_goals('text-right')

        # The first two table cells are read as the referee name and category.
        self.referee = self.referee_soup_element_value(0)
        self.referee_category = self.referee_soup_element_value(1)

    def game_soup_element_value(self, css_class, position):
        """Return the stripped text of the ``position``-th element with ``css_class``."""
        found = self.game_soup.find_all(class_=css_class)
        element = found[position]
        return element.get_text().strip()

    def referee_soup_element_value(self, position):
        """Return the stripped text of the ``position``-th ``td`` element."""
        cells = self.game_soup.find_all("td")
        cell = cells[position]
        return cell.get_text().strip()

    def players_goals(self, css_class):
        """Return the scorer lines of one column with quote characters removed.

        ``css_class`` selects the column ('text-left' or 'text-right' in this
        class). The first line of the column text is skipped.
        """
        column_class = ' '.join(('col-xs-3 col-sm-3', css_class, 'hidden-xs'))
        column = self.game_soup.find_all(class_=column_class)[0]
        lines = column.get_text().strip().split("\n")
        # Single and double quotes are dropped from every remaining line.
        return [line.replace("'", "").replace('"', "") for line in lines[1:]]

    def team_id(self, team_name):
        """Look up the ``globo_id`` registered for ``team_name``."""
        club = self.cartola_clubs[team_name]
        return club["globo_id"]

    def to_dict(self):
        """Return the match as a dict, in the column order used downstream.

        The comma is removed from the date; every other field is copied as is.
        """
        return dict(
            match=self.match,
            turn=self.turn,
            year=self.year,
            date=self.date.replace(',', ''),
            hour=self.hour,
            local=self.local,
            home_id=self.home_team_id,
            home=self.home_team,
            home_goal=self.home_team_goals,
            home_players_goals=self.home_players_goals,
            visitor_players_goals=self.visitor_players_goals,
            visitor_goal=self.visitor_team_goals,
            visitor=self.visitor_team,
            visitor_id=self.visitor_team_id,
            referee=self.referee,
            referee_category=self.referee_category,
        )

## Função search_matches
Função que tem o objetivo de buscar as partidas ocorridas no ano.

In [11]:
def search_matches(year, total_matches=None):
    """Crawl the match pages of a season and return one record per parsed match.

    Args:
        year: season used to build each match URL.
        total_matches: number of the last match page to request. When omitted,
            the notebook-level ``rounds`` value is used.

    Returns:
        List of dicts produced by ``Crawler.to_dict``. Matches whose page
        cannot be downloaded or parsed are reported on stdout and skipped.
    """
    if total_matches is None:
        total_matches = rounds

    matches = []
    # Match pages are numbered from 1 and the last one must be included,
    # hence the "+ 1" on the (exclusive) upper bound.
    for match in range(1, total_matches + 1):
        try:
            # The timeout keeps one unresponsive page from stalling the whole run;
            # a failed download is handled like any other per-match error.
            page_game = requests.get(
                f'https://www.cbf.com.br/futebol-brasileiro/competicoes/campeonato-brasileiro-serie-a/{year}/{match}',
                timeout=30,
            )
            game = BeautifulSoup(page_game.content, 'html.parser')
            craw = Crawler(game, year, match, cartola_clubs)
            matches.append(craw.to_dict())
        except Exception as e:
            print(f'Error on year {year} and match {match}. Error message: {str(e)}')
            continue
    return matches

## Dataframe das partidas
Por meio da função search_matches() buscamos as partidas que já ocorreram e montamos um dataframe

#### Observação: o ano está fixo ('hardcoded') em 2021

In [12]:
# Crawl the 2021 season (the year is hard-coded here) and build one row per parsed match.
df_matches = pandas.DataFrame(search_matches(2021))

Error on year 2021 and match 1. Error message: list index out of range
Error on year 2021 and match 6. Error message: list index out of range
Error on year 2021 and match 7. Error message: list index out of range
Error on year 2021 and match 9. Error message: list index out of range
Error on year 2021 and match 15. Error message: ('date', 'The value A definir dont match with the date pattern EEE, dd de MMMM de yyyy')
Error on year 2021 and match 20. Error message: list index out of range
Error on year 2021 and match 22. Error message: list index out of range
Error on year 2021 and match 26. Error message: list index out of range
Error on year 2021 and match 38. Error message: ('goal', 'The value  must be a number')
Error on year 2021 and match 39. Error message: list index out of range
Error on year 2021 and match 44. Error message: list index out of range
Error on year 2021 and match 45. Error message: list index out of range
Error on year 2021 and match 50. Error message: list index 

In [13]:
# (rows, columns) of the crawled matches dataframe.
df_matches.shape

(183, 16)

## Manutenção do Banco de dados
Nessa seção do notebook os dados do banco da tabela match serão **destruídos** e os novos dados serão inseridos.

In [14]:
def create_database_session():
    """Create the SQLAlchemy engine used to reach the cartola database.

    The connection URL is read from the ``CARTOLA_DATABASE_URL`` environment
    variable when it is set, so credentials do not have to live in the
    notebook; otherwise the local development URL is used as a fallback.

    Returns:
        The engine returned by ``create_engine``.
    """
    import os  # local import keeps this cell runnable on its own

    database_url = os.environ.get(
        'CARTOLA_DATABASE_URL',
        'postgresql://postgres:postgres@172.18.0.2/cartola',
    )
    return create_engine(database_url)

In [45]:
def create_table_if_not_exists(engine):
    """Create the ``match`` table unless it already exists.

    Args:
        engine: object exposing ``execute(sql)``, used to run the DDL statement.
    """
    # One column per key of the match records built by the crawler.
    ddl = '''
        CREATE TABLE IF NOT EXISTS match (
            match smallint, 
            turn smallint,
            year varchar(4),
            date varchar(50),
            hour varchar(5),
            local varchar(80),
            home_id smallint,
            home varchar(80),
            home_goal smallint,
            home_players_goals varchar(1024),
            visitor_players_goals varchar(1024),
            visitor_goal smallint,
            visitor varchar(80),
            visitor_id smallint,
            referee varchar(80),
            referee_category varchar(24)
        )'''
    engine.execute(ddl)

In [46]:
def clear_matches_table(engine):
    """Remove every row from the ``match`` table.

    Args:
        engine: object exposing ``execute(sql)``.
    """
    truncate_sql = 'TRUNCATE match'
    engine.execute(truncate_sql)

In [42]:
def save_dataframe(engine, dataframe):
    """Append the dataframe rows to the existing ``match`` table.

    ``if_exists='append'`` keeps the table and its declared column types,
    whereas ``'replace'`` would drop it and re-create it with types inferred
    by pandas. ``index=False`` stops the dataframe index from being written
    as an extra ``index`` column that the table does not declare.

    Args:
        engine: connection or engine accepted by ``DataFrame.to_sql``.
        dataframe: rows to insert; its columns must exist in ``match``.
    """
    dataframe.to_sql('match', con=engine, index=False, if_exists='append')

In [43]:
def count_data(engine):
    """Run a row count on the ``match`` table.

    Args:
        engine: object exposing ``execute(sql)``.

    Returns:
        Whatever ``engine.execute`` returns for the count query; the caller
        is responsible for fetching the rows from it.
    """
    count_sql = 'SELECT count(*) FROM match'
    result = engine.execute(count_sql)
    return result

In [44]:
# Full reload: make sure the table exists, empty it, write the crawled
# matches and finally display the resulting row count.
engine = create_database_session()
create_table_if_not_exists(engine)
clear_matches_table(engine)
save_dataframe(engine, df_matches)
count_data(engine).fetchall()

ProgrammingError: (psycopg2.errors.SyntaxError) syntax error at or near "autoincrement"
LINE 3:             id serial primary key autoincrement, 
                                          ^

[SQL: 
        CREATE TABLE IF NOT EXISTS match (
            id serial primary key autoincrement, 
            match smallint, 
            turn smallint,
            year varchar(4),
            date varchar(50),
            hour varchar(5),
            local varchar(80),
            home_id smallint,
            home varchar(80),
            home_goal smallint,
            home_players_goals varchar(1024),
            visitor_players_goals varchar(1024),
            visitor_goal smallint,
            visitor varchar(80),
            visitor_id smallint,
            referee varchar(80),
            referee_category varchar(24)
        )]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [35]:
# Elapsed wall-clock time since `start`, converted from seconds to minutes
# and printed with no decimal places.
end = timeit.default_timer()
print ('Duração: %.f minutos' % ((end - start) / 60 ))

Duração: 19 minutos
