# Corpus compilation
- text type and genres
- characteristics according to task

In [1]:
# Requests in python
import urllib3
# Regular expression library
import re
# Transform html into a tree im memory
from bs4 import BeautifulSoup
import os
# Get just the text in a html document
import justext

In [2]:
# Disable https warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Pass as a browser 
user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

# Pool of boots to make requests
http = urllib3.PoolManager(10, headers=user_agent)


class Crawler:
    
    def __init__(self, corpus_path, max_files, seed_url, url_pattern):
        self.corpus_path = corpus_path # corpus address
        self.max_files = max_files # max amount of sizes
        self.seed_url = seed_url # root url
        self.url_pattern = url_pattern # Select links of interest
        self.visited_links = [] # Hash to store viseted links
        self.to_be_visited = [] # List of link
        
        # If path not exists create it
        if not os.path.exists(self.corpus_path):
            os.makedirs(self.corpus_path)
        
    def crawl(self):
        first_urls = self.get_page(self.seed_url)
        self.add_links(first_urls)
        next_link = self.get_next_link()
        
        file_counter = 1
        while next_link and file_counter < self.max_files:
            links = self.get_page(next_link)
            self.add_links(links)
            next_link = self.get_next_link()
            file_counter += 1
        
    
    def get_page(self, url):
        print("getting page {}".format(url))
        response = http.request('GET', url)

        # store text content
        paragraphs = justext.justext(response.data, justext.get_stoplist("Portuguese"))
        with open("{}/{}.txt".format(self.corpus_path, url.replace(".", "_").replace("/","-")[8:]), "w") as output_file:
            for paragraph in paragraphs:
                # Boilerplate is everthing that is not the main text
                if not paragraph.is_boilerplate:
                    output_file.write(paragraph.text)
        
        # get links
        soup = BeautifulSoup(response.data, 'html.parser')
        
        links = [link.get('href') for link in soup.findAll('a', attrs={'href': re.compile(self.url_pattern)})]
        return links

    def add_links(self, links):
        links = list(set(links))
        self.to_be_visited.extend([link for link in links if link not in self.visited_links])

    def get_next_link(self):
        next_link = self.to_be_visited.pop(0)
        self.visited_links.append(next_link)
        return next_link

In [5]:
crawler_cultura = Crawler("data/corpora/cultura", 500, 
                             "https://www1.folha.uol.com.br/ilustrada/2019/04/morre-fotografo-alemao-conhecido-pelo-registro-de-megalopoles.shtml", 
                            "^https://www1.\folha\.uol\.com\.br/ilustrada/")

crawler_folha_politica = Crawler("data/corpora/politica", 5000,
                          "https://www1.folha.uol.com.br/poder/2019/04/bolsonaro-minimiza-crise-com-mourao-e-diz-que-a-briga-e-por-quem-lava-a-louca.shtml",
                          "^https://www1\.folha\.uol\.com\.br/poder/\d+")

crawler_uol_politica = Crawler("data/corpora/politica", 500, "https://economia.uol.com.br/noticias/redacao/2019/04/25/reforma-da-previdencia-bpc-rural-abono.htm",
                              "^https://economia\.uol\.com\.br/noticias/")

crawler_g1_politica = Crawler("data/corpora/politica", 500, "https://g1.globo.com/politica/noticia/2019/04/25/mpf-divulga-estudo-que-revela-violacoes-de-direitos-de-indios-guarani-na-construcao-de-itaipu.ghtml",
                             "https://g1\.globo\.com/politica/noticia/\d+")

crawler_paragmatismo_politica = Crawler("data/corpora/politica", 500, "https://www.pragmatismopolitico.com.br/2019/04/ataques-carlos-bolsonaro-contra-mourao-twitter.html",
                                       "^https://www\.pragmatismopolitico\.com\.br/2019/\d+")

crawler_oglobo_politica = Crawler("data/corpora/politica", 500, "https://blogs.oglobo.globo.com/lauro-jardim/post/bolsonaro-veta-campanha-do-banco-do-brasil-marcada-pela-diversidade-e-diretor-cai-veja-o-video-proibido.html",
                                 "^https://blogs\.oglobo\.globo\.com/")

crawler_carta_politica = Crawler("data/corpora/politica", 500, "https://www.cartacapital.com.br/politica/em-meio-a-ataques-de-carlos-bolsonaro-maia-nega-impeachment-de-mourao/",
                                "^https://www\.cartacapital\.com\.br/politica/")


In [6]:
# crawler_cultura.crawl()
crawler_folha_politica.crawl()
# crawler_uol_politica.crawl()
# crawler_carta_politica.crawl()
# crawler_paragmatismo_politica.crawl()
# crawler_oglobo_politica.crawl()


getting page https://www1.folha.uol.com.br/ilustrada/2019/04/morre-fotografo-alemao-conhecido-pelo-registro-de-megalopoles.shtml


IndexError: pop from empty list