# Corpus compilation
- text type and genres
- characteristics according to task

In [2]:
import urllib3
import re
from bs4 import BeautifulSoup
import os
import justext

In [3]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

http = urllib3.PoolManager(10, headers=user_agent)


class Crawler:
    
    def __init__(self, corpus_path, max_files, seed_url, url_pattern):
        self.corpus_path = corpus_path
        self.max_files = max_files
        self.seed_url = seed_url
        self.url_pattern = url_pattern
        self.visited_links = {}
        self.to_be_visited = []
        
        if not os.path.exists(self.corpus_path):
            os.makedirs(self.corpus_path)
        
    def crawl(self):
        first_urls = self.get_page(self.seed_url)
        self.add_links(first_urls)
        next_link = self.get_next_link()
        
        file_counter = 1
        while next_link and file_counter < self.max_files:
            links = self.get_page(next_link)
            self.add_links(links)
            next_link = self.get_next_link()
            file_counter += 1
    
    def get_links(self, page_data):
        links = re.findall(self.url_pattern, str(page_data))
        return links            
    
    def get_page(self, url):
        print("getting page {}".format(url))
        response = http.request('GET', url)

        # store text content
        paragraphs = justext.justext(response.data, justext.get_stoplist("Portuguese"))
        with open("{}/{}.txt".format(self.corpus_path, url.replace(".", "_").replace("/","-")), "w") as output_file:
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    output_file.write(paragraph.text)
        
        # get links
        links = self.get_links(response.data)
        
        return links

    def add_links(self, links):
        links = list(set(links))
        self.to_be_visited.extend([link for link in links if link not in self.visited_links])

    def get_next_link(self):
        next_link = self.to_be_visited.pop(0)
        self.visited_links[next_link] = None
        return next_link

In [None]:
crawler_tecnologia = Crawler("data/corpora/tecnologia", 500, "https://tecnoblog.net/281950/",
                             '"(https://tecnoblog\.net/\d+.*?)"')
crawler_tecnologia.crawl()

In [5]:
crawler_tecnologia = Crawler("data/corpora/tecnologia", 500, "http://digitaldrops.com.br/2019/03/",
                             '"(http://digitaldrops\.com\.br/\d+.*?)"')
crawler_tecnologia.crawl()

getting page http://digitaldrops.com.br/2019/03/
getting page http://digitaldrops.com.br/2019/03/novos-ipad-air-ipad-mini-com-processador-a12-bionic.html
getting page http://digitaldrops.com.br/2019/03/imac-processador-8a-9a-geracoes-nova-gpu.html
getting page http://digitaldrops.com.br/2019/03/linha-galaxy-s10-custa-caro-no-brasil-mas-tem-boas-promocoes.html
getting page http://digitaldrops.com.br/2019/03/asus-zenfone-max-shot-e-zenfone-max-plus-m2-sip-1-qualcomm.html
getting page http://digitaldrops.com.br/2019/03/capita-marvel-resenha-quase-sem-spoilers.html
getting page http://digitaldrops.com.br/2019/03/novidades-da-apple-streaming-apple-tv-plus-apple-news-apple-card-apple-arcade.html
getting page http://digitaldrops.com.br/2019/03/nova-geracao-apple-airpods-hey-siri-comando-voz-case-sem-fio.html
getting page http://digitaldrops.com.br/2019/03/stadia-google-sistema-de-games-streaming.html
getting page http://digitaldrops.com.br/2019/03/novos-ipad-air-ipad-mini-com-processador-a12-

IndexError: pop from empty list

In [4]:
crawler_politica = Crawler("data/corpora/politica", 500, "http://blogs.opovo.com.br/politica/",
                           '"(http://blogs\.opovo\.com\.br/politica/\d+.*?)"')
crawler_politica.crawl()

getting page http://blogs.opovo.com.br/politica/
getting page http://blogs.opovo.com.br/politica/2019/04/16/camara-aprova-titulo-de-cidada-de-fortaleza-para-michelle-bolsonaro/
getting page http://blogs.opovo.com.br/politica/2019/04/15/andre-fernandes-atribui-queda-de-mortes-no-ce-a-governo-bolsonaro/
getting page http://blogs.opovo.com.br/politica/2019/04/16/em-100-dias-celio-studart-apresenta-23-projetos-sobre-animais/#respond
getting page http://blogs.opovo.com.br/politica/2019/04/18/presidencia-e-diretorias-do-bnb-entram-lista-para-indicacao-de-parlamentares/
getting page http://blogs.opovo.com.br/politica/2019/04/16/bolsonaro-lula-e-dilma-deram-passaportes-diplomaticos-a-edir-macedo-e-pelas-mesmas-razoes/
getting page http://blogs.opovo.com.br/politica/2019/04/16/em-100-dias-celio-studart-apresenta-23-projetos-sobre-animais/
getting page http://blogs.opovo.com.br/politica/2019/04/15/bolsonaro-concede-passaporte-diplomatico-para-bispo-edir-macedo-e-esposa/#respond
getting page http