# Corpus compilation
- text type and genres
- characteristics according to task

In [1]:
import urllib3
import re
from bs4 import BeautifulSoup
import os
import justext

In [2]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

http = urllib3.PoolManager(10, headers=user_agent)


class Crawler:
    
    def __init__(self, corpus_path, max_files, seed_url, url_pattern):
        self.corpus_path = corpus_path
        self.max_files = max_files
        self.seed_url = seed_url
        self.url_pattern = url_pattern
        self.visited_links = {}
        self.to_be_visited = []
        
        if not os.path.exists(self.corpus_path):
            os.makedirs(self.corpus_path)
        
    def crawl(self):
        first_urls = self.get_page(self.seed_url)
        self.add_links(first_urls)
        next_link = self.get_next_link()
        
        file_counter = 1
        while next_link and file_counter < self.max_files:
            links = self.get_page(next_link)
            self.add_links(links)
            next_link = self.get_next_link()
            file_counter += 1
    
    def get_page(self, url):
        print("getting page {}".format(url))
        response = http.request('GET', url)

        # store text content
        paragraphs = justext.justext(response.data, justext.get_stoplist("Portuguese"))
        with open("{}/{}.txt".format(self.corpus_path, url.replace(".", "_").replace("/","-")), "w") as output_file:
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    output_file.write(paragraph.text)
        
        # get links
        soup = BeautifulSoup(response.data, 'html.parser')
        links = [link.get('href') for link in soup.findAll('a', attrs={'href': re.compile(self.url_pattern)})]

        return links

    def add_links(self, links):
        links = list(set(links))
        self.to_be_visited.extend([link for link in links if link not in self.visited_links])

    def get_next_link(self):
        next_link = self.to_be_visited.pop(0)
        self.visited_links[next_link] = None
        return next_link

In [3]:
crawler_tecnologia = Crawler("data/corpora/tecnologia", 50, "https://tecnoblog.net/281950/", "^https://tecnoblog\.net/\d+")
crawler_politica = Crawler("data/corpora/politica", 50, "http://blogs.opovo.com.br/politica/",
                           "^http://blogs\.opovo\.com\.br/politica/\d+")

In [4]:
crawler_politica.crawl()
crawler_tecnologia.crawl()

getting page http://blogs.opovo.com.br/politica/
getting page http://blogs.opovo.com.br/politica/2019/04/09/projeto-regula-jogos-eletronicos-como-esporte-e-gamers-como-atletas/
getting page http://blogs.opovo.com.br/politica/2019/04/09/projeto-regula-jogos-eletronicos-como-esporte-e-gamers-como-atletas/#respond
getting page http://blogs.opovo.com.br/politica/2019/04/09/deputado-cearense-quer-convocar-novo-ministro-da-educacao-para-prestar-esclarecimentos/
getting page http://blogs.opovo.com.br/politica/2019/04/08/analise-escolha-de-olavista-para-o-mec-pode-ampliar-desgaste-com-militares/#respond
getting page http://blogs.opovo.com.br/politica/2019/04/08/bolsonaro-demite-ministro-da-educacao-e-anuncia-substituto/#respond
getting page http://blogs.opovo.com.br/politica/2019/04/06/decisao-de-retirar-maracanau-de-programa-foi-de-moro-diz-deputado/
getting page http://blogs.opovo.com.br/politica/2019/04/03/guedes-diz-que-quem-discorda-da-necessidade-da-reforma-da-previdencia-tem-que-ser-int

getting page https://tecnoblog.net/281950/tecnocast-109-como-nascem-os-reviews/
getting page https://tecnoblog.net/282772/disney-completa-compra-fox/
getting page https://tecnoblog.net/159783/material-preto-mais-preto-de-todos-reflete-004-luz/
getting page https://tecnoblog.net/231603/lg-tv-oled-enrolavel/
getting page https://tecnoblog.net/282778/4g-velocidade-horario-pico-opensignal-2019/
getting page https://tecnoblog.net/272175/tv-oled-sony-a8f-review/
getting page https://tecnoblog.net/239766/lg-oled-tv-w8-c8-2018-lancamento-preco-brasil/
getting page https://tecnoblog.net/231107/lg-tv-oled-8k-88-polegadas/
getting page https://tecnoblog.net/283244/lg-k12-plus-review/
getting page https://tecnoblog.net/283574/parlamento-europeu-artigo-13-copyright/
getting page https://tecnoblog.net/282870/amazon-novo-kindle-2019/
getting page https://tecnoblog.net/282688/foruns-chans-deep-web-massacres/
getting page https://tecnoblog.net/281796/amazon-echo-alexa-portugues-brasil-beta/
getting pag