# Corpus compilation
- text type and genres
- characteristics according to task

In [1]:
import urllib3
import re
from bs4 import BeautifulSoup

In [2]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

http = urllib3.PoolManager(10, headers=user_agent)


class Crawler:
    
    def __init__(self, corpus_path, max_files, seed_url, url_pattern, clean_with_heuristics=True):
        self.corpus_path = corpus_path
        self.max_files = max_files
        self.seed_url = seed_url
        self.url_pattern = url_pattern
        self.clean_with_heuristics = clean_with_heuristics
        self.visited_links = {}
        self.to_be_visited = []
        
    def crawl(self):
        first_urls = self.get_page(self.seed_url)
        self.add_links(first_urls)
        next_link = self.get_next_link()
        
        file_counter = 1
        while next_link and file_counter < self.max_files:
            links = self.get_page(next_link)
            self.add_links(links)
            next_link = self.get_next_link()
            file_counter += 1

    def heuristics_clean(self, lines):
        # lines with one or two word only
        lines = [line for line in lines if len(line.split()) > 3]
        return lines

    def get_page(self, url):
        print("getting page:", url)
        page = http.urlopen("GET", url)
        soup = BeautifulSoup(page.data, 'html.parser')

        # remove some undesired tags
        for script in soup(["script", "style", "meta", "title", "head"]):
            script.extract()
        text = soup.get_text()

        # remove blank lines
        lines = [line for line in text.split("\n") if len(line.strip()) > 0]

        # get links
        links = [link.get('href') for link in soup.findAll('a', attrs={'href': re.compile(self.url_pattern)})]

        if self.clean_with_heuristics:
            lines = self.heuristics_clean(lines)

        # store text content
        with open("{}/{}.txt".format(self.corpus_path, url.replace(".", "_").replace("/","-")), "w") as output_file:
            output_file.write("\n".join(lines))

        return links

    def add_links(self, links):
        links = list(set(links))
        self.to_be_visited.extend([link for link in links if link not in self.visited_links])

    def get_next_link(self):
        next_link = self.to_be_visited.pop(0)
        self.visited_links[next_link] = None
        return next_link

In [3]:
crawler_tecnologia = Crawler("data/corpora/tecnologia", 100, "https://tecnoblog.net/281950/", "^https://tecnoblog\.net/\d+")
crawler_politica = Crawler("data/corpora/politica", 100, "http://blogs.opovo.com.br/politica/",
                           "^http://blogs\.opovo\.com\.br/politica/\d+")

In [4]:
crawler_politica.crawl()
crawler_tecnologia.crawl()

getting page: http://blogs.opovo.com.br/politica/
getting page: http://blogs.opovo.com.br/politica/2019/03/13/monstruosidade-e-covardia-sem-tamanho-diz-bolsonaro-sobre-tragedia-em-suzano/#respond
getting page: http://blogs.opovo.com.br/politica/2019/03/13/ciro-e-cid-gomes-debaterao-reforma-da-previdencia-sexta-feira-a-assembleia/#respond
getting page: http://blogs.opovo.com.br/politica/2019/03/12/quando-o-acusado-do-crime-mora-no-mesmo-endereco-do-politico/#respond
getting page: http://blogs.opovo.com.br/politica/2019/03/12/ciro-autorizou-mauro-filho-discutir-reforma-porque-eles-nao-tem-proposta-nenhuma/#respond
getting page: http://blogs.opovo.com.br/politica/2019/03/13/advogados-questionam-na-justica-se-houve-ou-nao-quebra-de-sigilos-de-camilo-e-cid-gomes/
getting page: http://blogs.opovo.com.br/politica/2019/03/12/quando-o-acusado-do-crime-mora-no-mesmo-endereco-do-politico/
getting page: http://blogs.opovo.com.br/politica/2019/03/09/eunicio-nega-recebimento-de-propina-da-petrobras-

getting page: http://blogs.opovo.com.br/politica/2019/03/13/advogados-questionam-na-justica-se-houve-ou-nao-quebra-de-sigilos-de-camilo-e-cid-gomes/#respond
getting page: http://blogs.opovo.com.br/politica/2019/03/01/camilo-diz-desconhecer-quebra-de-sigilo-e-afirma-que-nunca-foi-notificado/
getting page: http://blogs.opovo.com.br/politica/2019/03/12/quando-o-acusado-do-crime-mora-no-mesmo-endereco-do-politico/#comment-31387
getting page: http://blogs.opovo.com.br/politica/2019/03/12/quando-o-acusado-do-crime-mora-no-mesmo-endereco-do-politico/#comment-31473
getting page: http://blogs.opovo.com.br/politica/2019/02/25/ciro-gomes-diz-que-tendencia-e-bolsonaro-renunciar-antes-de-terminar-mandato/
getting page: http://blogs.opovo.com.br/politica/2019/03/12/quando-o-acusado-do-crime-mora-no-mesmo-endereco-do-politico/#comment-31380
getting page: http://blogs.opovo.com.br/politica/2019/03/12/quando-o-acusado-do-crime-mora-no-mesmo-endereco-do-politico/#comment-31378
getting page: http://blogs

getting page: https://tecnoblog.net/270486/qualcomm-snapdragon-855-tudo-sobre/
getting page: https://tecnoblog.net/269378/iphone-xs-xs-max-review/
getting page: https://tecnoblog.net/159061/galaxy-star-young-core-2-ace-4-anuncio/
getting page: https://tecnoblog.net/267208/samsung-galaxy-m-fim-linha-galaxy-j-rumor/
getting page: https://tecnoblog.net/88088/lte-4g-como-funciona/
getting page: https://tecnoblog.net/267727/google-pixel-modo-night-sight/
getting page: https://tecnoblog.net/243857/notch-o-que-e-lista-celulares/
getting page: https://tecnoblog.net/279699/resumo-evento-galaxy-s10-fold-buds/
getting page: https://tecnoblog.net/151947/samsung-anuncia-galaxy-s5/
getting page: https://tecnoblog.net/192846/galaxy-s7-edge-review/
getting page: https://tecnoblog.net/184607/galaxy-s6-edge-plus-review/
getting page: https://tecnoblog.net/280665/samsung-fabrica-chip-de-512-gb-com-dobro-da-velocidade/
getting page: https://tecnoblog.net/260237/bem-estar-digital-android-pie/
getting page: