# Corpus compilation
- text type and genres
- characteristics according to task

In [4]:
import urllib3
import re
from bs4 import BeautifulSoup
import os

In [8]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

http = urllib3.PoolManager(10, headers=user_agent)


class Crawler:
    
    def __init__(self, corpus_path, max_files, seed_url, url_pattern, clean_with_heuristics=True):
        self.corpus_path = corpus_path
        self.max_files = max_files
        self.seed_url = seed_url
        self.url_pattern = url_pattern
        self.clean_with_heuristics = clean_with_heuristics
        self.visited_links = {}
        self.to_be_visited = []
        
        if not os.path.exists(self.corpus_path):
            os.makedirs(self.corpus_path)
        
    def crawl(self):
        first_urls = self.get_page(self.seed_url)
        self.add_links(first_urls)
        next_link = self.get_next_link()
        
        file_counter = 1
        while next_link and file_counter < self.max_files:
            links = self.get_page(next_link)
            self.add_links(links)
            next_link = self.get_next_link()
            file_counter += 1

    def heuristics_clean(self, lines):
        # lines with one or two word only
        lines = [line for line in lines if len(line.split()) > 3]
        return lines

    def get_page(self, url):
        print("getting page {}".format(url))
        page = http.urlopen("GET", url)
        soup = BeautifulSoup(page.data, 'html.parser')

        # remove some undesired tags
        for script in soup(["script", "style", "meta", "title", "head"]):
            script.extract()
        text = soup.get_text()

        # remove blank lines
        lines = [line for line in text.split("\n") if len(line.strip()) > 0]

        # get links
        links = [link.get('href') for link in soup.findAll('a', attrs={'href': re.compile(self.url_pattern)})]

        if self.clean_with_heuristics:
            lines = self.heuristics_clean(lines)

        # store text content
        with open("{}/{}.txt".format(self.corpus_path, url.replace(".", "_").replace("/","-")), "w") as output_file:
            output_file.write("\n".join(lines))

        return links

    def add_links(self, links):
        links = list(set(links))
        self.to_be_visited.extend([link for link in links if link not in self.visited_links])

    def get_next_link(self):
        next_link = self.to_be_visited.pop(0)
        self.visited_links[next_link] = None
        return next_link

In [9]:
crawler_tecnologia = Crawler("data/corpora/tecnologia", 500, "https://tecnoblog.net/281950/", "^https://tecnoblog\.net/\d+")
crawler_politica = Crawler("data/corpora/politica", 500, "http://blogs.opovo.com.br/politica/",
                           "^http://blogs\.opovo\.com\.br/politica/\d+")

In [None]:
crawler_politica.crawl()
crawler_tecnologia.crawl()

getting page http://blogs.opovo.com.br/politica/
getting page http://blogs.opovo.com.br/politica/2019/04/03/em-israel-bolsonaro-critica-leis-de-desarmamento-conheca-a-legislacao-a-respeito-israelense-sobre-posse-e-porte/
getting page http://blogs.opovo.com.br/politica/2019/04/09/deputado-cearense-quer-convocar-novo-ministro-da-educacao-para-prestar-esclarecimentos/
getting page http://blogs.opovo.com.br/politica/2019/04/09/deputado-cearense-quer-convocar-novo-ministro-da-educacao-para-prestar-esclarecimentos/#respond
getting page http://blogs.opovo.com.br/politica/2019/04/09/projeto-regula-jogos-eletronicos-como-esporte-e-gamers-como-atletas/
getting page http://blogs.opovo.com.br/politica/2019/04/08/quem-e-o-novo-ministro-da-educacao-de-jair-bolsonaro/
getting page http://blogs.opovo.com.br/politica/2019/04/08/analise-escolha-de-olavista-para-o-mec-pode-ampliar-desgaste-com-militares/
getting page http://blogs.opovo.com.br/politica/2019/04/03/guedes-diz-que-quem-discorda-da-necessidad

getting page http://blogs.opovo.com.br/politica/2017/10/19/cearenses-votam-favor-de-temer-na-ccj/
getting page http://blogs.opovo.com.br/politica/2019/04/03/fraca-timida-e-opaca-diz-guimaraes-sobre-atuacao-de-guedes-na-ccjc/#respond
getting page http://blogs.opovo.com.br/politica/2019/03/27/nao-vou-jogar-domino-com-lula-e-temer-no-xadrez-diz-bolsonaro-sobre-articulacao/
getting page http://blogs.opovo.com.br/politica/2019/04/03/em-israel-bolsonaro-critica-leis-de-desarmamento-conheca-a-legislacao-a-respeito-israelense-sobre-posse-e-porte/#comment-32650
getting page http://blogs.opovo.com.br/politica/2019/04/03/em-israel-bolsonaro-critica-leis-de-desarmamento-conheca-a-legislacao-a-respeito-israelense-sobre-posse-e-porte/#comment-32901
getting page http://blogs.opovo.com.br/politica/2019/03/28/bolsonaro-muda-tom-e-diz-que-ideia-e-rememorar-nao-comemorar-golpe-de-1964/
getting page http://blogs.opovo.com.br/politica/2019/04/08/bolsonaro-demite-ministro-da-educacao-e-anuncia-substituto/
g

getting page http://blogs.opovo.com.br/politica/2019/03/28/bolsonaro-muda-tom-e-diz-que-ideia-e-rememorar-nao-comemorar-golpe-de-1964/#comment-32296
getting page http://blogs.opovo.com.br/politica/2019/03/28/bolsonaro-muda-tom-e-diz-que-ideia-e-rememorar-nao-comemorar-golpe-de-1964/#comment-32307
getting page http://blogs.opovo.com.br/politica/2019/03/28/bolsonaro-muda-tom-e-diz-que-ideia-e-rememorar-nao-comemorar-golpe-de-1964/#comment-32301
getting page http://blogs.opovo.com.br/politica/2019/03/28/bolsonaro-muda-tom-e-diz-que-ideia-e-rememorar-nao-comemorar-golpe-de-1964/#comment-32295
getting page http://blogs.opovo.com.br/politica/2019/03/28/bolsonaro-muda-tom-e-diz-que-ideia-e-rememorar-nao-comemorar-golpe-de-1964/#respond
getting page http://blogs.opovo.com.br/politica/2017/10/06/capitao-wagner-volta-atras-com-projeto-que-proibe-venda-de-armas-de-brinquedo/#respond
getting page http://blogs.opovo.com.br/politica/2017/10/06/capitao-wagner-volta-atras-com-projeto-que-proibe-venda-