In [52]:
import urllib3
import re
from bs4 import BeautifulSoup
import os
import justext
from nlputils.lexical import Preprocessing

ImportError: No module named 'nlputils'

In [47]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

http = urllib3.PoolManager(10, headers=user_agent)


class Crawler:
    
    def __init__(self, corpus_path, max_files, seed_url, url_pattern):
        self.corpus_path = corpus_path
        self.max_files = max_files
        self.seed_url = seed_url
        self.url_pattern = url_pattern
        self.visited_links = {}
        self.to_be_visited = []
        
        if not os.path.exists(self.corpus_path):
            os.makedirs(self.corpus_path)
        
    def crawl(self):
        first_urls = self.get_page(self.seed_url)
        self.add_links(first_urls)
        next_link = self.get_next_link()
        
        file_counter = 1
        while next_link and file_counter < self.max_files:
            links = self.get_page(next_link)
            self.add_links(links)
            next_link = self.get_next_link()
            file_counter += 1
    
    def get_links(self, page_data):
        links = re.findall(self.url_pattern, str(page_data))
        return links            
    
    def get_page(self, url):
        print("getting page {}".format(url))
        response = http.request('GET', url)

        # store text content
        paragraphs = justext.justext(response.data, justext.get_stoplist("Portuguese"))
        with open("{}/{}.txt".format(self.corpus_path, url.replace(".", "_").replace("/","-")), "w") as output_file:
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    output_file.write(paragraph.text)
        
        # get links
        links = self.get_links(response.data)
        
        return links

    def add_links(self, links):
        links = list(set(links))
        self.to_be_visited.extend([link for link in links if link not in self.visited_links])

    def get_next_link(self):
        next_link = self.to_be_visited.pop(0)
        self.visited_links[next_link] = None
        return next_link

In [48]:
crawler_tecnologia = Crawler("data/corpora/lutas", 500, "https://sportv.globo.com/site/combate/",
                             '"(https://sportv\.globo\.com/site/combate/noticia/.*?)"')
crawler_tecnologia.crawl()

getting page https://sportv.globo.com/site/combate/
getting page https://sportv.globo.com/site/combate/noticia/andrei-arlovski-acusa-augusto-sakai-de-ser-metido-e-aposta-em-disciplina-para-vencer-brasileiro.ghtml
getting page https://sportv.globo.com/site/combate/noticia/ufc-chega-a-fort-lauderdale-e-sera-o-grande-destaque-do-canal-combate-nesta-semana.ghtml
getting page https://sportv.globo.com/site/combate/noticia/irmao-de-fedor-emelianenko-machuca-o-rosto-ao-cair-de-bicicleta-e-deixa-luta-contra-klb.ghtml
getting page https://sportv.globo.com/site/combate/noticia/em-pesagem-animada-jacare-promete-esmagar-hermansson-pela-torcida-brasileira.ghtml
getting page https://sportv.globo.com/site/combate/noticia/andre-sergipano-e-escalado-para-o-contender-americano-e-vai-enfrentar-makhmud-muradov.ghtml
getting page https://sportv.globo.com/site/combate/noticia/com-dana-white-na-plateia-herbert-burns-quer-impressionar-e-fazer-do-titan-o-seu-contender.ghtml
getting page https://sportv.globo.com

getting page https://sportv.globo.com/site/combate/noticia/com-golpe-espetacular-pettis-surpreende-thompson-com-nocaute-no-ufc-nashville.ghtml
getting page https://sportv.globo.com/site/combate/noticia/israel-adesanya-vence-kelvin-gastelum-e-conquista-o-cinturao-interino-dos-medios-do-ufc.ghtml
getting page https://sportv.globo.com/site/combate/noticia/cerrone-fica-com-dois-bonus-e-cejudo-ganha-premio-de-performance-da-noite.ghtml
getting page https://sportv.globo.com/site/combate/noticia/elizeu-capoeira-finaliza-no-primeiro-round-e-emplaca-setima-vitoria-seguida-no-ufc.ghtml
getting page https://sportv.globo.com/site/combate/noticia/pedrita-deixa-olho-de-mccann-fechado-mas-perde-por-decisao-unanime-no-ufc-londres.ghtml
getting page https://sportv.globo.com/site/combate/noticia/poirier-vence-revanche-contra-max-holloway-e-e-campeao-interino-dos-pesos-leves-do-ufc.ghtml
getting page https://sportv.globo.com/site/combate/noticia/com-performance-dominante-jussier-formiga-bate-deiveson-por

getting page https://sportv.globo.com/site/combate/noticia/demian-maia-tem-atuacao-impecavel-e-finaliza-lyman-good-com-mata-leao-no-primeiro-round.ghtml
getting page https://sportv.globo.com/site/combate/noticia/alexandre-pantoja-tem-grande-atuacao-e-nocauteia-wilson-reis-no-primeiro-round-do-ufc-236.ghtml
getting page https://sportv.globo.com/site/combate/noticia/com-joelhadas-demolidoras-overeem-nocauteia-oleynik-no-primeiro-round-no-ufc-russia.ghtml
getting page https://sportv.globo.com/site/combate/noticia/kamaru-usman-massacra-tyron-woodley-e-se-torna-primeiro-africano-campeao-do-ufc.ghtml
getting page https://sportv.globo.com/site/combate/noticia/com-golpe-espetacular-pettis-surpreende-thompson-com-nocaute-no-ufc-nashville.ghtml
getting page https://sportv.globo.com/site/combate/noticia/israel-adesanya-vence-kelvin-gastelum-e-conquista-o-cinturao-interino-dos-medios-do-ufc.ghtml
getting page https://sportv.globo.com/site/combate/noticia/cerrone-fica-com-dois-bonus-e-cejudo-ganha-

getting page https://sportv.globo.com/site/combate/noticia/henry-cejudo-atropela-tj-dillashaw-em-32-segundos-e-mantem-titulo-no-ufc-brooklyn.ghtml
getting page https://sportv.globo.com/site/combate/noticia/demian-maia-tem-atuacao-impecavel-e-finaliza-lyman-good-com-mata-leao-no-primeiro-round.ghtml
getting page https://sportv.globo.com/site/combate/noticia/alexandre-pantoja-tem-grande-atuacao-e-nocauteia-wilson-reis-no-primeiro-round-do-ufc-236.ghtml
getting page https://sportv.globo.com/site/combate/noticia/com-joelhadas-demolidoras-overeem-nocauteia-oleynik-no-primeiro-round-no-ufc-russia.ghtml
getting page https://sportv.globo.com/site/combate/noticia/kamaru-usman-massacra-tyron-woodley-e-se-torna-primeiro-africano-campeao-do-ufc.ghtml
getting page https://sportv.globo.com/site/combate/noticia/com-golpe-espetacular-pettis-surpreende-thompson-com-nocaute-no-ufc-nashville.ghtml
getting page https://sportv.globo.com/site/combate/noticia/israel-adesanya-vence-kelvin-gastelum-e-conquista

getting page https://sportv.globo.com/site/combate/noticia/menifield-domina-mamute-e-vence-por-nocaute-tecnico-no-primeiro-round-no-ufc-brooklyn.ghtml
getting page https://sportv.globo.com/site/combate/noticia/henry-cejudo-atropela-tj-dillashaw-em-32-segundos-e-mantem-titulo-no-ufc-brooklyn.ghtml
getting page https://sportv.globo.com/site/combate/noticia/demian-maia-tem-atuacao-impecavel-e-finaliza-lyman-good-com-mata-leao-no-primeiro-round.ghtml
getting page https://sportv.globo.com/site/combate/noticia/alexandre-pantoja-tem-grande-atuacao-e-nocauteia-wilson-reis-no-primeiro-round-do-ufc-236.ghtml
getting page https://sportv.globo.com/site/combate/noticia/com-joelhadas-demolidoras-overeem-nocauteia-oleynik-no-primeiro-round-no-ufc-russia.ghtml
getting page https://sportv.globo.com/site/combate/noticia/kamaru-usman-massacra-tyron-woodley-e-se-torna-primeiro-africano-campeao-do-ufc.ghtml
getting page https://sportv.globo.com/site/combate/noticia/com-golpe-espetacular-pettis-surpreende-t

getting page https://sportv.globo.com/site/combate/noticia/justin-gaethje-cumpre-o-prometido-e-nocauteia-edson-barboza-no-ufc-filadelfia.ghtml
getting page https://sportv.globo.com/site/combate/noticia/menifield-domina-mamute-e-vence-por-nocaute-tecnico-no-primeiro-round-no-ufc-brooklyn.ghtml
getting page https://sportv.globo.com/site/combate/noticia/henry-cejudo-atropela-tj-dillashaw-em-32-segundos-e-mantem-titulo-no-ufc-brooklyn.ghtml
getting page https://sportv.globo.com/site/combate/noticia/demian-maia-tem-atuacao-impecavel-e-finaliza-lyman-good-com-mata-leao-no-primeiro-round.ghtml
getting page https://sportv.globo.com/site/combate/noticia/alexandre-pantoja-tem-grande-atuacao-e-nocauteia-wilson-reis-no-primeiro-round-do-ufc-236.ghtml
getting page https://sportv.globo.com/site/combate/noticia/com-joelhadas-demolidoras-overeem-nocauteia-oleynik-no-primeiro-round-no-ufc-russia.ghtml
getting page https://sportv.globo.com/site/combate/noticia/kamaru-usman-massacra-tyron-woodley-e-se-to

getting page https://sportv.globo.com/site/combate/noticia/sheymon-moraes-erra-no-fim-e-e-derrotado-por-sodiq-yussuf-na-decisao-unanime-dos-juizes.ghtml
getting page https://sportv.globo.com/site/combate/noticia/justin-gaethje-cumpre-o-prometido-e-nocauteia-edson-barboza-no-ufc-filadelfia.ghtml
getting page https://sportv.globo.com/site/combate/noticia/menifield-domina-mamute-e-vence-por-nocaute-tecnico-no-primeiro-round-no-ufc-brooklyn.ghtml
getting page https://sportv.globo.com/site/combate/noticia/henry-cejudo-atropela-tj-dillashaw-em-32-segundos-e-mantem-titulo-no-ufc-brooklyn.ghtml
getting page https://sportv.globo.com/site/combate/noticia/demian-maia-tem-atuacao-impecavel-e-finaliza-lyman-good-com-mata-leao-no-primeiro-round.ghtml
getting page https://sportv.globo.com/site/combate/noticia/alexandre-pantoja-tem-grande-atuacao-e-nocauteia-wilson-reis-no-primeiro-round-do-ufc-236.ghtml
getting page https://sportv.globo.com/site/combate/noticia/com-joelhadas-demolidoras-overeem-nocau

getting page https://sportv.globo.com/site/combate/noticia/apos-nocaute-avassalador-contra-golm-pavlovich-leva-bonus-por-performance-da-noite.ghtml
getting page https://sportv.globo.com/site/combate/noticia/sheymon-moraes-erra-no-fim-e-e-derrotado-por-sodiq-yussuf-na-decisao-unanime-dos-juizes.ghtml
getting page https://sportv.globo.com/site/combate/noticia/justin-gaethje-cumpre-o-prometido-e-nocauteia-edson-barboza-no-ufc-filadelfia.ghtml
getting page https://sportv.globo.com/site/combate/noticia/menifield-domina-mamute-e-vence-por-nocaute-tecnico-no-primeiro-round-no-ufc-brooklyn.ghtml
getting page https://sportv.globo.com/site/combate/noticia/henry-cejudo-atropela-tj-dillashaw-em-32-segundos-e-mantem-titulo-no-ufc-brooklyn.ghtml
getting page https://sportv.globo.com/site/combate/noticia/demian-maia-tem-atuacao-impecavel-e-finaliza-lyman-good-com-mata-leao-no-primeiro-round.ghtml
getting page https://sportv.globo.com/site/combate/noticia/alexandre-pantoja-tem-grande-atuacao-e-nocaute

getting page https://sportv.globo.com/site/combate/noticia/stefan-struve-resiste-a-pressao-inicial-e-finaliza-marcos-pezao-e-se-aposenta-no-ufc-praga.ghtml
getting page https://sportv.globo.com/site/combate/noticia/apos-nocaute-avassalador-contra-golm-pavlovich-leva-bonus-por-performance-da-noite.ghtml
getting page https://sportv.globo.com/site/combate/noticia/sheymon-moraes-erra-no-fim-e-e-derrotado-por-sodiq-yussuf-na-decisao-unanime-dos-juizes.ghtml
getting page https://sportv.globo.com/site/combate/noticia/justin-gaethje-cumpre-o-prometido-e-nocauteia-edson-barboza-no-ufc-filadelfia.ghtml
getting page https://sportv.globo.com/site/combate/noticia/menifield-domina-mamute-e-vence-por-nocaute-tecnico-no-primeiro-round-no-ufc-brooklyn.ghtml
getting page https://sportv.globo.com/site/combate/noticia/ariane-lipski-sofre-com-luta-agarrada-de-calderwood-e-perde-em-sua-estreia-no-ultimate.ghtml
getting page https://sportv.globo.com/site/combate/noticia/glover-leva-susto-no-inicio-mas-finali

getting page https://sportv.globo.com/site/combate/noticia/justin-gaethje-cumpre-o-prometido-e-nocauteia-edson-barboza-no-ufc-filadelfia.ghtml
getting page https://sportv.globo.com/site/combate/noticia/diego-ferreira-da-aula-de-trocacao-e-defesa-de-queda-e-derrota-ruslam-khabilov-no-ufc-praga.ghtml
getting page https://sportv.globo.com/site/combate/noticia/stefan-struve-resiste-a-pressao-inicial-e-finaliza-marcos-pezao-e-se-aposenta-no-ufc-praga.ghtml
getting page https://sportv.globo.com/site/combate/noticia/edwards-provoca-masvidal-e-os-dois-trocam-socos-nos-bastidores-apos-o-ufc-londres.ghtml
getting page https://sportv.globo.com/site/combate/noticia/jorge-masvidal-surpreende-darren-till-e-nocauteia-rival-na-luta-principal-do-ufc-londres.ghtml
