<a href="https://colab.research.google.com/github/daniloaleixo/UOLCrawler/blob/master/UOLCrawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# UOL Crawler
 

In [51]:
# Settings for notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Show Python version
import platform
platform.python_version()

'3.6.9'

In [0]:
# Import scrapy
try:
    import scrapy
except:
    !pip install scrapy
    import scrapy
from scrapy.crawler import CrawlerProcess

In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import re
import lxml
import urllib.request
import urllib.parse
import http.cookiejar
from lxml.html import fragment_fromstring
import scrapy

In [0]:
all_links = {}

Prepare the requests

In [0]:
url = "https://www.uol.com.br/"
cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),('Accept', ' text/html, text/plain, text/css, text/sgml, */*; q=0.01')]

## Making the first run

In [0]:
link = opener.open(url)
content = link.read().decode("ISO-8859-1") 

In [0]:
pattern = re.compile('<a href="(https://www.uol.com.br/[^ ]*)"', re.DOTALL)
reg = re.findall(pattern, content)

In [6]:
reg[:5]

['https://www.uol.com.br/',
 'https://www.uol.com.br/mov/retrato',
 'https://www.uol.com.br/mov/mov-doc',
 'https://www.uol.com.br/podcasts',
 'https://www.uol.com.br/vivabem/podcast/maratona/']

I'll check all the links with not visited

In [0]:
for link in reg:
  if link not in all_links:
    all_links[str(link)] = 0

The only one that I really visited I'll set to 1

In [0]:
all_links["https://www.uol.com.br/"] = 1

# Looping through all the links

Now I'll loop through all the non-visited links and get their links (for tests purposes I'll limit the size of the dictionary to 1000 links

In [9]:
pass_index = 2
pattern = re.compile('<a href="(https://www.uol.com.br/[^ ]*)"', re.DOTALL)


while all(value == 1 for value in all_links.values()) == False and len(all_links) < 1000: 

  # Get the next url
  next_url = ""
  for link in all_links.keys():
    if all_links[link] == 0:
      next_url = link
      break;

  print('Passing to link number ', pass_index, " -> ", next_url, " | all_links size: ", len(all_links))  
  
  link = opener.open(next_url)
  content = link.read().decode("ISO-8859-1")
  reg = re.findall(pattern, content)

  # Include to all links
  for link in reg:
    if link not in all_links:
      all_links[str(link)] = 0

  pass_index = pass_index + 1
  all_links[next_url] = 1


Passing to link number  2  ->  https://www.uol.com.br/mov/retrato  | all_links size:  124
Passing to link number  3  ->  https://www.uol.com.br/mov/mov-doc  | all_links size:  148
Passing to link number  4  ->  https://www.uol.com.br/podcasts  | all_links size:  178
Passing to link number  5  ->  https://www.uol.com.br/vivabem/podcast/maratona/  | all_links size:  192
Passing to link number  6  ->  https://www.uol.com.br/esporte/podcast/posse-de-bola/  | all_links size:  196


KeyboardInterrupt: ignored

In [10]:
len(all_links)

196

# Extracting text


Based on tutorial https://www.jitsejan.com/using-scrapy-in-jupyter-notebook.html

### Setup a pipeline

This class creates a simple pipeline that writes all found items to a JSON file, where each line contains one JSON element.

In [0]:
import json

class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('quoteresult.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

### Define the spider

In [0]:
import logging

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json',                                 # Used for pipeline 2
        'FEED_URI': 'quoteresult.json'                        # Used for pipeline 2
    }
    
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

### Start the crawler

In [57]:
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(QuotesSpider)
process.start()

2019-12-07 15:03:54 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2019-12-07 15:03:54 [scrapy.utils.log] INFO: Versions: lxml 4.2.6.0, libxml2 2.9.8, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.6.9 (default, Nov  7 2019, 10:44:02) - [GCC 8.3.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.14.137+-x86_64-with-Ubuntu-18.04-bionic
2019-12-07 15:03:54 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'quoteresult.json', 'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


<Deferred at 0x7f9c9e1e15c0>

In [58]:
ll quoteresult.*

-rw-r--r-- 1 root 5551 Dec  7 15:03 quoteresult.jl
-rw-r--r-- 1 root 5573 Dec  7 15:03 quoteresult.json


In [59]:
!tail -n 2 quoteresult.jl

{"text": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", "author": "Eleanor Roosevelt", "tags": ["misattributed-eleanor-roosevelt"]}
{"text": "\u201cA day without sunshine is like, you know, night.\u201d", "author": "Steve Martin", "tags": ["humor", "obvious", "simile"]}


In [60]:
!tail -n 2 quoteresult.json

{"text": "\u201cA day without sunshine is like, you know, night.\u201d", "author": "Steve Martin", "tags": ["humor", "obvious", "simile"]}
]

### Run for one page

I'll do it just for one page for now

In [0]:
class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    start_urls = ['https://blog.scrapinghub.com']

    def parse(self, response):
        for title in response.css('.post-header>h2'):
            yield {'title': title.css('a ::text').get()}

        for next_page in response.css('a.next-posts-link'):
            yield response.follow(next_page, self.parse)

In [50]:
!scrapy runspider myspider.py

2019-12-07 14:56:08 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2019-12-07 14:56:08 [scrapy.utils.log] INFO: Versions: lxml 4.2.6.0, libxml2 2.9.8, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.6.9 (default, Nov  7 2019, 10:44:02) - [GCC 8.3.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.14.137+-x86_64-with-Ubuntu-18.04-bionic
Usage
=====
  scrapy runspider [options] <spider_file>

runspider: error: File not found: myspider.py



In [0]:
url = "https://www1.folha.uol.com.br/poder/2019/12/cgu-diz-que-inexiste-documento-citado-por-bolsonaro-sobre-laranjas-do-psl.shtml"
link = opener.open(url)
content = link.read().decode("utf-8")


In [34]:
content[:5000]

'\n\n\n\n\n\n<!doctype html>\n<html lang="pt-BR" data-version="prod@a4ccb1b7">\n  <head>\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta property="desliga:paywall" content="false">\r\n    <title>CGU diz que inexiste documento citado por Bolsonaro sobre laranjas do PSL - 07/12/2019 - Poder - Folha</title>\n  <meta name="description" content="Manifestação é resposta ao recurso feito pela Folha a partir de pedidos recusados por Moro dentro da Lei de Acesso à Informação">\n  <meta name="keywords" content="PSL, Jair Bolsonaro, laranjas do PSL, Gustavo Bebianno, Luciano Bivar, governo bolsonaro, folha  ">\n    <meta http-equiv="Content-Security-Policy" content="upgrade-insecure-requests">\n  \n      <link rel="canonical" href="https://www1.folha.uol.com.br/poder/2019/12/cgu-diz-que-inexiste-documento-citado-por-bolsonaro-sobre-laranjas-do-psl.shtml">\n  \n        <link rel="amphtml" href="https://www1.folha.uol.com.br/amp/poder/2019/12/cgu-di

In [0]:
pattern = re.compile('<body>(.*)</body>', re.DOTALL)
reg = re.findall(pattern, content)

In [0]:
reg = "<div>" + reg[0] + "</div>"

In [41]:
page = fragment_fromstring(content); page

# pattern = re.compile('<a href="(https://www.uol.com.br/[^ ]*)"', re.DOTALL)
# reg = re.findall(pattern, content)
# possible_classes = ["c-news__content"]


ParserError: ignored