<h1>Scrapy</h1>

In [1]:
#importando bibliotecas
import scrapy
from scrapy.crawler import CrawlerProcess
import json
import logging
import pandas as pd

In [2]:
# Define an item pipeline class that stores the scraped quotes in a file.
# The .jl extension here means JSON Lines (not Julia): one JSON object per line.

class JsonWriterPipeline(object):
    """Item pipeline that writes every scraped item to ``quoteresult.jl``.

    The output is in JSON Lines format: each item is serialized with
    ``json.dumps`` and written on its own line.
    """

    # Handle of the output file; stays None until open_spider() has run.
    file = None

    def open_spider(self, spider):
        """Open (and truncate) the output file.

        Args:
            spider: the spider being opened (unused).
        """
        # Pin the encoding so the file does not depend on the platform's
        # default locale encoding.
        self.file = open('quoteresult.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file if it was opened.

        Args:
            spider: the spider being closed (unused).
        """
        # Guard against open_spider() never having run (or having failed),
        # so that closing does not raise AttributeError.
        if self.file is not None:
            self.file.close()
            self.file = None

    def process_item(self, item, spider):
        """Serialize ``item`` as one JSON line and pass it on unchanged.

        Args:
            item: the scraped item; anything accepted by ``dict()``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object that was passed in.
        """
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

In [3]:
#Criando uma classe Spider

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes from pages 1-4 of quotes.toscrape.com.

    Every scraped item is a dict with the keys ``text``, ``author`` and
    ``tags``. Items are stored in two ways, both configured in
    ``custom_settings``:

    * through the ``__main__.JsonWriterPipeline`` item pipeline, and
    * through a JSON feed export written to ``quoteresult.json``.
    """

    name = "quotes"

    # range(1, 5) yields the page numbers 1, 2, 3 and 4.
    start_urls = [f'http://quotes.toscrape.com/page/{n}/' for n in range(1, 5)]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},  # Used for pipeline 1
        # Used for pipeline 2. FEEDS replaces the deprecated FEED_FORMAT /
        # FEED_URI pair. 'overwrite' keeps the file a single valid JSON array
        # when the crawl is run more than once.
        'FEEDS': {
            'quoteresult.json': {'format': 'json', 'overwrite': True},
        },
    }

    def parse(self, response):
        """Yield one dict per ``div.quote`` element found in ``response``.

        Args:
            response: the downloaded page for one of ``start_urls``.

        Yields:
            dict: ``text`` and ``author`` hold the first matching text node
            (``None`` when nothing matches); ``tags`` is the list of all
            matching tag texts.
        """
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('span small::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }

In [4]:
# To look up your own browser's user agent string: https://www.whatsmyua.info/
crawler_settings = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
}

# Build the crawler process with the settings above, register the spider,
# then launch the crawl.
# NOTE(review): re-running this cell in the same kernel is expected to fail
# (Twisted reactors are not restartable) - restart the kernel first; confirm.
process = CrawlerProcess(crawler_settings)
process.crawl(QuotesSpider)
process.start()

2021-05-18 16:54:39 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-05-18 16:54:39 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform Windows-10-10.0.19041-SP0
2021-05-18 16:54:39 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-05-18 16:54:39 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
  exporter = cls(crawler)



In [5]:
# If the crawl worked, there should now be a JSON Lines file (quoteresult.jl)
# and a JSON file (quoteresult.json) on disk.
# Load the JSON file into a DataFrame and display it.

feed_path = 'quoteresult.json'
dfjson = pd.read_json(feed_path)
dfjson

Unnamed: 0,text,author,tags
0,"“I love you without knowing how, or when, or f...",Pablo Neruda,"[love, poetry]"
1,“For every minute you are angry you lose sixty...,Ralph Waldo Emerson,[happiness]
2,"“If you judge people, you have no time to love...",Mother Teresa,[attributed-no-source]
3,“Anyone who thinks sitting in church can make ...,Garrison Keillor,"[humor, religion]"
4,“Beauty is in the eye of the beholder and it m...,Jim Henson,[humor]
5,"“Today you are You, that is truer than true. T...",Dr. Seuss,"[comedy, life, yourself]"
6,"“If you want your children to be intelligent, ...",Albert Einstein,"[children, fairy-tales]"
7,“It is impossible to live without failing at s...,J.K. Rowling,[]
8,“Logic will get you from A to Z; imagination w...,Albert Einstein,[imagination]
9,"“One good thing about music, when it hits you,...",Bob Marley,[music]
