In [3]:
import scrapy

In [18]:
class Spider12(scrapy.Spider):
    """Spider that crawls Página/12 section listings and scrapes each article.

    ``parse`` handles the section listing pages in ``start_urls``: it follows
    the featured article, every article in the list and the "next page" link.
    ``parse_nota`` handles an individual article page and yields one item
    (a plain dict) per article.
    """

    name = 'spider12'
    allowed_domains = ['pagina12.com.ar']
    custom_settings = {
        # Export every yielded item to a JSON feed file.
        'FEED_FORMAT': 'json',
        'FEED_URI': 'resultados_scrapy.json',
        # Maximum link depth followed from the start URLs; since pagination
        # links are followed too, this also bounds how far pagination goes.
        'DEPTH_LIMIT': 2,
    }
    start_urls = [
        'https://www.pagina12.com.ar/secciones/el-pais',
        'https://www.pagina12.com.ar/secciones/economia',
        'https://www.pagina12.com.ar/secciones/sociedad',
        'https://www.pagina12.com.ar/suplementos/cultura-y-espectaculos',
        'https://www.pagina12.com.ar/secciones/ciencia',
        'https://www.pagina12.com.ar/secciones/el-mundo',
        'https://www.pagina12.com.ar/secciones/deportes',
        'https://www.pagina12.com.ar/secciones/contratapa',
    ]

    def parse(self, response):
        """Parse a section listing page.

        Yields requests for the featured article and for every article in the
        list (handled by ``parse_nota``), plus a request for the next listing
        page (handled by ``parse`` again) when a "next" link is present.
        """
        # Featured article
        nota_promocionada = response.xpath(
            '//div[@class="featured-article__container"]/h2/a/@href'
        ).get()
        if nota_promocionada is not None:
            yield response.follow(nota_promocionada, callback=self.parse_nota)

        # Article list
        notas = response.xpath('//ul[@class="article-list"]//li//a/@href').getall()
        for nota in notas:
            yield response.follow(nota, callback=self.parse_nota)

        # Link to the next listing page. Extract the href string with .get():
        # response.follow() does not accept a SelectorList, which is what
        # .xpath() returns on its own.
        next_page = response.xpath('//a[@class="pagination-btn-next"]/@href').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_nota(self, response):
        """Parse a single article page and yield its fields as a dict.

        Fields extracted with ``.get()`` are ``None`` when the corresponding
        element is missing; ``body`` is an empty string when no paragraph
        text is found.
        """
        title = response.xpath('//div[@class="article-title"]/text()').get()
        date = response.xpath('//span[@pubdate="pubdate"]/@datetime').get()
        summary = response.xpath('//div[@class="article-summary"]/text()').get()
        prefix = response.xpath('//div[@class="article-prefix"]/text()').get()
        # Join the text of every paragraph of the article body, separated by
        # blank lines.
        body = "\n\n".join(
            response.xpath(
                '//div[@class="article-body"]//div[@class="article-text"]//p/text()'
            ).getall()
        )
        author = response.xpath('//div[@class="article-author"]//span//a/text()').get()

        yield {
            'url': response.url,
            'title': title,
            'date': date,
            'summary': summary,
            'prefix': prefix,
            'body': body,
            'author': author,
        }
        

In [19]:
from scrapy.crawler import CrawlerProcess

In [20]:
# Run the spider from inside the notebook. The Twisted reactor started by
# CrawlerProcess.start() cannot be started a second time in the same process,
# so re-running this cell without restarting the kernel raises
# ReactorNotRestartable.
process = CrawlerProcess()
# Use the spider class this notebook defines (Spider12); no Spider12v2 is
# defined in the visible cells, so that name fails on a fresh kernel.
process.crawl(Spider12)
process.start()

2019-12-05 16:39:38 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2019-12-05 16:39:38 [scrapy.utils.log] INFO: Versions: lxml 4.4.2.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.4 (tags/v3.7.4:e09359112e, Jul  8 2019, 19:29:22) [MSC v.1916 32 bit (Intel)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Windows-10-10.0.18362-SP0
2019-12-05 16:39:38 [scrapy.crawler] INFO: Overridden settings: {'DEPTH_LIMIT': 2, 'FEED_FORMAT': 'json', 'FEED_URI': 'resultados_scrapy.json'}
2019-12-05 16:39:38 [scrapy.extensions.telnet] INFO: Telnet Password: d974909fe79abc4a
2019-12-05 16:39:38 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2019-12-05 16:39:38 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.htt

ReactorNotRestartable: 

In [8]:
# List the working directory (e.g. to check that resultados_scrapy.json exists).
!ls

chromedriver.exe
Notas_Pagina12.csv
resultados_scrapy.json
web_scraping_01_bs4_and_requests.ipynb
web_scraping_02_selenium.ipynb
web_scraping_03_APIs.ipynb
web_scraping_04_scrapy.ipynb
xpath-cheatsheet-selenium.pdf


In [9]:
# Print the contents of the JSON feed file named in the spider's FEED_URI setting.
!cat resultados_scrapy.json