In [None]:
import pandas as pd
import csv

import scrapy
import logging
import re

from scrapy.item import Item, Field
from scrapy.crawler import CrawlerRunner
from scrapy.exporters import CsvItemExporter
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from crochet import setup, wait_for

In [None]:
class EditalItem(Item):
    """Scrapy item holding the data scraped for a single edital.

    The fields are filled in by ``EditaisSpider``: ``date`` and ``tags`` on
    the listing page, the remaining fields on the edital's own page.
    """

    # Publication date as 'yyyy-mm-dd', derived from the edital URL.
    date = Field()
    # List of category tag texts taken from the listing entry.
    tags = Field()
    # Text of the page's h1.entry-title element.
    title = Field()
    # 'src' of the first image found inside the article.
    cover = Field()
    # Concatenated, stripped text of the page's text modules.
    summary = Field()
    # 'href' of the last link matched by the spider's results selector.
    results = Field()
    # 'href' of the first a.et_pb_button link ('' when absent).
    pdf = Field()


class QuoteAllDialect(csv.excel):
    """The standard ``csv.excel`` dialect, but with every field quoted."""

    quoting = csv.QUOTE_ALL


class QuoteAllCsvItemExporter(CsvItemExporter):
    """``CsvItemExporter`` variant that always writes using ``QuoteAllDialect``."""

    def __init__(self, *args, **kwargs):
        # Force the dialect: any 'dialect' keyword supplied by the caller is
        # replaced before the arguments reach the base exporter.
        kwargs['dialect'] = QuoteAllDialect
        super().__init__(*args, **kwargs)

In [None]:
class EditaisSpider(scrapy.Spider):
    """Crawl the editais listing and emit one ``EditalItem`` per edital.

    ``parse`` walks the paginated listing at ``start_urls``, reads the date
    and tags of every ``<article>`` entry and schedules a request for the
    edital's own page. ``parse_edital`` then completes the item with the
    title, cover image, summary, PDF link and results link.

    The feed settings in ``custom_settings`` write the items to
    ``editais.csv`` through ``QuoteAllCsvItemExporter``.
    """

    name = 'secult'

    # To crawl only a fixed number of listing pages, build the URLs explicitly:
    #   max_pages = 2
    #   start_urls = ['http://editais.cultura.ce.gov.br/category/editais/page/%s' % page
    #                 for page in range(1, max_pages + 1)]
    start_urls = ['http://editais.cultura.ce.gov.br/category/editais/']

    # NOTE(review): these two attributes are not referenced anywhere in this
    # class; the CSV output is driven by `custom_settings` below. Kept so that
    # any external reader of the attributes keeps working -- confirm whether
    # they are still needed.
    delimiter = ';'
    quotechar = "'"

    custom_settings = {
        'FEEDS': {
            'editais.csv': {
                'format': 'csv',
                'overwrite': True
            }
        },
        'FEED_EXPORT_ENCODING': 'utf-8',
        'FEED_EXPORT_FIELDS': ['date', 'tags', 'title', 'cover', 'summary', 'pdf', 'results'],
        'FEED_EXPORTERS': {
            # The exporter class is defined in the notebook itself, hence '__main__'.
            'csv': '__main__.QuoteAllCsvItemExporter',
        }
    }

    # Matches the '/yyyy/mm/dd/' segment of an edital URL.
    _DATE_IN_URL = re.compile(r'/(\d{4})/(\d{1,2})/(\d{1,2})(?:/|$)')

    # Selector for the links inside the results blurb of an edital page.
    _RESULTS_LINK_CSS = (
        'div.et_pb_row.et_pb_row_1 > '
        'div.et_pb_column.et_pb_column_1_3.et_pb_column_2.et_pb_css_mix_blend_mode_passthrough > '
        'div > div > div.et_pb_blurb_container > div a::attr("href")'
    )

    @classmethod
    def _date_from_url(cls, url):
        """Return the 'yyyy-mm-dd' date embedded in ``url``, or '' if none is found."""
        match = cls._DATE_IN_URL.search(url)
        if match is None:
            return ''
        year, month, day = match.groups()
        return f'{year}-{month}-{day}'

    def parse(self, response):
        """Parse one listing page.

        Yields a request to ``parse_edital`` for every article that has a
        link (carrying a partially filled ``EditalItem`` in ``meta['item']``),
        followed by a request for the next listing page when one exists.
        """
        for edital in response.css('article'):
            url = edital.xpath('h2/a/@href').get()
            if not url:
                # An entry without a link cannot be followed; skip it instead
                # of failing, so the remaining entries and the pagination
                # request below are still processed.
                self.logger.warning('Article without link on %s; skipped', response.url)
                continue

            date = self._date_from_url(url)
            if not date:
                self.logger.warning('No yyyy/mm/dd date found in URL %s', url)

            item = EditalItem()
            item['date'] = date
            item['tags'] = edital.xpath('p/a[@rel="category tag"]/text()').getall()

            # response.follow also resolves relative hrefs against the page URL.
            yield response.follow(url, callback=self.parse_edital, meta={'item': item})

        # Go to next page.
        # NOTE(review): this takes the first link inside div.pagination --
        # confirm it always points to the next (older) listing page.
        next_page = response.css('div.pagination').xpath('div/a/@href').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

    def parse_edital(self, response):
        """Complete the item received in ``response.meta['item']`` and yield it.

        ``pdf`` and ``results`` fall back to '' when the page has no matching
        link, so an edital without published results is still exported.
        """
        item = response.meta['item']
        item['title'] = response.css('h1.entry-title::text').get()
        item['cover'] = response.css('article img::attr("src")').get()
        item['summary'] = ''.join(response.css('div.et_pb_text_inner ::text').getall()).strip()
        item['pdf'] = response.css('a.et_pb_button::attr("href")').get(default='')

        # The last matching link is the one kept; pages without any results
        # link used to raise IndexError here and lose the whole item.
        result_links = response.css(self._RESULTS_LINK_CSS).getall()
        item['results'] = result_links[-1] if result_links else ''
        yield item


In [None]:
# Initialise crochet so the crawl below can be re-run from the notebook
# without having to restart the Twisted reactor.
setup()


@wait_for(100)
def run_scraper():
    """Start an ``EditaisSpider`` crawl and return the deferred it produces.

    The ``wait_for(100)`` decorator is given a timeout of 100, so callers
    receive the crawl's outcome rather than the deferred itself.
    """
    return CrawlerRunner().crawl(EditaisSpider)

In [None]:
# Start the crawl; the spider's FEEDS setting writes the items to 'editais.csv'.
run_scraper()

# Alternative kept for reference: running the spider through CrawlerProcess
# instead of the crochet-wrapped CrawlerRunner used above.
# process = CrawlerProcess()
# process.crawl(EditaisSpider)
# process.start()

In [None]:
# Load the feed file written by the spider and display it as the cell output.
df = pd.read_csv('editais.csv')
df

In [None]:
# Pick one random row to inspect a single edital.
# NOTE(review): no random_state is passed, so a different row is drawn on each
# run -- pass one if the output needs to be reproducible.
edital_sample = df.sample()
edital_sample