In [None]:
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
import logging

import time

class BlogsSpider(scrapy.Spider):
    """Spider that walks the Seeking Alpha "latest articles" listing.

    Each listing page is scanned for article links; every article is
    fetched and reduced to one item by ``parse_page``.  Listing pages are
    followed through their "next" link until none is found.  Items are
    exported as JSON to ``./results/blogs_result.json`` (see
    ``custom_settings``).
    """

    # Logging is configured while the class body executes, i.e. once, when
    # the class is defined.  Scrapy is told not to install its own root
    # handler so that the basicConfig() call below decides where log
    # records go.
    configure_logging(install_root_handler=False)

    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'FEED_URI': './results/blogs_result.json',
        'FEED_FORMAT': 'json'
    }

    # NOTE: basicConfig opens the log file immediately, so the ./logs
    # directory has to exist before this class is defined.
    logging.basicConfig(
        filename='./logs/scrapy_blogs_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    name = "seeking"
    start_urls = [
        'https://seekingalpha.com/latest-articles',
    ]

    # Seconds to pause before requesting the next listing page.  Kept as a
    # class attribute so a subclass (or a test) can shorten or disable it.
    page_pause_seconds = 30

    def parse(self, response):
        """Handle one listing page.

        Yields a request (callback ``parse_page``) for every article link
        found on the page, then a request (callback ``parse``) for the
        next listing page when a "next" link is present.
        """
        for article in response.xpath("//ul[@class='articles-list']/li"):
            article_link = article.xpath(
                ".//div[@class='media-body']/a[@class='a-title']/@href"
            ).extract_first()
            # extract_first() returns None when the list item has no title
            # link; there is nothing to follow in that case.
            if not article_link:
                continue
            yield response.follow(url=article_link, callback=self.parse_page)

        next_page = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_page is not None:
            # NOTE(review): time.sleep() is a blocking call; while it runs,
            # no other request or callback of this crawl can make progress.
            # It is kept to preserve the existing pacing between listing
            # pages -- consider DOWNLOAD_DELAY / AutoThrottle instead.
            time.sleep(self.page_pause_seconds)
            # response.follow resolves relative hrefs against the page URL.
            yield response.follow(url=next_page, callback=self.parse)

    def parse_page(self, response):
        """Extract one item from an article page.

        Fields read with ``extract_first()`` hold a single string or
        ``None`` when nothing matched; fields read with ``extract()`` hold
        a list of every match (possibly empty).
        """
        yield {
            'article_name': response.xpath("//h1[@itemprop='headline']/text()").extract_first(),
            'include': response.xpath("//*[@id='a-hd']/div[2]/span[3]/span[2]/@title").extract(),
            'time': response.xpath("//*[@id='a-hd']/div[2]/time/@content").extract_first(),
            'author': response.xpath("//*[@id='author-hd']/div[2]/div[1]/div[1]/a/span/text()").extract_first(),
            # 'about' and 'theme' keep the matched elements' markup rather
            # than their text.
            'about': response.xpath("//*[@id='about_primary_stocks']").extract(),
            'theme': response.xpath("//*[@class='a-themes']").extract(),
            # string(...) collapses the element's whole subtree to its text.
            'summary': response.xpath("string(//div[@class='a-sum'])").extract(),
            'article': response.xpath("string(//div[@itemprop='articleBody'])").extract()
        }

In [None]:
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

# Start from the project's settings and layer the run-specific values on
# top.  Settings.set() is used because the old ``settings.overrides``
# mapping no longer exists on Scrapy's Settings object.
settings = get_project_settings()
settings.set('FEED_FORMAT', 'json')
settings.set('FEED_URI', './results/blogs_result.json')
settings.set('USER_AGENT', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')

# Hand the assembled settings to the process; otherwise they are never used.
process = CrawlerProcess(settings)

process.crawl(BlogsSpider)
# start() runs the crawl and blocks until it has finished.
process.start()