# Experimenting with scrapy

Starting link: https://www.sec.gov/Archives/edgar/data/1047335/000143774920023548/0001437749-20-023548-index.htm  

In [2]:
!pip install -- scrapy

Collecting scrapy
  Downloading Scrapy-2.4.0-py2.py3-none-any.whl (239 kB)
[K     |████████████████████████████████| 239 kB 1.8 MB/s eta 0:00:01
[?25hCollecting queuelib>=1.4.2
  Downloading queuelib-1.5.0-py2.py3-none-any.whl (13 kB)
Collecting w3lib>=1.17.0
  Downloading w3lib-1.22.0-py2.py3-none-any.whl (20 kB)
Collecting Twisted>=17.9.0
  Downloading Twisted-20.3.0-cp37-cp37m-macosx_10_6_intel.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 3.3 MB/s eta 0:00:01
[?25hCollecting PyDispatcher>=2.0.5
  Downloading PyDispatcher-2.0.5.tar.gz (34 kB)
Collecting parsel>=1.5.0
  Downloading parsel-1.6.0-py2.py3-none-any.whl (13 kB)
Collecting zope.interface>=4.1.3
  Downloading zope.interface-5.1.2-cp37-cp37m-macosx_10_9_x86_64.whl (193 kB)
[K     |████████████████████████████████| 193 kB 3.2 MB/s eta 0:00:01
[?25hCollecting protego>=0.1.15
  Downloading Protego-0.1.16.tar.gz (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 1.3 MB/s eta 0:00:01
[?25hCollecting

## General info on 8K publication

## Objectives:
* Scrape company name and CIK number from the filing details
* Follow the link to the actual 8K publication
* Scrape the title of the first item of the publication

In [1]:
import scrapy
from scrapy import Selector
from scrapy.crawler import CrawlerProcess
import requests
import pandas as pd
import logging
import json
from urllib.parse import urljoin, urlparse
print("Libraries imported")

Libraries imported


In [2]:
# Item pipeline that writes every scraped item to a JSON Lines file (one JSON object per line)
class JsonWriterPipeline(object):
    """Item pipeline that writes scraped items to ``8kform.jl``.

    Each item is serialized as a single JSON object followed by a newline.
    """

    def open_spider(self, spider):
        """Open the output file for writing (truncating any previous content)."""
        self.file = open('8kform.jl', 'w')

    def close_spider(self, spider):
        """Close the output file opened in ``open_spider``."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as one JSON line and hand it back unchanged."""
        record = dict(item)
        self.file.write("%s\n" % json.dumps(record))
        return item

In [3]:
#spider
class Spider_8k(scrapy.Spider):
    """Scrape an EDGAR filing-index page, then follow its link to the 8-K document.

    Two items are yielded: one with the filer's ``Company`` and ``CIK`` taken
    from the index page, and one with the ``Title`` taken from the linked
    document.
    """

    name = '8kform'
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        # FEEDS replaces the deprecated FEED_FORMAT / FEED_URI pair.
        # 'overwrite' keeps the JSON file valid when the crawl is re-run;
        # without it a second JSON array is appended to the existing file.
        'FEEDS': {'8kform.json': {'format': 'json', 'overwrite': True}},
    }

    def start_requests(self):
        """Issue the single request for the filing-index page."""
        url = 'https://www.sec.gov/Archives/edgar/data/1047335/000143774920023548/0001437749-20-023548-index.htm'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield the filer's company info, then request the linked 8-K document."""
        yield {
            'Company': response.xpath('//*[@id="filerDiv"]/div[3]/span/text()[1]').extract_first(),
            'CIK': response.xpath('//*[@id="filerDiv"]/div[3]/span/a/text()').extract_first()
        }

        # extract_first() gives a single href string (or None). The previous
        # extract() returned a list, which urljoin() cannot combine with a str
        # base URL, so the follow-up request was never built correctly.
        row_link = 'tr:nth-child(2) > td:nth-child(3) > a::attr(href)'
        next_page = response.css('#formDiv > div > table > tbody > ' + row_link).extract_first()
        if next_page is None:
            # Browsers insert <tbody> into tables when displaying a page; the
            # raw HTML Scrapy receives may not contain it, so retry without it.
            next_page = response.css('#formDiv > div > table > ' + row_link).extract_first()

        if not next_page:
            self.logger.warning('No link to the 8-K document found on %s', response.url)
            return

        # Resolve the href against the page it was found on.
        yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse2)

    def parse2(self, response):
        """Yield the title of an item in the 8-K publication."""
        # NOTE(review): if the followed link is an `/ix?doc=` viewer URL, the
        # page content is presumably rendered by JavaScript, which Scrapy does
        # not execute -- confirm this XPath against the raw response.
        title = response.xpath('//*[@id="fact-identifier-4"]/b/text()').extract_first()
        if title is None:
            self.logger.warning('Title XPath matched nothing on %s', response.url)
        yield {
            'Title': title
        }
            
    

In [4]:
# Run the spider from inside the notebook, sending a browser-like User-Agent.
process = CrawlerProcess(
    settings=dict(USER_AGENT='Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
)
process.crawl(Spider_8k)
process.start()

2020-11-13 11:50:34 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: scrapybot)
2020-11-13 11:50:34 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.6 (default, Jan  8 2020, 13:42:34) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Darwin-17.0.0-x86_64-i386-64bit
2020-11-13 11:50:34 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-11-13 11:50:34 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
  exporter = cls(crawler)



## Trying only with the text

Here I'll request the actual 8-K publication directly and scrape only the title of one item, to find out whether the problem lies in scraping the title or in following the link.

In [1]:
import scrapy
from scrapy import Selector
from scrapy.crawler import CrawlerProcess
import requests
import pandas as pd
import logging
import json
from urllib.parse import urljoin, urlparse
print("Libraries imported")

Libraries imported


In [2]:
# Item pipeline that writes every scraped item to a JSON Lines file (one JSON object per line)
class JsonWriterPipeline(object):
    """Pipeline that dumps each scraped item as one JSON line into ``8kform.jl``."""

    def open_spider(self, spider):
        """Create (or truncate) the output file and keep the handle on the pipeline."""
        self.file = open('8kform.jl', 'w')

    def close_spider(self, spider):
        """Release the output file handle."""
        self.file.close()

    def process_item(self, item, spider):
        """Serialize ``item`` to JSON, write it with a trailing newline, return ``item``."""
        serialized = json.dumps(dict(item))
        self.file.write(serialized + "\n")
        return item

In [3]:
class Spider_8k(scrapy.Spider):
    """Request the 8-K publication URL directly and scrape one item title from it."""

    name = '8kform'
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        # FEEDS replaces the deprecated FEED_FORMAT / FEED_URI pair.
        # 'overwrite' keeps the JSON file valid when the crawl is re-run;
        # without it a second JSON array is appended to the existing file.
        'FEEDS': {'8kform.json': {'format': 'json', 'overwrite': True}},
    }

    def start_requests(self):
        """Issue the single request for the 8-K publication."""
        # NOTE(review): this is an `/ix?doc=` viewer URL; its content is
        # presumably rendered by JavaScript, which Scrapy does not execute --
        # confirm whether the raw `/Archives/...` document should be requested.
        url = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/1047335/000143774920023548/nhc20201111_8k.htm'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield the text of the selected item heading (``None`` when not found)."""
        item_title = response.xpath('//*[@id="dynamic-xbrl-form"]/div[47]/b/text()').extract_first()
        if item_title is None:
            # Make an empty result visible instead of silently exporting null.
            self.logger.warning('Item XPath matched nothing on %s', response.url)
        yield {
            'Item': item_title
        }

In [4]:
# Run the spider from inside the notebook, sending a browser-like User-Agent.
process = CrawlerProcess(
    settings=dict(USER_AGENT='Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
)
process.crawl(Spider_8k)
process.start()

2020-11-13 11:56:26 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: scrapybot)
2020-11-13 11:56:27 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.6 (default, Jan  8 2020, 13:42:34) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Darwin-17.0.0-x86_64-i386-64bit
2020-11-13 11:56:27 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-11-13 11:56:27 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
  exporter = cls(crawler)

