In [None]:
!pip install spacy

!python -m spacy download en

In [None]:
import spacy
from bs4 import BeautifulSoup
import urllib.request
import feedparser
from datetime import datetime
import unicodedata
%matplotlib inline

In [None]:
# Load the English spaCy pipeline once; go() hands this object to EntityParser.
# The full package name is tried first and the legacy 'en' shortcut link is used as a
# fallback, so the cell works whether or not the shortcut link exists.
# NOTE(review): assumes the 'en' shortcut resolves to en_core_web_sm -- confirm.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:  # spaCy reports a model it cannot find as OSError
    nlp = spacy.load('en')

In [None]:
def read_sec_feed(feed_url='https://www.sec.gov/rss/litigation/litreleases.xml'):
    """ Parses an RSS feed with feedparser and wraps every entry in a LitigationRelease.

    feed_url: location handed to feedparser.parse.  Defaults to the SEC litigation
              releases feed, so existing calls without arguments behave as before.

    Returns a list with one LitigationRelease per entry, in feed order.  Only the fields
    present in the feed entry are populated; the release pages themselves are not fetched.
    """
    feed = feedparser.parse(feed_url)

    return [LitigationRelease(entry) for entry in feed.entries]
       
    

In [None]:
class LitigationRelease(object):
    """ One SEC litigation release, built from a single RSS feed entry.

    Construction only copies fields out of the feed entry.  Call read_url() to download
    the release page and extract its paragraphs, then parse_entities() to run an entity
    parser over those paragraphs.
    """

    # Releases whose feed link is known to be wrong, keyed by release id -> correct URL.
    _URL_OVERRIDES = {
        'LR-23731': 'https://www.sec.gov/litigation/litreleases/2017/lr23731.htm',
    }

    def __init__(self, entry_dict):
        """ entry_dict must provide the keys 'id', 'title', 'link' and 'published'. """
        # Everything in this top section is available in the entry_dict argument meaning
        # it could be taken from the RSS feed directly
        self.id = entry_dict['id']
        self.title = entry_dict['title']
        self.url = self._validate_url(entry_dict['link'])  # relies on self.id being set above
        # Drop the last 4 characters: a space followed by the 3 character time zone.
        self.publish_date = self._parse_publish_date(entry_dict['published'][:-4])

        # Everything below here requires calling read_url to load from the actual litigation release
        # as opposed to being available in the original dictionary
        self.html = None
        self.paragraphs = None
        self.entities = None
        self.entity_paragraph_slice = slice(None, None) # default to using all paragraphs

    def _parse_publish_date(self, date_string):
        """ Given a date string pulled from the SEC RSS feed, parses it into a datetime object.  In most cases
        the date string will be in the format "DayOfWeek, Day Month Year Hour:Minute:Second".  However, in some
        cases the seconds are omitted, so a second format without seconds is tried when the first one fails.

        Raises ValueError when neither format matches.
        """
        try:
            return datetime.strptime(date_string, '%a, %d %b %Y %H:%M:%S')
        except ValueError:
            # Only a format mismatch triggers the fallback; anything else propagates.
            return datetime.strptime(date_string, '%a, %d %b %Y %H:%M')

    def _validate_url(self, url):
        """ Returns the URL to use for this release: the known-good replacement when self.id is
        listed in _URL_OVERRIDES (the feed link for those releases is bad), otherwise url unchanged.
        """
        return self._URL_OVERRIDES.get(self.id, url)

    def read_url(self, max_tries=3):
        """ Downloads and parses the release page, attempting it up to max_tries times.

        Any exception raised by an attempt is printed and the next attempt is started.
        Returns True as soon as one attempt succeeds and False when all of them failed,
        in which case self.paragraphs may still be None.
        """
        for attempt in range(1, max_tries + 1):
            try:
                self._try_read_url()
                return True
            except Exception as inst:
                print('Error on ID {} @ {}.\n{}\n Retry # {}'.format(self.id, self.url, inst, attempt))

        return False

    def _try_read_url(self):
        """ Opens the URL at self.url which is treated as an SEC Litigation Release.  The body is parsed
        to locate all paragraphs containing potential entities.  The format changed at the start of 2018
        so 2 different approaches are used.  For 2018 releases, the element with id 'main-content' is
        located and its last paragraph (SEC internal attribution) is excluded from entity parsing.  For
        2017 releases there is no such id, so every paragraph on the page is taken and the trailing
        4 paragraphs are excluded.

        NOTE(review): the earlier version of this docstring spoke of ignoring the last 3 paragraphs
        for 2017 releases while the code has always sliced off 4 -- confirm which is intended.

        Sets self.html, self.paragraphs and self.entity_paragraph_slice.
        """
        with urllib.request.urlopen(self.url) as response:
            self.html = response.read()

        # Parsing does not need the connection, so it happens after the response is closed.
        soup = BeautifulSoup(self.html, 'html.parser')

        body = soup.find(id='main-content')

        if body is not None:
            self.paragraphs = [unicodedata.normalize('NFKD', p.text) for p in body.find_all('p')]
            self.entity_paragraph_slice = slice(None, -1)
        else:
            self.paragraphs = [unicodedata.normalize('NFKD', p.text) for p in soup.find_all('p')]
            self.entity_paragraph_slice = slice(None, -4)

    def parse_entities(self, parser):
        """ Assuming read_url() has already been called, parse_entities will iterate through the paragraphs
        selected by self.entity_paragraph_slice, calling parser on each one.  Parser must be able to convert
        each paragraph (a string) to a collection of tuples in the form (ENTITY, LABEL).  The tuples are
        collected in self.entities, which is a set to prevent duplication.  If read_url() has not been
        called (based on self.paragraphs being None), it returns without error and without altering
        self.entities.
        """
        if self.paragraphs is None:
            return

        self.entities = set()

        for paragraph in self.paragraphs[self.entity_paragraph_slice]:
            self.entities.update(parser(paragraph))

    def __str__(self):
        return ' - '.join([self.id, self.title])
        

In [None]:
class EntityParser(object):
    """ Callable that turns a piece of text into a list of (entity text, label) tuples,
    keeping only the entities whose label is one of the included types.
    """

    def __init__(self, nlp, included_types = None):
        """ nlp: callable applied to the text; its result must expose `.ents`, whose items
                 carry `.text` and `.label_` attributes.
        included_types: labels to keep.  When None, 'ORG' and 'PERSON' are kept.
        """
        self.nlp = nlp
        self.included_types = ['ORG', 'PERSON'] if included_types is None else included_types

    def __call__(self, text):
        """ Runs self.nlp over text and returns the filtered (text, label) tuples in order. """
        pairs = []

        for span in self.nlp(text).ents:
            pairs.append((span.text, span.label_))

        return self._filter_entities(pairs)

    def _filter_entities(self, entities):
        """ Returns the tuples whose second element (the label) is in self.included_types. """
        kept = []

        for candidate in entities:
            if candidate[1] in self.included_types:
                kept.append(candidate)

        return kept
        

In [None]:
def process_release(release, entity_parser):
    """ Fully processes one release in place: first asks it to load its page via
    read_url(), then hands entity_parser to its parse_entities() method.

    Nothing is returned; the results are whatever those two methods store on release.
    """
    release.read_url()
    release.parse_entities(entity_parser)
    

def save_release_entities(releases, csv_filename):
    """ Writes one '|' delimited row per release to csv_filename, overwriting the file.

    Each row holds the release id, its publish date as a string, its title, and then one
    'name (label)' column per entity.  Entity columns are sorted so repeated runs give
    identical files.  A release whose entities attribute is None (its page was never
    read or parsed) is written with no entity columns instead of raising.

    Every field is reduced to ASCII by dropping characters that cannot be encoded.
    """
    import csv

    def build_row(release):
        # entities is None until parse_entities has run on a successfully read release.
        entities = release.entities or ()

        fields = [release.id, str(release.publish_date), release.title]
        fields.extend('{} ({})'.format(name, label) for (name, label) in sorted(entities))

        return [field.encode('ascii', 'ignore').decode('ascii') for field in fields]

    # newline='' lets the csv module control line endings itself; without it an extra
    # blank line appears after every row on platforms that translate '\n'.
    with open(csv_filename, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter = '|')

        for release in releases:
            writer.writerow(build_row(release))

In [None]:
def go():
    """ Runs the whole pipeline: builds an EntityParser around the module level nlp object,
    reads the SEC feed, processes every release with that parser, and finally writes all
    releases to 'sec-feed-entities.csv'.
    """
    entity_parser = EntityParser(nlp)
    feed_releases = read_sec_feed()

    for feed_release in feed_releases:
        process_release(feed_release, entity_parser)

    save_release_entities(feed_releases, 'sec-feed-entities.csv')
    
    
    