# Breaking down Seppe's code

## Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString
from htmllaundry import sanitize
from htmllaundry.cleaners import LaundryCleaner
import htmllaundry.utils
import xmltodict
import re
import json
from pprint import pprint
import pandas as pd
from glob import glob
from cachecontrol import CacheControl
from IPython.display import HTML
import unicodedata

In [2]:
# One shared HTTP session for the whole notebook; `cach` is that same session
# wrapped by CacheControl and is what the scraping helpers below call .get() on.
# NOTE(review): no User-Agent header is set here -- confirm sec.gov accepts that.
sess = requests.Session()
cach = CacheControl(sess)

## Cleaning SEC encoding

In [3]:
# Sanitizer configuration handed to htmllaundry's sanitize() inside clean_html().
# NOTE(review): the keyword names look like the options of lxml's HTML Cleaner,
# which LaundryCleaner presumably forwards -- confirm against htmllaundry.
CustomCleaner = LaundryCleaner(
            page_structure=False,
            remove_unknown_tags=False,
            # Tags the cleaner is asked to allow (inline emphasis, headings,
            # lists, definition lists and table markup).
            # NOTE(review): 'dft' is not a standard HTML tag (maybe 'dfn'?) -- confirm.
            allow_tags=['blockquote', 'a', 'i', 'em', 'p', 'b', 'strong',
                        'h1', 'h2', 'h3', 'h4', 'h5', 
                        'ul', 'ol', 'li', 
                        'sub', 'sup',
                        'abbr', 'acronym', 'dl', 'dt', 'dd', 'cite',
                        'dft', 'br', 
                        'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot'],
            safe_attrs_only=True,
            add_nofollow=True,
            scripts=True,
            javascript=True,
            comments=True,
            style=True,
            links=False,
            meta=True,
            processing_instructions=False,
            frames=True,
            annoying_tags=False)

In [60]:
## SEC documents are encoded in CP1252, and it is recommended to always use UTF-8.
## see: https://www.w3.org/International/questions/qa-what-is-encoding
###### https://www.w3.org/International/articles/definitions-characters/#unicode
###### https://www.w3.org/International/questions/qa-choosing-encodings

def reformat_cp1252(match):
    """Turn a numeric character reference in the 128-159 range into a raw byte.

    Meant to be used as the replacement callback of ``re.sub`` with a
    ``bytes`` pattern whose first group captures the decimal digits of a
    ``&#NNN;`` reference.

    References 128-159 are returned as the single byte with that value, so
    that a later ``windows-1252`` decode turns them into the intended
    character (e.g. ``&#150;`` -> byte 0x96 -> en dash). Any other reference
    is returned exactly as it was matched.

    Five values in that range (129, 141, 143, 144, 157) have no character in
    Python's cp1252 codec; emitting them as raw bytes would make the later
    decode raise ``UnicodeDecodeError``, so those references are also left
    untouched.

    :param match: match object produced by a bytes pattern; group 1 holds
        the decimal digits.
    :return: ``bytes`` -- one raw byte, or the original matched text.
    """
    # Byte values that Python's cp1252 codec cannot decode.
    undefined_in_cp1252 = (0x81, 0x8D, 0x8F, 0x90, 0x9D)
    code_point = int(match.group(1))
    if 128 <= code_point <= 159 and code_point not in undefined_in_cp1252:
        return bytes([code_point])
    return match.group()

def clean_sec_content(binary):
    """Decode a raw SEC document (bytes) into a Python string.

    Numeric character references are first passed through
    ``reformat_cp1252`` (which rewrites some of them as raw bytes), then the
    whole payload is decoded as windows-1252.

    :param binary: the document body as ``bytes``.
    :return: the decoded text as ``str``.
    :raises UnicodeDecodeError: if the payload contains a byte that the
        windows-1252 codec cannot decode.
    """
    # Raw bytes literal so that ``\d`` is a regex escape, not a string escape.
    # The pattern has no letters, so a case-insensitive flag is unnecessary.
    unescaped = re.sub(rb'&#(\d+);', reformat_cp1252, binary)
    # The decoded result is already a str; re-encoding to UTF-8 and decoding
    # again would return the same string.
    return unescaped.decode('windows-1252')

In [59]:
## this is to normalize urls, making them more human friendly
def slugify(value):
    """Turn *value* into a lowercase, dash-separated, ASCII-only slug.

    Steps:
    1. NFKD-normalise and drop every character that has no ASCII form.
    2. Replace each character that is not a word character, whitespace,
       ``.`` or ``-`` with ``-``; strip surrounding whitespace; lowercase.
    3. Collapse each run of dashes and/or whitespace into a single ``-``.

    A leading ``/`` therefore becomes a leading ``-`` (it is not stripped).

    :param value: text to convert (``str``).
    :return: the slug as ``str``.
    """
    ascii_only = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    # Raw strings keep ``\w``/``\s`` as regex escapes rather than (invalid)
    # string escapes.
    dashed = re.sub(r'[^\w\s\.\-]', '-', ascii_only).strip().lower()
    return re.sub(r'[-\s]+', '-', dashed)

## Cleaning html

Maybe I don't have to use exactly these functions.

In [6]:
def read_html(file):
    """Return the whole content of the text file at path *file* as a string."""
    with open(file, 'r') as source:
        contents = source.read()
    return contents

In [7]:
def clean_html(html):
    """Normalise and sanitise a filing's HTML and return it as a soup.

    Steps, in order:

    1. If the document contains no ``<p>`` at all, every ``<div>`` is renamed
       to ``<p>``.
    2. ``<b>`` tags, and ``<font>`` tags whose inline style sets
       ``font-weight: bold``, are renamed to ``<strong>``.
    3. Elements with class ``header`` or ``footer`` are removed.
    4. The markup is run through htmllaundry's ``sanitize`` with
       ``CustomCleaner`` and parsed again.
    5. Outside of tables, these are removed: the ``<a>`` around a text node
       reading "table of contents"; the ``<p>`` around a text node that is
       only ``S-<digits or roman numerals>``; the ``<p>`` around a text node
       that is only digits.

    :param html: HTML markup as a string.
    :return: a ``BeautifulSoup`` object holding the cleaned document.
    """
    # The parser is named explicitly so the result does not depend on which
    # parsers happen to be installed (and bs4 does not warn about guessing).
    # The <html><body> wrappers in this notebook's outputs suggest lxml is
    # what bs4 was already picking.
    soup = BeautifulSoup(html, 'lxml')
    if not soup.find('p'):
        for div in soup.find_all('div'):
            div.name = 'p'
    for b in soup.find_all('b'):
        b.name = 'strong'
    for f in soup.find_all('font', style=re.compile(r'font-weight:\s*bold')):
        f.name = 'strong'
    for footer in soup.find_all(class_=['header', 'footer']):
        # Best effort: an element may already be gone because an enclosing
        # header/footer was decomposed earlier in this loop.
        try:
            footer.decompose()
        except Exception:
            pass
    san = sanitize(str(soup), CustomCleaner)
    soup = BeautifulSoup(san, 'lxml')

    def decompose_parent(el, parent='p', not_grandparent='table'):
        """Remove el's nearest *parent* tag unless it sits inside *not_grandparent*."""
        try:
            target = el.find_parent(parent)
        except Exception:
            # el may belong to a subtree that was already decomposed.
            target = None
        if not target:
            return
        if target.find_parent(not_grandparent):
            return
        target.decompose()

    for el in soup.find_all(text=lambda x: 'table of contents' == str(x).lower().strip()):
        decompose_parent(el, 'a')
    for el in soup.find_all(text=re.compile(r'^\s*S\-(\d+|[ivxlcdm]+)\s*$')):
        decompose_parent(el, 'p')
    for el in soup.find_all(text=re.compile(r'^\s*\d+\s*$')):
        decompose_parent(el, 'p')
    return soup

## Defining helper functions

In [8]:
def pagination_provider_by_element_start_count(find_args, find_kwargs):
    """Build a pagination callback driven by the presence of a page element.

    The returned callable takes ``(soup, params)``. It looks for an element
    with ``soup.find(*find_args, **find_kwargs)``:

    * not found -> returns ``None`` (no further page);
    * found     -> adds ``params['count']`` to ``params['start']`` in place
      and returns that same ``params`` dict.
    """
    def next_page_params(soup, params):
        marker = soup.find(*find_args, **find_kwargs)
        if marker is not None:
            params['start'] += params['count']
            return params
        return None

    return next_page_params

In [9]:
def params_provider_by_dict(params):
    """Wrap *params* in a zero-argument callable that returns that same object."""
    def provide():
        return params

    return provide

In [10]:
## Build a provider that finds the <table> with a given `summary` attribute
## and parses it into a DataFrame.
def table_provider_by_summary(summary, header=0, index_col=0):
    """Return a callable ``soup -> DataFrame`` for the table with *summary*.

    The callable locates ``<table summary=...>`` in the soup, appends a
    newline after every line-break tag so that text on either side of a
    break stays separated, and hands the markup to ``pandas.read_html``.

    :param summary: value of the table's ``summary`` attribute.
    :param header: forwarded to ``pandas.read_html``.
    :param index_col: forwarded to ``pandas.read_html``.
    :return: a one-argument callable that returns the first parsed table.
    :raises ValueError: (from the returned callable) when the soup has no
        table with that summary.
    """
    from io import StringIO  # local import: `io` is not in the notebook's import cell

    def provider(soup):
        table = soup.find('table', summary=summary)
        if table is None:
            # Previously str(None) == 'None' was passed on to read_html.
            raise ValueError('No table with summary={!r} found'.format(summary))
        markup = str(table)
        # Cover the self-closing spelling too, as get_current_events does
        # for '<hr>' / '<hr/>'.
        for br in ('<br>', '<br/>'):
            markup = markup.replace(br, br + '\n')
        # read_html takes a file-like object; wrapping the markup avoids
        # passing literal HTML as a string.
        return pd.read_html(StringIO(markup), header=header, index_col=index_col)[0]

    return provider

#### Breaking down table_provider_by_summary

In [39]:
url = base_url.format('/Archives/edgar/data/1035443/0001047469-19-001263-index.html')
url

'https://www.sec.gov/Archives/edgar/data/1035443/0001047469-19-001263-index.html'

In [44]:
soup = BeautifulSoup(cach.get(url).text)
soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>EDGAR Filing Documents for 0001047469-19-001263</title>
<link href="/include/interactive.css" rel="stylesheet" type="text/css"/>
</head>
<body style="margin: 0">
<!-- SEC Web Analytics - For information please visit: https://www.sec.gov/privacy.htm#collectedinfo -->
<noscript><iframe height="0" src="//www.googletagmanager.com/ns.html?id=GTM-TD3BKV" style="display:none;visibility:hidden" width="0"></iframe></noscript>
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-TD3BKV');</scrip

In [51]:
var= soup.find('table')
var

<table class="tableFile" summary="Document Format Files">
<tr>
<th scope="col" style="width: 5%;"><acronym title="Sequence Number">Seq</acronym></th>
<th scope="col" style="width: 40%;">Description</th>
<th scope="col" style="width: 20%;">Document</th>
<th scope="col" style="width: 10%;">Type</th>
<th scope="col">Size</th>
</tr>
<tr>
<td scope="row">1</td>
<td scope="row">424B5</td>
<td scope="row"><a href="/Archives/edgar/data/1035443/000104746919001263/a2238051z424b5.htm">a2238051z424b5.htm</a></td>
<td scope="row">424B5</td>
<td scope="row">644742</td>
</tr>
<tr class="blueRow">
<td scope="row">2</td>
<td scope="row">G136386.JPG</td>
<td scope="row"><a href="/Archives/edgar/data/1035443/000104746919001263/g136386.jpg">g136386.jpg</a></td>
<td scope="row">GRAPHIC</td>
<td scope="row">11919</td>
</tr>
<tr>
<td scope="row">3</td>
<td scope="row">G71556.JPG</td>
<td scope="row"><a href="/Archives/edgar/data/1035443/000104746919001263/g71556.jpg">g71556.jpg</a></td>
<td scope="row">GRAPH

In [53]:
var = str(soup.find('table'))
var

'<table class="tableFile" summary="Document Format Files">\n<tr>\n<th scope="col" style="width: 5%;"><acronym title="Sequence Number">Seq</acronym></th>\n<th scope="col" style="width: 40%;">Description</th>\n<th scope="col" style="width: 20%;">Document</th>\n<th scope="col" style="width: 10%;">Type</th>\n<th scope="col">Size</th>\n</tr>\n<tr>\n<td scope="row">1</td>\n<td scope="row">424B5</td>\n<td scope="row"><a href="/Archives/edgar/data/1035443/000104746919001263/a2238051z424b5.htm">a2238051z424b5.htm</a></td>\n<td scope="row">424B5</td>\n<td scope="row">644742</td>\n</tr>\n<tr class="blueRow">\n<td scope="row">2</td>\n<td scope="row">G136386.JPG</td>\n<td scope="row"><a href="/Archives/edgar/data/1035443/000104746919001263/g136386.jpg">g136386.jpg</a></td>\n<td scope="row">GRAPHIC</td>\n<td scope="row">11919</td>\n</tr>\n<tr>\n<td scope="row">3</td>\n<td scope="row">G71556.JPG</td>\n<td scope="row"><a href="/Archives/edgar/data/1035443/000104746919001263/g71556.jpg">g71556.jpg</a><

In [54]:
## <br> tags denote line breaks; here a newline is added after each one (the tags themselves are kept)
var = str(soup.find('table')).replace('<br>', '<br>\n')
var

'<table class="tableFile" summary="Document Format Files">\n<tr>\n<th scope="col" style="width: 5%;"><acronym title="Sequence Number">Seq</acronym></th>\n<th scope="col" style="width: 40%;">Description</th>\n<th scope="col" style="width: 20%;">Document</th>\n<th scope="col" style="width: 10%;">Type</th>\n<th scope="col">Size</th>\n</tr>\n<tr>\n<td scope="row">1</td>\n<td scope="row">424B5</td>\n<td scope="row"><a href="/Archives/edgar/data/1035443/000104746919001263/a2238051z424b5.htm">a2238051z424b5.htm</a></td>\n<td scope="row">424B5</td>\n<td scope="row">644742</td>\n</tr>\n<tr class="blueRow">\n<td scope="row">2</td>\n<td scope="row">G136386.JPG</td>\n<td scope="row"><a href="/Archives/edgar/data/1035443/000104746919001263/g136386.jpg">g136386.jpg</a></td>\n<td scope="row">GRAPHIC</td>\n<td scope="row">11919</td>\n</tr>\n<tr>\n<td scope="row">3</td>\n<td scope="row">G71556.JPG</td>\n<td scope="row"><a href="/Archives/edgar/data/1035443/000104746919001263/g71556.jpg">g71556.jpg</a><

In [55]:
var = pd.read_html(str(soup.find('table')).replace('<br>', '<br>\n'), header=0, index_col =0)
var

[                       Description                  Document     Type    Size
 Seq                                                                          
 1.0                          424B5        a2238051z424b5.htm    424B5  644742
 2.0                    G136386.JPG               g136386.jpg  GRAPHIC   11919
 3.0                     G71556.JPG                g71556.jpg  GRAPHIC   17325
 4.0                    G812374.JPG               g812374.jpg  GRAPHIC   75115
 NaN  Complete submission text file  0001047469-19-001263.txt      NaN  790585]

In [56]:
var = pd.read_html(str(soup.find('table')).replace('<br>', '<br>\n'), header=0, index_col =0)[0]
var

Unnamed: 0_level_0,Description,Document,Type,Size
Seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,424B5,a2238051z424b5.htm,424B5,644742
2.0,G136386.JPG,g136386.jpg,GRAPHIC,11919
3.0,G71556.JPG,g71556.jpg,GRAPHIC,17325
4.0,G812374.JPG,g812374.jpg,GRAPHIC,75115
,Complete submission text file,0001047469-19-001263.txt,,790585


In [58]:
print('-'*80)

--------------------------------------------------------------------------------


In [11]:
def get_sec_table(url,
                  table_provider=None,
                  base_params=None,
                  params_provider=None,
                  pagination_provider=None,
                  replace_links=True,
                  session=None):
    """Fetch an SEC page (or a series of pages) and return a table from it.

    :param url: absolute URL, or a path that is inserted into the module-level
        ``base_url`` template when it does not start with ``http://`` or
        ``https://``.
    :param table_provider: callable ``soup -> DataFrame`` that extracts the
        table from the parsed page. Required despite the ``None`` default.
    :param base_params: optional mapping of query parameters; it is copied,
        never modified.
    :param params_provider: extra query parameters, either a dict or a
        zero-argument callable returning one; merged over ``base_params``.
    :param pagination_provider: optional callable ``(soup, page_params)``
        returning the parameters of the next page, or a falsy value when
        there is no next page. It decides how to move from one result page
        to the next. Without it a single page is fetched.
    :param replace_links: when true, the content of every ``<td>`` that
        contains an ``<a href=...>`` is replaced by that href before the
        table is extracted, so the DataFrame holds link targets rather than
        link text.
    :param session: object with a requests-style ``get(url, params=...)``;
        defaults to the module-level ``cach`` session.
    :return: the DataFrame of the single page, or the row-wise concatenation
        of all pages (fresh 0..n-1 index) when paginating.
    :raises TypeError: if ``table_provider`` is not supplied.
    """
    if table_provider is None:
        # Fail before any request is made instead of with
        # "'NoneType' object is not callable" afterwards.
        raise TypeError('get_sec_table() needs a table_provider callable')

    def return_data_frame(session, url, params, provider):
        """Download one page; return (extracted DataFrame, parsed soup)."""
        request = session.get(url, params=params)
        # Explicit parser: the result should not depend on parser auto-detection.
        soup = BeautifulSoup(request.text, 'lxml')
        if replace_links:
            for a in soup.find_all('a'):
                cell = a.find_parent('td')
                href = a.get('href')  # anchors without href are left alone
                if cell is not None and href is not None:
                    cell.string = href
        return provider(soup), soup

    # Fall back to the shared session and to the base URL template.
    if session is None:
        session = cach
    if not url.startswith(('http://', 'https://')):
        url = base_url.format(url)

    # Query parameters: a copy of base_params, overridden by params_provider
    # (used directly when it is a dict, called when it is a callable).
    params = dict(base_params or {})
    if params_provider:
        if isinstance(params_provider, dict):
            params.update(params_provider)
        else:
            params.update(params_provider())

    if not pagination_provider:
        df, _ = return_data_frame(session, url, params, table_provider)
        return df

    # Paginated: keep fetching until the pagination provider reports that
    # there is no further page.
    data_frames = []
    page_params = dict(params)
    while True:
        df, soup = return_data_frame(session, url, page_params, table_provider)
        data_frames.append(df)
        # Make sure columns retain their names
        data_frames[-1].columns = data_frames[0].columns
        new_params = pagination_provider(soup, page_params)
        if not new_params:
            break
        page_params.update(new_params)
    return pd.concat(data_frames, sort=False, ignore_index=True)

### Breaking down get_sec_table

#### def_return_dataframe

In [34]:
for a in soup.find_all('a'):
    print(a)

<a href="/index.htm">Home</a>
<a href="/cgi-bin/browse-edgar?action=getcurrent">Latest Filings</a>
<a href="javascript:history.back()">Previous Page</a>
<a href="/index.htm"><img alt="SEC Seal" border="0" src="/images/sealTop.gif"/></a>
<a href="/edgar/searchedgar/webusers.htm">Search the Next-Generation EDGAR System</a>
<a href="/index.htm">SEC Home</a>
<a href="/edgar/searchedgar/webusers.htm">Search the Next-Generation EDGAR System</a>
<a href="/edgar/searchedgar/companysearch.html">Company Search</a>
<a href="/cgi-bin/browse-edgar?CIK=0001035443&amp;action=getcompany">0001035443 (see all company filings)</a>
<a href="/cgi-bin/browse-edgar?filenum=333-222136&amp;action=getcompany"><strong>333-222136</strong></a>
<a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=6798&amp;owner=include">6798</a>
<a href="/cgi-bin/browse-edgar?CIK=0001542693&amp;action=getcompany">0001542693 (see all company filings)</a>
<a href="/cgi-bin/browse-edgar?filenum=333-222136-01&amp;action=getcompany"

In [35]:
for a in soup.find_all('a'):
    parent = a.find_parent('td')
    print(parent)

None
None
None
None
None
None
None
None
None
None
None
None
None
None


#### params_provider

In [38]:
base_params = {}
params = dict(base_params)
params_provider = {'company': '', 'owner': 'exclude', 'action': 'getcompany'}

In [None]:
# isinstance() is the built-in type check; `is instance(...)` refers to an
# undefined name and raises NameError.
if isinstance(params_provider, dict):
    params.update(params_provider)

## Function to get the documents

This is a function to get the documents in the filing details page for each filing. See below for an example page.


In [14]:
url = base_url.format('/Archives/edgar/data/1035443/0001047469-19-001263-index.html')
url

'https://www.sec.gov/Archives/edgar/data/1035443/0001047469-19-001263-index.html'

In [15]:
def get_filing_documents(url, summary='Document Format Files'):
    """Return the documents table of a filing-details page as a DataFrame.

    Delegates to ``get_sec_table`` with a table provider for the table whose
    ``summary`` attribute equals *summary* (no index column) and a pagination
    provider that looks for an ``<input value="Next 100">`` element.
    """
    table_provider = table_provider_by_summary(summary, index_col=None)
    pagination_provider = pagination_provider_by_element_start_count(
        ('input',), {'value': 'Next 100'})
    return get_sec_table(url,
                         table_provider=table_provider,
                         pagination_provider=pagination_provider)

## Scraping most recent filings

For the previous 5 days

In [12]:
# URL template for sec.gov: the '{}' placeholder receives a site-relative
# path such as '/cgi-bin/current'.
base_url = 'https://www.sec.gov{}'

In [16]:
def get_current_events(days_before=0, form_type=''):
    """Scrape EDGAR's "current events" listing into a DataFrame.

    The page keeps its listing inside a ``<pre>`` block, one filing per line.
    Each line is parsed on its own: its text pieces are split on double
    spaces into fields, and the ``href`` of every link on the line is
    appended after them.

    :param days_before: sent as query parameter ``q1``.
    :param form_type: sent as query parameter ``q3``.
    :return: DataFrame whose columns are the fields of the first (header)
        line followed by ``link_0``, ``link_1``, ... for the appended hrefs.
        An empty DataFrame is returned when the page has no ``<pre>`` block
        or no parsable line.
    """
    response = cach.get(base_url.format('/cgi-bin/current'),
                        params={'q1': days_before, 'q2': 0, 'q3': form_type})
    # Explicit parser: the result should not depend on parser auto-detection.
    pre = BeautifulSoup(response.text, 'lxml').find('pre')
    if pre is None:
        # Without this guard str(None) == 'None' was parsed as if it were a
        # header line, giving a frame with a single column called 'None'.
        return pd.DataFrame()

    rows = []
    for line in str(pre).replace('<hr>', '\n').replace('<hr/>', '\n').split('\n'):
        bs_line = BeautifulSoup(line, 'lxml')
        # Join text nodes with a double space, then split on double spaces,
        # so that single spaces inside a field (e.g. company names) survive.
        clean_line = '  '.join(item.strip() for item in bs_line.find_all(text=True))
        fields = [x.strip() for x in clean_line.split('  ') if x.strip()]
        fields += [a.get('href') for a in bs_line.find_all('a')]
        # Skips blank lines (no fields) and lines holding only href-less links.
        if any(x is not None for x in fields):
            rows.append(fields)
    if not rows:
        return pd.DataFrame()

    header = rows[0]
    extra_columns = max(len(row) for row in rows) - len(header)
    colnames = header + ['link_{}'.format(i) for i in range(extra_columns)]
    return pd.DataFrame(rows[1:], columns=colnames)

In [17]:
get_current_events(form_type='8-K').head()

Unnamed: 0,Date Filed,Form,CIK Code,Company Name,link_0,link_1
0,11-20-2020,8-K,1800,ABBOTT LABORATORIES,/Archives/edgar/data/1800/0001104659-20-127945...,browse-edgar?action=getcompany&CIK=1800
1,11-20-2020,8-K,1820191,AEA-Bridges Impact Corp.,/Archives/edgar/data/1820191/0001193125-20-299...,browse-edgar?action=getcompany&CIK=1820191
2,11-20-2020,8-K,868857,AECOM,/Archives/edgar/data/868857/0001104659-20-1279...,browse-edgar?action=getcompany&CIK=868857
3,11-20-2020,8-K,946644,AIM ImmunoTech Inc.,/Archives/edgar/data/946644/0001493152-20-0221...,browse-edgar?action=getcompany&CIK=946644
4,11-20-2020,8-K,926660,AIMCO PROPERTIES L.P.,/Archives/edgar/data/926660/0001193125-20-2987...,browse-edgar?action=getcompany&CIK=926660


#### Breaking down get_current_events

Questions: 
* What exactly is the goal of splitting every word in the list of rows? 


In [35]:
# this code goes to the current events page and scrapes the html of the list of documents
soup = BeautifulSoup(cach.get(base_url.format('/cgi-bin/current'), params = {'q1': 0, 'q2':0, 'q3': '8-K'}).text)
soup

<html>
<head>
<title>EDGAR Current Events</title>
<script language="JavaScript" src="/include/sec.js" type="text/javascript"></script>
</head>
<body bgcolor="#FFFFFF" link="#807331" vlink="#FF0000">
<!-- SEC Web Analytics - For information please visit: http://www.sec.gov/privacy.htm#collectedinfo -->
<noscript><iframe height="0" src="//www.googletagmanager.com/ns.html?id=GTM-TD3BKV" style="display:none;visibility:hidden" width="0"></iframe></noscript>
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-TD3BKV');</script>
<!-- End SEC Web Analytics -->
<br/>
<table width="100%"><tr><td width="5%"></td>
<td width="80%">
<h1>Results for EDGAR Current Events</h1>
<p>The total number of matches for 2020-11-20 is <

Since the code is super dirty and company names and dates are mixed in the same text, we have split them into new lines.

We are only interested in the table, so that's why we use soup.find('pre') --> look at the HTML code in the base_url. 

In [36]:
pre = soup.find('pre')
lines = str(pre).replace('<hr>', '\n').replace('<hr/>', '\n'). split('\n')
lines

['<pre><strong>Date Filed   Form        CIK Code     Company Name</strong>',
 '11-20-2020   <a href="/Archives/edgar/data/1800/0001104659-20-127945-index.html">8-K</a>              <a href="browse-edgar?action=getcompany&amp;CIK=1800">1800</a>   ABBOTT LABORATORIES',
 '11-20-2020   <a href="/Archives/edgar/data/1820191/0001193125-20-299260-index.html">8-K</a>           <a href="browse-edgar?action=getcompany&amp;CIK=1820191">1820191</a>   AEA-Bridges Impact Corp.',
 '11-20-2020   <a href="/Archives/edgar/data/868857/0001104659-20-127983-index.html">8-K</a>            <a href="browse-edgar?action=getcompany&amp;CIK=868857">868857</a>   AECOM',
 '11-20-2020   <a href="/Archives/edgar/data/946644/0001493152-20-022140-index.html">8-K</a>            <a href="browse-edgar?action=getcompany&amp;CIK=946644">946644</a>   AIM ImmunoTech Inc.',
 '11-20-2020   <a href="/Archives/edgar/data/926660/0001193125-20-298746-index.html">8-K</a>            <a href="browse-edgar?action=getcompany&amp;CIK=92

In [38]:
## here instead of having each row of the table between single HTML tags, we make each row separate
for line in lines:
    bs_line = BeautifulSoup(line)
    print(bs_line)

<html><body><pre><strong>Date Filed   Form        CIK Code     Company Name</strong></pre></body></html>
<html><body><p>11-20-2020   <a href="/Archives/edgar/data/1800/0001104659-20-127945-index.html">8-K</a> <a href="browse-edgar?action=getcompany&amp;CIK=1800">1800</a>   ABBOTT LABORATORIES</p></body></html>
<html><body><p>11-20-2020   <a href="/Archives/edgar/data/1820191/0001193125-20-299260-index.html">8-K</a> <a href="browse-edgar?action=getcompany&amp;CIK=1820191">1820191</a>   AEA-Bridges Impact Corp.</p></body></html>
<html><body><p>11-20-2020   <a href="/Archives/edgar/data/868857/0001104659-20-127983-index.html">8-K</a> <a href="browse-edgar?action=getcompany&amp;CIK=868857">868857</a>   AECOM</p></body></html>
<html><body><p>11-20-2020   <a href="/Archives/edgar/data/946644/0001493152-20-022140-index.html">8-K</a> <a href="browse-edgar?action=getcompany&amp;CIK=946644">946644</a>   AIM ImmunoTech Inc.</p></body></html>
<html><body><p>11-20-2020   <a href="/Archives/edgar/da

In [41]:
for line in lines:
    bs_line = BeautifulSoup(line)
    clean_line = '  '.join(item.strip() for item in bs_line.find_all(text=True))
    print(clean_line)

Date Filed   Form        CIK Code     Company Name
11-20-2020  8-K    1800  ABBOTT LABORATORIES
11-20-2020  8-K    1820191  AEA-Bridges Impact Corp.
11-20-2020  8-K    868857  AECOM
11-20-2020  8-K    946644  AIM ImmunoTech Inc.
11-20-2020  8-K    926660  AIMCO PROPERTIES L.P.
11-20-2020  8-K    1514991  AMC Networks Inc.
11-20-2020  8-K    4904  AMERICAN ELECTRIC POWER CO INC
11-20-2020  8-K    1039828  AMERICAN EQUITY INVESTMENT LIFE HOLDING CO
11-20-2020  8-K    1053507  AMERICAN TOWER CORP /MA/
11-20-2020  8-K/A    1365916  AMYRIS, INC.
11-20-2020  8-K    922864  APARTMENT INVESTMENT & MANAGEMENT CO
11-20-2020  8-K    7431  ARMSTRONG WORLD INDUSTRIES INC
11-20-2020  8-K    1621221  ARTELO BIOSCIENCES, INC.
11-20-2020  8-K    1232582  ASHFORD HOSPITALITY TRUST INC
11-20-2020  8-K    866787  AUTOZONE INC
11-20-2020  8-K    1681087  AVROBIO, Inc.
11-20-2020  8-K    1388320  Actinium Pharmaceuticals, Inc.
11-20-2020  8-K    1621227  Adaptimmune Therapeutics PLC
11-20-2020  8-K    12239

In [43]:
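## the split_line separates every letter because `.strip('  ')` is used below instead of `.split('  ')` (as in get_current_events): `.strip` returns a string, so the comprehension iterates over its characters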
for line in lines:
    bs_line = BeautifulSoup(line)
    clean_line = '  '.join(item.strip() for item in bs_line.find_all(text=True))
    split_line = [x.strip() for x in clean_line.strip('  ') if x.strip()]
    print(split_line)

['D', 'a', 't', 'e', 'F', 'i', 'l', 'e', 'd', 'F', 'o', 'r', 'm', 'C', 'I', 'K', 'C', 'o', 'd', 'e', 'C', 'o', 'm', 'p', 'a', 'n', 'y', 'N', 'a', 'm', 'e']
['1', '1', '-', '2', '0', '-', '2', '0', '2', '0', '8', '-', 'K', '1', '8', '0', '0', 'A', 'B', 'B', 'O', 'T', 'T', 'L', 'A', 'B', 'O', 'R', 'A', 'T', 'O', 'R', 'I', 'E', 'S']
['1', '1', '-', '2', '0', '-', '2', '0', '2', '0', '8', '-', 'K', '1', '8', '2', '0', '1', '9', '1', 'A', 'E', 'A', '-', 'B', 'r', 'i', 'd', 'g', 'e', 's', 'I', 'm', 'p', 'a', 'c', 't', 'C', 'o', 'r', 'p', '.']
['1', '1', '-', '2', '0', '-', '2', '0', '2', '0', '8', '-', 'K', '8', '6', '8', '8', '5', '7', 'A', 'E', 'C', 'O', 'M']
['1', '1', '-', '2', '0', '-', '2', '0', '2', '0', '8', '-', 'K', '9', '4', '6', '6', '4', '4', 'A', 'I', 'M', 'I', 'm', 'm', 'u', 'n', 'o', 'T', 'e', 'c', 'h', 'I', 'n', 'c', '.']
['1', '1', '-', '2', '0', '-', '2', '0', '2', '0', '8', '-', 'K', '9', '2', '6', '6', '6', '0', 'A', 'I', 'M', 'C', 'O', 'P', 'R', 'O', 'P', 'E', 'R', 'T',

In [44]:
## we then add the links for the forms into each list if they have a link in bs_line
for line in lines:
    bs_line = BeautifulSoup(line)
    clean_line = '  '.join(item.strip() for item in bs_line.find_all(text=True))
    split_line = [x.strip() for x in clean_line.strip('  ') if x.strip()]
    split_line += [a.get('href') for a in bs_line.find_all('a')]
    print(split_line)

['D', 'a', 't', 'e', 'F', 'i', 'l', 'e', 'd', 'F', 'o', 'r', 'm', 'C', 'I', 'K', 'C', 'o', 'd', 'e', 'C', 'o', 'm', 'p', 'a', 'n', 'y', 'N', 'a', 'm', 'e']
['1', '1', '-', '2', '0', '-', '2', '0', '2', '0', '8', '-', 'K', '1', '8', '0', '0', 'A', 'B', 'B', 'O', 'T', 'T', 'L', 'A', 'B', 'O', 'R', 'A', 'T', 'O', 'R', 'I', 'E', 'S', '/Archives/edgar/data/1800/0001104659-20-127945-index.html', 'browse-edgar?action=getcompany&CIK=1800']
['1', '1', '-', '2', '0', '-', '2', '0', '2', '0', '8', '-', 'K', '1', '8', '2', '0', '1', '9', '1', 'A', 'E', 'A', '-', 'B', 'r', 'i', 'd', 'g', 'e', 's', 'I', 'm', 'p', 'a', 'c', 't', 'C', 'o', 'r', 'p', '.', '/Archives/edgar/data/1820191/0001193125-20-299260-index.html', 'browse-edgar?action=getcompany&CIK=1820191']
['1', '1', '-', '2', '0', '-', '2', '0', '2', '0', '8', '-', 'K', '8', '6', '8', '8', '5', '7', 'A', 'E', 'C', 'O', 'M', '/Archives/edgar/data/868857/0001104659-20-127983-index.html', 'browse-edgar?action=getcompany&CIK=868857']
['1', '1', '-'

In [46]:
ls=[]
for line in lines:
    bs_line = BeautifulSoup(line)
    clean_line = '  '.join(item.strip() for item in bs_line.find_all(text=True))
    split_line = [x.strip() for x in clean_line.strip('  ') if x.strip()]
    split_line += [a.get('href') for a in bs_line.find_all('a')]
    if not all(x is None for x in split_line): ls.append(split_line)
ls

[['D',
  'a',
  't',
  'e',
  'F',
  'i',
  'l',
  'e',
  'd',
  'F',
  'o',
  'r',
  'm',
  'C',
  'I',
  'K',
  'C',
  'o',
  'd',
  'e',
  'C',
  'o',
  'm',
  'p',
  'a',
  'n',
  'y',
  'N',
  'a',
  'm',
  'e'],
 ['1',
  '1',
  '-',
  '2',
  '0',
  '-',
  '2',
  '0',
  '2',
  '0',
  '8',
  '-',
  'K',
  '1',
  '8',
  '0',
  '0',
  'A',
  'B',
  'B',
  'O',
  'T',
  'T',
  'L',
  'A',
  'B',
  'O',
  'R',
  'A',
  'T',
  'O',
  'R',
  'I',
  'E',
  'S',
  '/Archives/edgar/data/1800/0001104659-20-127945-index.html',
  'browse-edgar?action=getcompany&CIK=1800'],
 ['1',
  '1',
  '-',
  '2',
  '0',
  '-',
  '2',
  '0',
  '2',
  '0',
  '8',
  '-',
  'K',
  '1',
  '8',
  '2',
  '0',
  '1',
  '9',
  '1',
  'A',
  'E',
  'A',
  '-',
  'B',
  'r',
  'i',
  'd',
  'g',
  'e',
  's',
  'I',
  'm',
  'p',
  'a',
  'c',
  't',
  'C',
  'o',
  'r',
  'p',
  '.',
  '/Archives/edgar/data/1820191/0001193125-20-299260-index.html',
  'browse-edgar?action=getcompany&CIK=1820191'],
 ['1',
  '1',
  '-'

## Downloading SEC documents

Questions: 
* What is the purpose of defining a directory? It does not seem to work when I use it as a parameter for download_sec_documents

* What does the error "index 0 is out of bounds for axis 0 with size 0" mean? I still manage to download the files.

In [28]:
def download_sec_documents(doc_link, directory=None):
    """Download one SEC document and save its decoded text to disk.

    The document is fetched from ``base_url.format(doc_link)`` through the
    shared ``cach`` session, decoded with ``clean_sec_content`` and written
    to a file named ``slugify(doc_link)``.

    :param doc_link: site-relative path of the document.
    :param directory: optional folder to save into; it is created when
        missing. With the default ``None`` the file is written to the
        current working directory, as before.
    """
    import os  # local import: `os` is not in the notebook's import cell

    contents = clean_sec_content(cach.get(base_url.format(doc_link)).content)
    name = slugify(doc_link)
    if directory is not None:
        os.makedirs(directory, exist_ok=True)
        name = os.path.join(directory, name)
    # NOTE(review): written with the platform default encoding, which is also
    # what read_html() reads with -- change both together if at all.
    with open(name, 'w') as f:
        f.write(contents)

#### Downloading 424B5s

In [29]:
# Download the main 424B5 document of every 424B5 filing listed for today.
forms = get_current_events(0, '424B5')
for link in forms['link_0']:
    docs = get_filing_documents(base_url.format(link))
    matches = docs.loc[docs.Type == '424B5', 'Document']
    if matches.empty:
        # No row with exactly this type: skip the filing instead of letting
        # `.values[0]` raise IndexError and abort the whole loop.
        print('No 424B5 document listed for', link)
        continue
    doc_link = matches.values[0]
    download_sec_documents(doc_link)

#### Downloading 8-Ks

In [84]:
num_days = 1

# Walk the "current events" listing one day-page at a time; p is passed to
# get_current_events as days_before.
for p in range(0, num_days):
    print('Scraping day-page:', p)
    forms = get_current_events(p, '8-K')
    for link in forms['link_0']:
        docs = get_filing_documents(base_url.format(link))
        # NOTE(review): `.values[0]` raises IndexError as soon as one filing
        # has no row whose Type is exactly '8-K'. The 8-K listing printed
        # earlier also contains an '8-K/A' entry, whose documents table would
        # presumably list Type '8-K/A' instead -- confirm.
        doc_link = docs.loc[docs.Type == '8-K', 'Document'].values[0]
        download_sec_documents(doc_link)

Scraping day-page: 0


IndexError: index 0 is out of bounds for axis 0 with size 0

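To download the 8-Ks I always get the index error above; however, when I try to download 424B5 filings this is not an issue. The error means that the filter `docs.Type == '8-K'` matched no rows for some filing, so `.values` is empty and `[0]` fails. A likely cause is that the 8-K listing also contains amended `8-K/A` filings, whose documents table would presumably not have a row of type exactly `8-K`. Documents of filings processed before that point are still downloaded. This happens in the filing details page: 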

8-K example: https://www.sec.gov/Archives/edgar/data/926660/0001193125-20-298746-index.html

424B5 example: https://www.sec.gov/Archives/edgar/data/1035443/0001047469-19-001263-index.html


In [30]:
content = read_html('-archives-edgar-data-1629210-000156459020054792-pzg-424b5.htm')
cleaned = clean_html(content)

In [31]:
window(str(cleaned))

## Summary extraction

There is a difference between 424B5 and 8-K forms. The 424B5 forms have summary tables that are what the extract_dual_tables function extracts. 8-K forms are entirely text. We need to find a way to select and extract relevant info from the 8-Ks

In [21]:
def extract_dual_tables(soup):
    """Collect the two-cell rows of every table in *soup*.

    Only ``<tr>`` rows with exactly two ``<td>`` cells and a non-empty second
    cell are considered. A row whose first cell is empty is treated as a
    continuation: its second cell is appended, after a space, to the second
    cell of the previously collected row (and dropped when there is none).

    :return: list of ``[first_cell_text, second_cell_text]`` lists.
    """
    pairs = []
    for tr in soup.select("table tr"):
        cells = [td.text.strip() for td in tr.find_all('td')]
        if len(cells) != 2:
            continue
        label, value = cells
        if not value:
            continue
        if label:
            pairs.append(cells)
        elif pairs:
            # Continuation line: glue it onto the previous row's value.
            pairs[-1][-1] += ' ' + value
    return pairs

In [22]:
from IPython.display import HTML

def window(html):
    """Return an IPython ``HTML`` object that shows *html* in a popup window.

    The generated script opens a new browser window with ``window.open`` and
    assigns *html* to that window's ``document.body.innerHTML``.

    :param html: markup to display, as a string.
    :return: ``IPython.display.HTML`` wrapping the generated ``<script>``.
    """
    # json.dumps produces a double-quoted literal in which quotes,
    # backslashes and line breaks are escaped, and which JavaScript accepts
    # as a string literal. '</' is escaped as well so that markup such as
    # '</script>' inside *html* cannot end the enclosing <script> early.
    js_literal = json.dumps(html).replace('</', '<\\/')
    s = '<script type="text/javascript">'
    s += 'var win = window.open("", "", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = ' + js_literal + ';'
    s += '</script>'
    return HTML(s)

In [23]:
cleaned.find('p', text=re.compile(r'OFF'))

In [24]:
def match_by_name_and_regex(name, regex, lowercase=True):
    """Build a predicate for elements with tag *name* whose text matches *regex*.

    The returned callable takes an element and returns ``True`` when
    ``el.name == name`` and ``re.search(regex, text)`` finds a match, where
    ``text`` is ``el.text`` (lowercased first unless *lowercase* is false).
    """
    def matcher(el):
        if el.name != name:
            return False
        haystack = el.text.lower() if lowercase else el.text
        return re.search(regex, haystack) is not None

    return matcher

In [32]:
def get_offering_header_candidates(soup):
    """Return every ``<p>`` in *soup* whose lowercased text ends with "offering"."""
    is_offering_header = match_by_name_and_regex('p', r'\s*offering\s*$')
    return soup.find_all(is_offering_header)

def get_after_offering_header_tables(header):
    """Return the markup of the run of tables that follows *header*.

    Walks the siblings after *header*. Everything before the first
    ``<table>`` is skipped. From the first table on, tables are collected
    and the walk stops at the first sibling (text node or other tag) that
    contains non-whitespace text; empty siblings in between are ignored.

    :param header: the element whose following siblings are scanned.
    :return: the collected tables' markup concatenated into one string
        (``''`` when no table follows).
    """
    collected = []
    for sibling in header.next_siblings:
        if isinstance(sibling, NavigableString):
            # Text after the first table ends the run.
            if collected and str(sibling).strip() != '':
                break
        elif sibling.name == 'table':
            collected.append(str(sibling))
        elif collected and sibling.get_text(strip=True) != '':
            # A non-empty, non-table tag after the first table ends the run.
            break
    return ''.join(collected)

def extract_offering(soup):
    """Extract the two-column rows of the tables under the first usable header.

    Goes through the header candidates from
    ``get_offering_header_candidates`` in order and, for the first one that
    is followed by at least one table, returns ``extract_dual_tables`` of
    those tables.

    :return: list of ``[label, value]`` rows, or ``None`` when no candidate
        header is followed by a table.
    """
    for header in get_offering_header_candidates(soup):
        tables = get_after_offering_header_tables(header)
        if tables:
            # Explicit parser: the result should not depend on parser auto-detection.
            return extract_dual_tables(BeautifulSoup(tables, 'lxml'))
    return None

        
extract_offering(cleaned)

[['•',
  '•our Annual Report on Form 10-K for the year ended June 30, 2020 filed with the SEC on September 25, 2020;'],
 ['•',
  '•our Quarterly Report on Form 10-Q for the quarter ended September 30, 2020, filed with the SEC on November 05, 2020;'],
 ['•', '•our Current Report on Form 8-K filed on November 20, 2020;'],
 ['•',
  'the portions of our definitive proxy statement on Schedule 14A filed with the SEC on October 27, 2020 that are deemed “filed” with the SEC under the Exchange Act;'],
 ['•',
  'the description of our common stock contained in our Registration Statement on Form 8-A filed on April 6, 2015, as updated by the description of our common stock filed as Exhibit 4.1 to our Annual Report on Form 10-K for the fiscal year ended June 30, 2020 filed with the SEC on September 25, 2020, including any amendments or reports filed for the purpose of updating such description; and'],
 ['•',
  'all reports and other documents subsequently filed by us pursuant to Sections 13(a), 13(

In [26]:
window(str(cleaned))