# Creating an Archive of Company Filings from EDGAR

I decided that I wanted to explore NLP over the prospectuses of convertible bonds.  
These files are submitted to the SEC and made available via the EDGAR database.  EDGAR has company form filings for IPOs, quarterly earnings, bond issuances, and more.  But access to EDGAR is fairly manual.  This notebook shows how to use Python to:

1. Create an index of all filings (for all form types)
1. Quickly download the history of the convertible bond prospectuses

In [1]:
import datetime
import os
import pandas as pd
import random
import re
import requests
import string
import threading
import time

from bs4 import BeautifulSoup
from io import StringIO

# --- EDGAR locations -------------------------------------------------------
# all .idx urls are relative to this
EDGAR_BASE_URL = "https://www.sec.gov/Archives"
# top of the year -> quarter directory listing that holds the index files
EDGAR_INDEX_URL = "https://www.sec.gov/Archives/edgar/full-index/"

# --- Link-text patterns used while walking the directory listing -----------
YEAR_REGEX = re.compile(r'^\d{4}$')              # four-digit year folder
QUARTER_REGEX = re.compile(r'^QTR[1-4]$')        # QTR1 .. QTR4 folder
COMPANY_REGEX = re.compile(r'^company\.idx$')    # the index file we download

# --- Local storage ---------------------------------------------------------
DOWNLOAD_PATH = "/media/mallinger/DATA/edgar"
CONVERT_DOWNLOAD_PATH = os.path.join(DOWNLOAD_PATH, "convertible-bonds")



## Scrape the Index Locations

The EDGAR indexes are behind two layers of web pages.  The first (https://www.sec.gov/Archives/edgar/full-index/) provides a table of contents for each year.  The second (1997 example: https://www.sec.gov/Archives/edgar/full-index/1997/) provides a table of contents for the quarters in each year.

Each quarter directory contains many files, but the .idx files all hold the same records, simply ordered by different criteria.  So we'll take "company.idx" as the index for each quarter.

In [None]:
def parse_index_page(url, link_re, base_url=None):
    """Return the links on a contents page whose text matches a pattern.

    Helpful for scraping table of contents pages.

    Args:
        url: Address of the contents page to fetch.
        link_re: Compiled regular expression.  A link is kept when
            ``link_re.match`` succeeds on the link's visible text.
        base_url: Optional URL that every matching ``href`` is resolved
            against.  When omitted (or empty) the ``href`` values are
            returned exactly as they appear in the page.

    Returns:
        A list of URL strings in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    from urllib.parse import urljoin

    index = requests.get(url)
    # Fail with the real HTTP error instead of trying to scrape an error page.
    index.raise_for_status()
    soup = BeautifulSoup(index.content, 'html.parser')
    # all content links on the EDGAR pages are inside the main area
    # designated by this HTML id attribute.
    content_links = soup.find(id="main-content").find_all('a')

    hrefs = [link.get("href") for link in content_links
             if link_re.match(link.getText())]
    if not base_url:
        return hrefs

    # urljoin replaces the last path segment of the base unless the base
    # ends with "/", so normalise to exactly one trailing slash.  This also
    # avoids the "//" that plain string formatting produced when the base
    # already ended with a slash.
    directory_url = base_url.rstrip("/") + "/"
    return [urljoin(directory_url, href) for href in hrefs]

def parse_all_index_pages(urls, link_re):
    """Collect the matching links from several contents pages.

    Every URL in ``urls`` is scraped with ``parse_index_page``, using the
    page's own URL as the base for its links.  The per-page results are
    concatenated, in order, into one flat list.
    """
    return [
        link
        for page_url in urls
        for link in parse_index_page(page_url, link_re, base_url=page_url)
    ]

# Walk the two-level EDGAR directory listing down to the index files:
#   year folders -> quarter folders -> the company.idx file of each quarter.
year_urls = parse_index_page(
    url=EDGAR_INDEX_URL, link_re=YEAR_REGEX, base_url=EDGAR_INDEX_URL)
quarter_urls = parse_all_index_pages(urls=year_urls, link_re=QUARTER_REGEX)
index_urls = parse_all_index_pages(urls=quarter_urls, link_re=COMPANY_REGEX)

## Create the Index

`index_urls` has a URL for every .idx file.  This code iterates over each and parses the index file into a Pandas dataframe.  `index` is the end result of that.

In [None]:
def parse_index_file(url):
    """Download an EDGAR .idx file and parse it into a Pandas dataframe.

    The file is treated as fixed-width text: a comment header, a row of
    hyphens, then one record per line.

    Args:
        url: Address of the .idx file.

    Returns:
        A dataframe with string columns ``name``, ``form``, ``cik`` and
        ``url`` and a datetime column ``date``; one row per non-blank
        record line.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the file contains no hyphen separator row.
    """
    response = requests.get(url)
    # Do not try to parse an HTML error page as an index.
    response.raise_for_status()

    # note: work with bytes strings here as the offsets for fixed width columns
    # get thrown off if there are conversion errors or bad characters (There are!)
    rows = response.content.split(b"\n")

    # there is a header consisting of leading rows with comments followed by
    # a row of hyphens to start the data (------)
    separator = next(
        (i for i, row in enumerate(rows) if row.startswith(b"------")), None)
    if separator is None:
        raise ValueError(
            "no '------' header separator found in index file {}".format(url))

    # keep the data rows only, skipping any that are empty or all whitespace
    rows = [row for row in rows[separator + 1:] if row.strip()]

    # note, we can't use pd.read_fwf because of invalid characters in the bytes records.
    # this approach avoids lost records
    # Byte offsets of the name, form, cik, date and url columns.
    column_slices = (slice(0, 62), slice(62, 74), slice(74, 86),
                     slice(86, 98), slice(98, None))
    array_frame = [
        tuple(row[cols].strip().decode("utf-8", "ignore") for cols in column_slices)
        for row in rows
    ]
    df = pd.DataFrame(array_frame, columns=["name", "form", "cik", "date", "url"])
    df["date"] = pd.to_datetime(df["date"])
    return df

# Parse every quarterly index first and concatenate once at the end.
# Calling pd.concat inside the loop copies all previously accumulated rows
# on every iteration, which is quadratic in the total number of rows.
frames = [parse_index_file(url) for url in index_urls]
# pd.concat rejects an empty list, so keep ``index`` as None when there was
# nothing to parse.
index = pd.concat(frames, ignore_index=True) if frames else None
# drop the per-quarter frames so only the combined index stays in memory
del frames


## Save the Index

Note that this code is extremely memory intensive.  When Pandas creates the pickle file, it consumes as much as 12 GB of memory on my machine.  If you don't have more than 10 GB of memory free, you may want to change it to save the index in batches.

In [None]:
# Write the combined filing index to DOWNLOAD_PATH as a gzip-compressed
# pickle so that it can be reloaded without repeating the scrape.
index.to_pickle(os.path.join(DOWNLOAD_PATH, "edgar_index.pickle.gzip"), compression="gzip")

## Load the Index

(This cell is here to make it easy to start from previously saved data.)

In [None]:
# Reload the filing index from the gzip-compressed pickle in DOWNLOAD_PATH.
# NOTE: unpickling can execute code stored in the file, so only load a
# pickle that you created yourself.
index = pd.read_pickle(os.path.join(DOWNLOAD_PATH, "edgar_index.pickle.gzip"), compression="gzip")

## Downloading Forms

Once we have the index, downloading forms is very easy.  All form URLs in the index are relative to the EDGAR archive URL.  We simply follow each one and save it as is.

In [None]:
def download_records(records, path, file_prefix):
    """Download every filing listed in ``records`` into the directory ``path``.

    Each file is named ``<file_prefix>-<cik>-<YYYYMMDD>-<padding>.txt``,
    where the padding is five random alphanumeric characters.  The response
    body is decoded as UTF-8 (undecodable bytes are dropped) and written
    back out as UTF-8.

    A record whose request fails (connection error or 4xx/5xx status) is
    reported with ``print`` and skipped, so one bad URL neither aborts the
    batch nor leaves an error page on disk disguised as a filing.

    Args:
        records: Dataframe with a ``url`` column (relative to
            ``EDGAR_BASE_URL``), a ``cik`` column and a ``date`` column
            whose values support ``strftime``.
        path: Target directory; created if it does not exist yet.
        file_prefix: Leading component of every file name.
    """
    os.makedirs(path, exist_ok=True)
    alphabet = string.ascii_letters + string.digits

    for count, (_index, record) in enumerate(records.iterrows(), start=1):
        url = "{}/{}".format(EDGAR_BASE_URL, record["url"])
        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.RequestException as err:
            print("Skipping {}: {}".format(url, err))
        else:
            content = response.content.decode("utf-8", "ignore")
            # add a random padding to avoid filename collisions
            padding = ''.join(random.choice(alphabet) for _ in range(5))
            filename = "{prefix}-{cik}-{date}-{padding}.txt".format(
                prefix=file_prefix,
                cik=record["cik"],
                date=record["date"].strftime("%Y%m%d"),
                padding=padding)

            # explicit encoding: the text was decoded as UTF-8 above, so do
            # not let the platform's locale pick a different codec
            with open(os.path.join(path, filename), "w", encoding="utf-8") as fh:
                fh.write(content)

        # sleep for a second every 10 records (attempted, not just saved)
        if count % 10 == 0:
            time.sleep(1)

# create a dataset to download
is_form_424b2 = index["form"].str.contains("424B2")
records = index[is_form_424b2]

# we'll run this in batches of threads
# NOTE: We don't run "join", i.e. the threads will stay in the background
threads = []
num_threads = 5  # configuration here
# Ceiling division, floored at 1.  With plain floor division the batch size
# is 0 whenever there are fewer records than threads, so the offset never
# advances and threads are spawned forever.  A remainder would also spill
# into an extra thread beyond num_threads.
batch_size = max(1, -(-len(records) // num_threads))

for offset in range(0, len(records), batch_size):
    # .iloc makes the positional (row-number) slicing explicit
    batch = records.iloc[offset:offset + batch_size]
    t = threading.Thread(target=download_records, args=(batch, CONVERT_DOWNLOAD_PATH, "424b"))
    threads.append(t)
    t.start()

## Save Corpus as DataFrame

The full corpus of convertible bond prospectuses from 1993 onward is too large to fit in memory, so we'll limit it to more recent (and more relevant) filings.  Currently, our dataset is just the first two months of 2019.  Keeping the full contents of each document is helpful for exploring the data, but future iterations should parse the documents to reduce their size.

In [None]:
# Map each CIK to a company name using the filing index.
name_index = index[["cik", "name"]].drop_duplicates()
name_hash = name_index.set_index('cik').to_dict()['name']

# Only filings dated inside [window_start, window_end) are kept.  The bounds
# are built once here instead of on every loop iteration.
window_start = datetime.datetime(2019, 1, 1)
window_end = datetime.datetime(2019, 3, 1)

counter = 0
records = []
for filename in os.listdir(CONVERT_DOWNLOAD_PATH):
    # the CIK and the YYYYMMDD date are the 2nd and 3rd of the four
    # hyphen-separated parts of the file name
    _, cik, date, _ = filename.split("-")
    parsed_date = datetime.datetime.strptime(date, "%Y%m%d")
    if not (window_start <= parsed_date < window_end):
        continue

    name = name_hash[cik]
    path = os.path.join(CONVERT_DOWNLOAD_PATH, filename)
    # The downloader decodes each filing as UTF-8 before saving it, so read
    # it back with that codec explicitly rather than with the platform's
    # locale default.
    with open(path, encoding="utf-8") as fh:
        content = fh.read()
    records.append((cik, name, parsed_date, filename, content))

    # progress report: counts only the files that were kept
    if counter % 1000 == 0:
        print("Counter: {}".format(counter))
    counter += 1

df = pd.DataFrame(records, columns=("cik", "name", "date", "filename", "document"))
df.to_pickle(os.path.join(DOWNLOAD_PATH, "424b_201901-201902.pickle.gzip"), compression="gzip")

## Confirm Data Loading

In [7]:
# Reload the pickled corpus from disk to confirm that it can be read back.
df = pd.read_pickle(os.path.join(DOWNLOAD_PATH, "424b_201901-201902.pickle.gzip"), compression="gzip")
# number of documents in the corpus
print(len(df))
# display the raw text stored in the "document" column of the row labelled 0
df.loc[0, "document"]

7039


'<SEC-DOCUMENT>0001140361-19-002482.txt : 20190205\n<SEC-HEADER>0001140361-19-002482.hdr.sgml : 20190205\n<ACCEPTANCE-DATETIME>20190205161722\nACCESSION NUMBER:\t\t0001140361-19-002482\nCONFORMED SUBMISSION TYPE:\t424B2\nPUBLIC DOCUMENT COUNT:\t\t1\nFILED AS OF DATE:\t\t20190205\nDATE AS OF CHANGE:\t\t20190205\n\nFILER:\n\n\tCOMPANY DATA:\t\n\t\tCOMPANY CONFORMED NAME:\t\t\tESSEX PROPERTY TRUST, INC.\n\t\tCENTRAL INDEX KEY:\t\t\t0000920522\n\t\tSTANDARD INDUSTRIAL CLASSIFICATION:\tREAL ESTATE INVESTMENT TRUSTS [6798]\n\t\tIRS NUMBER:\t\t\t\t770369576\n\t\tSTATE OF INCORPORATION:\t\t\tMD\n\t\tFISCAL YEAR END:\t\t\t1231\n\n\tFILING VALUES:\n\t\tFORM TYPE:\t\t424B2\n\t\tSEC ACT:\t\t1933 Act\n\t\tSEC FILE NUMBER:\t333-227600\n\t\tFILM NUMBER:\t\t19568133\n\n\tBUSINESS ADDRESS:\t\n\t\tSTREET 1:\t\t1100 PARK PLACE\n\t\tSTREET 2:\t\tSUITE 200\n\t\tCITY:\t\t\tSAN MATEO\n\t\tSTATE:\t\t\tCA\n\t\tZIP:\t\t\t94403\n\t\tBUSINESS PHONE:\t\t6506557800\n\n\tMAIL ADDRESS:\t\n\t\tSTREET 1:\t\t1100 PARK P