# Imports & Init

In [6]:
import pandas as pd

from pyedgar import config
from pyedgar import EDGARIndex

In [3]:
# Create the index handle used by the cells below (`idx.indices`, `idx['all']`).
idx = EDGARIndex()

# Download Index File

In [11]:
# Show the CACHE_INDEX setting from the pyedgar config.
# NOTE(review): presumably controls whether downloaded source index files are kept on disk — confirm.
config.CACHE_INDEX

True

In [12]:
# Show the configured INDEX_CACHE_ROOT path (in this run, the 'source/' folder under the indices directory).
config.INDEX_CACHE_ROOT

'C:\\Users\\gaulinmp/Dropbox/Documents/School/_data/pyedgar/indices/source/'

In [13]:
# Show the configured INDEX_ROOT path; the index files listed by `idx.indices` live in this directory.
config.INDEX_ROOT

'C:\\Users\\gaulinmp/Dropbox/Documents/School/_data/pyedgar/indices/'

In [14]:
# Run this to download all indexes (which files to keep is defined in the config).
# This is slow: in the run recorded below, the download took about 7 minutes
# and the extraction about 5.5 minutes.
EDGARIndex(force_download=True)

Downloading the quarterly indices...
108it [06:58,  3.87s/it]
Done downloading, extracting...
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [05:34<00:00, 55.82s/it]
Done!


<pyedgar.index.EDGARIndex at 0x1d48a388ca0>

# Use Index File

In [15]:
# Look at the indices available: a mapping from each index file name to its full path.
# Prior to downloading, this will be empty.
idx.indices

{'form_10-K.tab.gz': 'C:\\Users\\gaulinmp/Dropbox/Documents/School/_data/pyedgar/indices/form_10-K.tab.gz',
 'form_10-Q.tab.gz': 'C:\\Users\\gaulinmp/Dropbox/Documents/School/_data/pyedgar/indices/form_10-Q.tab.gz',
 'form_10.periods.tab.gz': 'C:\\Users\\gaulinmp/Dropbox/Documents/School/_data/pyedgar/indices/form_10.periods.tab.gz',
 'form_13s.tab.gz': 'C:\\Users\\gaulinmp/Dropbox/Documents/School/_data/pyedgar/indices/form_13s.tab.gz',
 'form_8-K.tab.gz': 'C:\\Users\\gaulinmp/Dropbox/Documents/School/_data/pyedgar/indices/form_8-K.tab.gz',
 'form_all.tab.gz': 'C:\\Users\\gaulinmp/Dropbox/Documents/School/_data/pyedgar/indices/form_all.tab.gz',
 'form_DEF14A.tab.gz': 'C:\\Users\\gaulinmp/Dropbox/Documents/School/_data/pyedgar/indices/form_DEF14A.tab.gz'}

In [16]:
# Load the 'all' index (presumably backed by form_all.tab.gz — confirm).
# The cells below treat it as a pandas DataFrame with a `form` column.
all_index = idx['all']

In [18]:
# Twenty most frequent form types across the full index.
all_index["form"].value_counts().head(20)

4           7627989
8-K         1702084
SC 13G/A     739963
3            693082
10-Q         605468
497          455165
6-K          430719
SC 13G       401678
424B2        376140
424B3        291222
13F-HR       286966
D            276334
SC 13D/A     239206
4/A          235424
497K         220107
5            214422
CORRESP      208141
10-K         199316
UPLOAD       198937
D/A          186400
Name: form, dtype: int64

In [20]:
# Same tally as above, restricted to forms whose name contains "S-1".
# Note: this matches anywhere in the name, so it also picks up forms such as
# "ABS-15G" and "S-11", not just S-1 filings.
all_index.loc[all_index["form"].str.contains('S-1'), "form"].value_counts()

S-1/A        58586
S-1          24803
ABS-15G      13460
ABS-15G/A     3676
S-11/A        3555
S-1MEF        1981
S-11          1158
S-11MEF         83
Name: form, dtype: int64

In [24]:
# Keep only the rows whose form is exactly "S-1" (amendments such as "S-1/A" are excluded).
s1_index = all_index.query('form == "S-1"')
# Alternatively:
# s1_index = EDGARIndex()['all'].query('form == "S-1"')
s1_index.head()

Unnamed: 0,cik,name,form,filedate,accession
6022,1961,WORLDS INC,S-1,2014-02-04,0001264931-14-000033
6063,1961,WORLDS INC,S-1,2016-08-23,0001264931-16-000397
6080,1961,WORLDS INC,S-1,2018-05-01,0001264931-18-000036
11220,2186,RELM WIRELESS CORP,S-1,2000-06-07,0000950115-00-000797
11241,2186,RELM WIRELESS CORP,S-1,2001-12-19,0001021408-01-511568


In [25]:
# Select forms that are exactly "S-1" or "S-1/A" (anchored with ^ and $).
# The optional "/A" suffix uses a non-capturing group (?:...): pandas emits a
# UserWarning from str.contains when the pattern contains a capturing group.
s1_with_ammendments = all_index[all_index.form.str.contains(r'^S-1(?:/A)?$')]
s1_with_ammendments.head()

  return func(self, *args, **kwargs)


Unnamed: 0,cik,name,form,filedate,accession
6022,1961,WORLDS INC,S-1,2014-02-04,0001264931-14-000033
6063,1961,WORLDS INC,S-1,2016-08-23,0001264931-16-000397
6080,1961,WORLDS INC,S-1,2018-05-01,0001264931-18-000036
6084,1961,WORLDS INC,S-1/A,2018-06-08,0001264931-18-000065
7503,2036,ACF INDUSTRIES INC,S-1/A,1996-05-09,0000921749-96-000055


# Example loop over extracts

In [27]:
from pyedgar.filing import Filing

In [29]:
# Collect one result dict per S-1 filing; failures are recorded on the row
# instead of aborting the loop.
results = []
for row in s1_index.itertuples():
    # Make an object to store this loop's calculated values
    i_res = {'cik': row.cik, 'accession': row.accession}
    # By immediately adding it to the results, any errors thrown below
    # will just not add those variables, so you can effectively see how
    # far for which cik/accession your algorithm got
    results.append(i_res)

    try:
        # Build the Filing inside the try as well, so that any exception it
        # raises is recorded on this row rather than killing the loop.
        filing = Filing(cik=row.cik, accession=row.accession)
        # I suggest wrapping each variable you want to add in a try/except like this
        i_res['doc_len'] = len(filing.full_text)
    except Exception as e:
        # yes, catch all exceptions are bad, but we don't want our
        # loop dying 4 hours in at iteration 100,000.
        i_res['error'] = e

    break # remove this break to process all S-1s

In [30]:
results

[{'cik': 1961, 'accession': '0001264931-14-000033', 'doc_len': 4248391}]

In [32]:
pd.DataFrame(results)

Unnamed: 0,cik,accession,doc_len
0,1961,0001264931-14-000033,4248391
