In [1]:
import requests
from bs4 import BeautifulSoup

# Section One: Define the Parameters of the Search
To create a search we need to "build" a URL that takes us to a valid results query, this requires taking our base endpoint and attaching on different parameters to help narrow down our search. I'll do my best to explain how each of these parameters works, but unfortunately, there is no formal documentation on this.

Endpoint The endpoint for our EDGAR query is https://www.sec.gov/cgi-bin/browse-edgar if you go to this link without any additional parameters it will be an invalid request.

--------------------------------------------------------------------
### Parameters:

- **action:** (required) By default should be set to getcompany.

- **CIK**: (required) Is the CIK number of the company you are searching.

- **type**: (optional) Allows filtering the type of form. For example, if set to 10-k only the 10-K filings are returned.

- **dateb**: (optional) Will only return the filings before a given date. The format is as follows YYYYMMDD

- **owner:** (required) Is set to exclude by default and specifies ownership. You may also set it to include and only.

- **start:** (optional) Is the starting index of the results. For example, if I have 100 results but want to start at 45 of 100, I would pass 45.

- **state:** (optional) The company's state.

- **filenum:** (optional) The filing number.

- **sic:** (optional) The company's SIC (Standard Industry Classification) identifier
- **output:** (optional) Defines returned data structure as either xml (atom) or normal html.

- **count:** (optional) The number of results you want to see with your request, the max is 100 and if not set it will default to 40.

------------------------------------------------------------------------------
Now that we understand all the parameters let's make a request by defining our endpoint, and then a dictionary of our parameters. Where the key of the dictionary is the parameter name, and the value is the value we want to set for that parameter. Once we've defined these two components we can make our request and parse the response using BeautifulSoup.

In [17]:
# base URL for the SEC EDGAR browser
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

# define our parameters dictionary
param_dict = {'action':'getcompany',
              'owner':'exclude',
              'output':'atom',
              'company':'Microsoft'}

# request the url, and then parse the response.
response = requests.get(url = endpoint, params = param_dict)
soup = BeautifulSoup(response.content, 'lxml')

# print status code
print(response.status_code)
print(response.url)

200
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&owner=exclude&output=atom&company=Microsoft


In [27]:
# base URL for the SEC EDGAR browser
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

# define our parameters dictionary
param_dict = {'action':'getcompany',
              'CIK':' 0000789019',
              'type':'10-k',
              'dateb':'',
              'owner':'exclude',
              'start':'',
              'output':'atom',
              'count':'100'}

# request the url, and then parse the response.
response = requests.get(url = endpoint, params = param_dict)
soup = BeautifulSoup(response.content, 'lxml')

# print status code
print(response.status_code)
print(response.url)

200
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=+0000789019&type=10-k&dateb=&owner=exclude&start=&output=atom&count=100


In [35]:
entries = soup.find_all('entry')
# print(entries)

# initalize our list for storage
master_list_xml = []

# loop through each found entry, remember this is only the first two
for entry in entries:
    # grab the accession number so we can create a key value
    accession_num = entry.find('accession-number').text
    
    # create a new dictionary
    entry_dict = {}
    entry_dict[accession_num] = {}
    
    # store the category info
    category_info = entry.find('category')    
    entry_dict[accession_num]['category'] = {}
    entry_dict[accession_num]['category']['label'] = category_info['label']
    entry_dict[accession_num]['category']['scheme'] = category_info['scheme']
    entry_dict[accession_num]['category']['term'] =  category_info['term']

    # store the file info
    entry_dict[accession_num]['file_info'] = {}

    try:
        entry_dict[accession_num]['file_info']['act'] = entry.find('act').text
    except:
        entry_dict[accession_num]['file_info']['act'] = ""

    entry_dict[accession_num]['file_info']['file_number'] = entry.find('file-number').text
    entry_dict[accession_num]['file_info']['file_number_href'] = entry.find('file-number-href').text
    entry_dict[accession_num]['file_info']['filing_date'] =  entry.find('filing-date').text
    entry_dict[accession_num]['file_info']['filing_href'] = entry.find('filing-href').text
    entry_dict[accession_num]['file_info']['filing_type'] =  entry.find('filing-type').text
    entry_dict[accession_num]['file_info']['form_number'] =  entry.find('film-number').text
    entry_dict[accession_num]['file_info']['form_name'] =  entry.find('form-name').text
    entry_dict[accession_num]['file_info']['file_size'] =  entry.find('size').text
    
    try:
        entry_dict[accession_num]['file_info']['xbrl_href'] = entry.find('xbrl_href').text
    except:
        entry_dict[accession_num]['file_info']['xbrl_href'] = ""
    
    # store extra info
    entry_dict[accession_num]['request_info'] = {}
    entry_dict[accession_num]['request_info']['link'] =  entry.find('link')['href']
    entry_dict[accession_num]['request_info']['title'] =  entry.find('title').text
    entry_dict[accession_num]['request_info']['last_updated'] =  entry.find('updated').text
    
    # store in the master list
    master_list_xml.append(entry_dict)

In [39]:
master_list_xml[0]["0001564590-19-027952"]["file_info"]

{'act': '34',
 'file_number': '001-37845',
 'file_number_href': 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&filenum=001-37845&owner=exclude&count=100',
 'filing_date': '2019-08-01',
 'filing_href': 'https://www.sec.gov/Archives/edgar/data/789019/000156459019027952/0001564590-19-027952-index.htm',
 'filing_type': '10-K',
 'form_number': '19992755',
 'form_name': 'Annual report [Section 13 and 15(d), not S-K Item 405]',
 'file_size': '26 MB',
 'xbrl_href': 'https://www.sec.gov/cgi-bin/viewer?action=view&cik=789019&accession_number=0001564590-19-027952&xbrl_type=v'}