In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Section One: Define the Parameters of the Search
To create a search we need to "build" a URL that takes us to a valid results query, this requires taking our base endpoint and attaching on different parameters to help narrow down our search. I'll do my best to explain how each of these parameters works, but unfortunately, there is no formal documentation on this.

Endpoint The endpoint for our EDGAR query is https://www.sec.gov/cgi-bin/browse-edgar if you go to this link without any additional parameters it will be an invalid request.

--------------------------------------------------------------------
### Parameters:

- **action:** (required) By default should be set to getcompany.

- **CIK**: (required) Is the CIK number of the company you are searching.

- **type**: (optional) Allows filtering the type of form. For example, if set to 10-k only the 10-K filings are returned.

- **dateb**: (optional) Will only return the filings before a given date. The format is as follows YYYYMMDD

- **owner:** (required) Is set to exclude by default and specifies ownership. You may also set it to include and only.

- **start:** (optional) Is the starting index of the results. For example, if I have 100 results but want to start at 45 of 100, I would pass 45.

- **state:** (optional) The company's state.

- **filenum:** (optional) The filing number.

- **sic:** (optional) The company's SIC (Standard Industry Classification) identifier
- **output:** (optional) Defines returned data structure as either xml (atom) or normal html.

- **count:** (optional) The number of results you want to see with your request, the max is 100 and if not set it will default to 40.

------------------------------------------------------------------------------
Now that we understand all the parameters let's make a request by defining our endpoint, and then a dictionary of our parameters. Where the key of the dictionary is the parameter name, and the value is the value we want to set for that parameter. Once we've defined these two components we can make our request and parse the response using BeautifulSoup.

In [37]:
# base URL for the SEC EDGAR browser
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

# define our parameters dictionary
param_dict = {'action':'getcompany',
              'CIK':'0000789019',
              'type':'10-Q',
              'dateb':'',
              'owner':'exclude',
              'start':'',
              'output':'',
              'count':'100'}

# request the url, and then parse the response.
response = requests.get(url = endpoint, params = param_dict)
soup = BeautifulSoup(response.content, 'lxml')

# print status code
print(response.status_code)
print(response.url)

200
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000789019&type=10-Q&dateb=&owner=exclude&start=&output=&count=100


In [49]:
# this list will save all interative data links
I_D_links = []

# get all interactive data financial statements
table = soup.find(class_="tableFile2")
dir(table)
# rows = table.findAllNext(id="interactiveDataBtn")
# rows[0]

['HTML_FORMATTERS',
 'XML_FORMATTERS',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_attr_value_as_string',
 '_attribute_checker',
 '_find_all',
 '_find_one',
 '_formatter_for_name',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_select_debug',
 '_selector_combinators',
 '_should_pretty_print',
 '_tag_name_matches_and',
 'append',
 'attribselect_re',
 'attrs',
 'can_be_empty_element',
 'childGenerator',
 'children',
 'clear',
 'contents',
 'decode',
 'decode_contents',
 'decomp

In [88]:
rows = table.find_all("tr")[1:]
# for row in rows:
#     print()
# "interactiveDataBtn" in rows[0].decode()
# rows[0]
# rows[0].find_all("td")[3].string
# rows[0].find(id="interactiveDataBtn").get('href')

In [10]:
F_S_links = []
for l in I_D_links:
    endpoint = l
    response = requests.get(url = endpoint)
    soup = BeautifulSoup(response.content, 'lxml')
    entries = soup.find("a", string="View Excel Document")
    F_S_links.append("https://www.sec.gov" + entries.get('href'))
    print(entries.get('href'))
    

/Archives/edgar/data/1018724/000101872420000021/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872420000010/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872419000089/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872419000071/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872419000043/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872418000159/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872418000108/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872418000072/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872417000135/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872417000100/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872417000051/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872416000324/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872416000286/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872416000227/Financial_Report.xlsx
/Archives/edgar/data

In [11]:
F_S_links

['https://www.sec.gov/Archives/edgar/data/1018724/000101872420000021/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872420000010/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872419000089/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872419000071/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872419000043/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872418000159/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872418000108/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872418000072/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872417000135/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872417000100/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872417000051/F

In [12]:
# the most recent link to the xls doc
F_S_links[0]

'https://www.sec.gov/Archives/edgar/data/1018724/000101872420000021/Financial_Report.xlsx'

In [13]:
# lest download the document using pandas
doc = pd.ExcelFile(F_S_links[0])

In [14]:
# the xls have a lot of sheets, lets see one
df1 = pd.read_excel(df, 'Consolidated Balance Sheets')
df1

NameError: name 'df' is not defined

In [None]:
# to see all sheet names of the doc, we will use the sheet_names atribute
# it's a list wit all sheet names
doc.sheet_names

In [None]:
# it's a list wit all atributes
doc.sheet_names

In [None]:
# lest see one sheet using
pd.read_excel(doc, doc.sheet_names[1])