In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


# Section One: Define the Parameters of the Search
To create a search we need to "build" a URL that takes us to a valid results query, this requires taking our base endpoint and attaching on different parameters to help narrow down our search. I'll do my best to explain how each of these parameters works, but unfortunately, there is no formal documentation on this.

Endpoint The endpoint for our EDGAR query is https://www.sec.gov/cgi-bin/browse-edgar if you go to this link without any additional parameters it will be an invalid request.

--------------------------------------------------------------------
### Parameters:

- **action:** (required) By default should be set to getcompany.

- **CIK**: (required) Is the CIK number of the company you are searching.

- **type**: (optional) Allows filtering the type of form. For example, if set to 10-k only the 10-K filings are returned.

- **dateb**: (optional) Will only return the filings before a given date. The format is as follows YYYYMMDD

- **owner:** (required) Is set to exclude by default and specifies ownership. You may also set it to include and only.

- **start:** (optional) Is the starting index of the results. For example, if I have 100 results but want to start at 45 of 100, I would pass 45.

- **state:** (optional) The company's state.

- **filenum:** (optional) The filing number.

- **sic:** (optional) The company's SIC (Standard Industry Classification) identifier
- **output:** (optional) Defines returned data structure as either xml (atom) or normal html.

- **count:** (optional) The number of results you want to see with your request, the max is 100 and if not set it will default to 40.

------------------------------------------------------------------------------
Now that we understand all the parameters let's make a request by defining our endpoint, and then a dictionary of our parameters. Where the key of the dictionary is the parameter name, and the value is the value we want to set for that parameter. Once we've defined these two components we can make our request and parse the response using BeautifulSoup.

In [2]:
# base URL for the SEC EDGAR browser
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

# define our parameters dictionary
param_dict = {'action':'getcompany',
              'CIK':' 1018724',
              'type':'10-Q',
              'dateb':'',
              'owner':'exclude',
              'start':'',
              'output':'',
              'count':'100'}

# request the url, and then parse the response.
response = requests.get(url = endpoint, params = param_dict)
soup = BeautifulSoup(response.content, 'lxml')

# print status code
print(response.status_code)
print(response.url)

200
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=+1018724&type=10-Q&dateb=&owner=exclude&start=&output=&count=100


In [3]:
# this list will save all interative data links
I_D_links = []

# get all interactive data financial statements
entries = soup.find_all(id="interactiveDataBtn")

for e in entries:
    I_D_links.append("https://www.sec.gov" + e.get('href'))

In [4]:
I_D_links

['https://www.sec.gov/cgi-bin/viewer?action=view&cik=1018724&accession_number=0001018724-20-000021&xbrl_type=v',
 'https://www.sec.gov/cgi-bin/viewer?action=view&cik=1018724&accession_number=0001018724-20-000010&xbrl_type=v',
 'https://www.sec.gov/cgi-bin/viewer?action=view&cik=1018724&accession_number=0001018724-19-000089&xbrl_type=v',
 'https://www.sec.gov/cgi-bin/viewer?action=view&cik=1018724&accession_number=0001018724-19-000071&xbrl_type=v',
 'https://www.sec.gov/cgi-bin/viewer?action=view&cik=1018724&accession_number=0001018724-19-000043&xbrl_type=v',
 'https://www.sec.gov/cgi-bin/viewer?action=view&cik=1018724&accession_number=0001018724-18-000159&xbrl_type=v',
 'https://www.sec.gov/cgi-bin/viewer?action=view&cik=1018724&accession_number=0001018724-18-000108&xbrl_type=v',
 'https://www.sec.gov/cgi-bin/viewer?action=view&cik=1018724&accession_number=0001018724-18-000072&xbrl_type=v',
 'https://www.sec.gov/cgi-bin/viewer?action=view&cik=1018724&accession_number=0001018724-17-0001

In [5]:
endpoint = 'https://www.sec.gov/cgi-bin/viewer?action=view&cik=789019&accession_number=0001564590-20-019706&xbrl_type=v'
response = requests.get(url = endpoint)

In [6]:
soup = BeautifulSoup(response.content, 'lxml')
entries = soup.find("a", string="View Excel Document")
entries.get('href')

'/Archives/edgar/data/789019/000156459020019706/Financial_Report.xlsx'

In [7]:
F_S_links = []
for l in I_D_links:
    endpoint = l
    response = requests.get(url = endpoint)
    soup = BeautifulSoup(response.content, 'lxml')
    entries = soup.find("a", string="View Excel Document")
    F_S_links.append("https://www.sec.gov" + entries.get('href'))
    print(entries.get('href'))
    

/Archives/edgar/data/1018724/000101872420000021/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872420000010/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872419000089/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872419000071/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872419000043/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872418000159/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872418000108/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872418000072/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872417000135/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872417000100/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872417000051/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872416000324/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872416000286/Financial_Report.xlsx
/Archives/edgar/data/1018724/000101872416000227/Financial_Report.xlsx
/Archives/edgar/data

In [8]:
F_S_links

['https://www.sec.gov/Archives/edgar/data/1018724/000101872420000021/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872420000010/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872419000089/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872419000071/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872419000043/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872418000159/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872418000108/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872418000072/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872417000135/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872417000100/Financial_Report.xlsx',
 'https://www.sec.gov/Archives/edgar/data/1018724/000101872417000051/F

In [30]:
# the most recent link to the xls doc
F_S_links[0]

'https://www.sec.gov/Archives/edgar/data/1018724/000101872420000021/Financial_Report.xlsx'

In [31]:
# lest download the document using pandas
doc = pd.ExcelFile(F_S_links[0])

In [32]:
# the xls have a lot of sheets, lets see one
df1 = pd.read_excel(df, 'Consolidated Balance Sheets')
df1

Unnamed: 0,Consolidated Balance Sheets - USD ($) $ in Millions,"Jun. 30, 2020","Dec. 31, 2019"
0,Current assets:,,
1,Cash and cash equivalents,37466.0,36092.0
2,Marketable securities,33925.0,18929.0
3,Inventories,19599.0,20497.0
4,"Accounts receivable, net and other",19918.0,20816.0
5,Total current assets,110908.0,96334.0
6,"Property and equipment, net",86517.0,72705.0
7,Operating leases,28537.0,25141.0
8,Goodwill,14751.0,14754.0
9,Other assets,17601.0,16314.0


In [34]:
# to see all sheet names of the doc, we will use the sheet_names atribute
# it's a list wit all sheet names
doc.sheet_names

['Document and Entity Information',
 'Consolidated Statements of Cash',
 'Consolidated Statements of Oper',
 'Consolidated Statements of Comp',
 'Consolidated Statements of Co_2',
 'Consolidated Balance Sheets',
 'Consolidated Balance Sheets (Pa',
 'Accounting Policies and Supplem',
 'Financial Instruments',
 'Leases',
 'Commitments and Contingencies',
 'Debt',
 "Stockholders' Equity",
 'Income Taxes',
 'Segment Information',
 'Accounting Policies and Suppl_2',
 'Accounting Policies and Suppl_3',
 'Financial Instruments (Tables)',
 'Leases (Tables)',
 'Commitments and Contingencies (',
 'Debt (Tables)',
 "Stockholders' Equity (Tables)",
 'Segment Information (Tables)',
 'Accounting Policies and Suppl_4',
 'Accounting Policies and Suppl_5',
 'Accounting Policies and Suppl_6',
 'Accounting Policies and Suppl_7',
 'Accounting Policies and Suppl_8',
 'Accounting Policies and Suppl_9',
 'Financial Instruments - Fair Va',
 'Financial Instruments - Contrac',
 'Financial Instruments - Equity '

In [27]:
# it's a list wit all atributes
doc.sheet_names

['Document and Entity Information',
 'Consolidated Statements of Cash',
 'Consolidated Statements of Oper',
 'Consolidated Statements of Comp',
 'Consolidated Statements of Co_2',
 'Consolidated Balance Sheets',
 'Consolidated Balance Sheets (Pa',
 'Accounting Policies and Supplem',
 'Financial Instruments',
 'Leases',
 'Commitments and Contingencies',
 'Debt',
 "Stockholders' Equity",
 'Income Taxes',
 'Segment Information',
 'Accounting Policies and Suppl_2',
 'Accounting Policies and Suppl_3',
 'Financial Instruments (Tables)',
 'Leases (Tables)',
 'Commitments and Contingencies (',
 'Debt (Tables)',
 "Stockholders' Equity (Tables)",
 'Segment Information (Tables)',
 'Accounting Policies and Suppl_4',
 'Accounting Policies and Suppl_5',
 'Accounting Policies and Suppl_6',
 'Accounting Policies and Suppl_7',
 'Accounting Policies and Suppl_8',
 'Accounting Policies and Suppl_9',
 'Financial Instruments - Fair Va',
 'Financial Instruments - Contrac',
 'Financial Instruments - Equity '

In [29]:
# lest see one sheet using
pd.read_excel(doc, doc.sheet_names[1])

Unnamed: 0,Consolidated Statements of Cash Flows - USD ($) $ in Millions,3 Months Ended,Unnamed: 2,6 Months Ended,Unnamed: 4,12 Months Ended,Unnamed: 6
0,,"Jun. 30, 2020","Jun. 30, 2019","Jun. 30, 2020","Jun. 30, 2019","Jun. 30, 2020","Jun. 30, 2019"
1,Statement of Cash Flows [Abstract],,,,,,
2,"CASH, CASH EQUIVALENTS, AND RESTRICTED CASH, B...",27505,,36410,32173,22965,20536
3,OPERATING ACTIVITIES:,,,,,,
4,Net income,5243,2625,7778,6186,13180,12096
5,Adjustments to reconcile net income to net cas...,,,,,,
6,Depreciation and amortization of property and ...,5748,5202,11110,10056,22843,18097
7,Stock-based compensation,2601,1971,4358,3245,7977,6012
8,"Other operating expense (income), net",282,80,348,67,445,200
9,"Other expense (income), net",-769,-7,-204,-142,-310,152
