# Scraping SEC Website for Financials

In [1]:
pip install python-edgar

Collecting python-edgar
  Downloading python_edgar-3.1.3-py3-none-any.whl (8.6 kB)
Installing collected packages: python-edgar
Note: you may need to restart the kernel to use updated packages.
Successfully installed python-edgar-3.1.3


In [1]:
import edgar
import pandas as pd

import requests
import time

#download sec edgar index info (refer to files in same direxctory)
edgar.download_index("C:\\Users\\605089\\Desktop\\Personal Projects\\Financial", 2012, "your_name your@email.address", skip_all_present_except_last=False)

In [2]:
#read info from a selected year and quarter into a dataframe
csv = pd.read_csv('2019-QTR2.tsv', sep='\t', lineterminator='\n', names=None)

csv.columns.values[0] = 'Item'
csv

Unnamed: 0,Item
0,1000045|NICHOLAS FINANCIAL INC|4/A|2019-04-16|...
1,1000045|NICHOLAS FINANCIAL INC|4|2019-04-15|ed...
2,1000045|NICHOLAS FINANCIAL INC|4|2019-05-16|ed...
3,1000045|NICHOLAS FINANCIAL INC|4|2019-05-31|ed...
4,1000045|NICHOLAS FINANCIAL INC|4|2019-06-03|ed...
...,...
249579,9984|BARNES GROUP INC|4|2019-06-12|edgar/data/...
249580,9984|BARNES GROUP INC|8-K/A|2019-05-07|edgar/d...
249581,9984|BARNES GROUP INC|8-K|2019-04-26|edgar/dat...
249582,9984|BARNES GROUP INC|8-K|2019-05-06|edgar/dat...


In [5]:
#what company you're interested in (refer to index itself for precise name)
selectedcompany = 'Booz Allen Hamilton Holding Corp'

#what document you're interested in
selectedreport = '10-K'

#select the item in which the company and document you are interested in is located
companyreport = csv[(csv['Item'].str.contains(selectedcompany)) & (csv['Item'].str.contains(selectedreport))]
companyreport

Unnamed: 0,Item
84591,1443646|Booz Allen Hamilton Holding Corp|10-K|...


In [4]:
#split the item into its components
Filing = companyreport['Item'].str.split('|')
Filing = Filing.to_list()
Filing

[['1443646',
  'Booz Allen Hamilton Holding Corp',
  '10-K',
  '2019-05-28',
  'edgar/data/1443646/0001443646-19-000093.txt',
  'edgar/data/1443646/0001443646-19-000093-index.html\r']]

In [37]:
#search for the html info within the item
for item in Filing[0]:
    item = item.replace('\r', '')
    if 'html' in item:
        report = item
    
report

'edgar/data/1443646/0001443646-19-000093-index.html'

In [38]:
#use the html info and the base url to get a url for the document's location
url = 'https://www.sec.gov/Archives/' + report
url

'https://www.sec.gov/Archives/edgar/data/1443646/0001443646-19-000093-index.html'

## Initial Scrub of SEC Website

In [58]:
import requests
import time

#define user-agent of browser
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.62"}

#get the information from the url
r_documents = requests.get(url, headers=headers)
time.sleep(10)

#check if status code is 200 (success)
r_documents.status_code


b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n<meta http-equiv="Last-Modified" content="Tue, 28 May 2019 11:08:00 GMT" />\n<title>EDGAR Filing Documents for 0001443646-19-000093</title>\n<link rel="stylesheet" type="text/css" href="/include/interactive.css" />\n</head>\n<body style="margin: 0">\n<!-- SEC Web Analytics - For information please visit: https://www.sec.gov/privacy.htm#collectedinfo -->\n<noscript><iframe src="//www.googletagmanager.com/ns.html?id=GTM-TD3BKV"\nheight="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>\n<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({\'gtm.start\':\nnew Date().getTime(),event:\'gtm.js\'});var f=d.getElementsByTagName(s)[0],\nj=d.createElement(s),dl=l!=\'dataLayer\'?\'&l=\'+l:\'\';j.async=true;j.src=\n\'//www.googletagmanager.com/gtm.

In [78]:
#store the html from the website containing the list of relevant documents from the asked for quarter
document_df = pd.read_html(r_documents.content)

#store the first table from the url into a dataframe
document_index = document_df[0]
document_index = document_index.dropna()

#from the above table, pull out the row that contains our selected report 
document_row = document_index[document_index['Description'].str.contains(selectedreport)]

#get the document's name that allows us to specify its url
document_name = document_row['Document'].str.split(' ')[0][0]
document_name

Unnamed: 0,Seq,Description,Document,Type,Size
0,1.0,10-K,bah-20190331x10k.htm,10-K,2949655


In [90]:
#get the url for the document

print(report)
report_formatted = report.replace('-', '').replace('index.html', '')
print(report_formatted)
url_document = 'https://www.sec.gov/Archives/'+report_formatted + '/' + document_name
print(url_document)

edgar/data/1443646/0001443646-19-000093-index.html
edgar/data/1443646/000144364619000093
https://www.sec.gov/Archives/edgar/data/1443646/000144364619000093/bah-20190331x10k.htm


Now that we have the url for the document we want to scrape, let us put the HTML information of this document into a Pandas dataframe

## Store Document in Dataframe

In [148]:
#establish your systems user-agent for accessing website
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.62"}

#run a request
r = requests.get(url_document, headers=headers)
#time.sleep(10)

#store the content in a dataframe
filing_df = pd.read_html(r.content)
filing_df

[    0   1   2   3   4
 0 NaN NaN NaN NaN NaN
 1 NaN NaN NaN NaN NaN
 2 NaN NaN NaN NaN NaN,
      0  \
 0  NaN   
 1    ý   
 
                                                                                       1  
 0                                                                                   NaN  
 1  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934  ,
      0  \
 0  NaN   
 1    ¨   
 
                                                                                           1  
 0                                                                                       NaN  
 1  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934  ,
                                                                0   1  \
 0                                                            NaN NaN   
 1                                                            NaN NaN   
 2                                                       D

### Balance Sheet

In [113]:
#find the table corresponding to the Balance sheet in the html and store it in a dataframe
for item in filing_df:
    BS = (item[0].astype(str).str.contains('Retained')) #| item[0].astype(str).str.contains('Prepaid'))
    if BS.any():
        Balance_Sheet = item
        
Balance_Sheet

Unnamed: 0,0,1,2,3,4,5,6,7
0,BOOZ ALLEN HAMILTON HOLDING CORPORATIONCONSOLIDATED BALANCE SHEETS,BOOZ ALLEN HAMILTON HOLDING CORPORATIONCONSOLIDATED BALANCE SHEETS,BOOZ ALLEN HAMILTON HOLDING CORPORATIONCONSOLIDATED BALANCE SHEETS,BOOZ ALLEN HAMILTON HOLDING CORPORATIONCONSOLIDATED BALANCE SHEETS,BOOZ ALLEN HAMILTON HOLDING CORPORATIONCONSOLIDATED BALANCE SHEETS,BOOZ ALLEN HAMILTON HOLDING CORPORATIONCONSOLIDATED BALANCE SHEETS,BOOZ ALLEN HAMILTON HOLDING CORPORATIONCONSOLIDATED BALANCE SHEETS,BOOZ ALLEN HAMILTON HOLDING CORPORATIONCONSOLIDATED BALANCE SHEETS
1,,,,,,,,
2,,"March 31, 2019","March 31, 2019","March 31, 2019",,"March 31, 2018","March 31, 2018","March 31, 2018"
3,,"(Amounts in thousands, exceptshare and per share data)","(Amounts in thousands, exceptshare and per share data)","(Amounts in thousands, exceptshare and per share data)","(Amounts in thousands, exceptshare and per share data)","(Amounts in thousands, exceptshare and per share data)","(Amounts in thousands, exceptshare and per share data)","(Amounts in thousands, exceptshare and per share data)"
4,ASSETS,,,,,,,
5,Current assets:,,,,,,,
6,Cash and cash equivalents,$,283990,,,$,286958,
7,"Accounts receivable, net of allowance",1330364,1330364,,,1133705,1133705,
8,Prepaid expenses and other current assets,84986,84986,,,71309,71309,
9,Total current assets,1699340,1699340,,,1491972,1491972,


In [132]:
B_sheet = Balance_Sheet.iloc[2:,[0,2,6]]
B_sheet

Unnamed: 0,0,2,6
2,,"March 31, 2019","March 31, 2018"
3,,"(Amounts in thousands, exceptshare and per share data)","(Amounts in thousands, exceptshare and per share data)"
4,ASSETS,,
5,Current assets:,,
6,Cash and cash equivalents,283990,286958
7,"Accounts receivable, net of allowance",1330364,1133705
8,Prepaid expenses and other current assets,84986,71309
9,Total current assets,1699340,1491972
10,"Property and equipment, net of accumulated depreciation",172453,152364
11,"Intangible assets, net of accumulated amortization",287051,278504


In [146]:
#take only the columns of interest (some may be repeating)
B_sheet = Balance_Sheet.iloc[2:,[0,2,6]]

#create a new header using the top most row
header = B_sheet.iloc[0]

#remove the top row and move it to the header
B_sheet = B_sheet[1:]
B_sheet.columns = header

#create a new header for the items column
B_sheet.columns.values[0] = 'Item'

#remove an items that don't exist
B_sheet = B_sheet[B_sheet['Item'].notna()]

B_sheet

2,Item,"March 31, 2019","March 31, 2018"
4,ASSETS,,
5,Current assets:,,
6,Cash and cash equivalents,283990,286958
7,"Accounts receivable, net of allowance",1330364,1133705
8,Prepaid expenses and other current assets,84986,71309
9,Total current assets,1699340,1491972
10,"Property and equipment, net of accumulated depreciation",172453,152364
11,"Intangible assets, net of accumulated amortization",287051,278504
12,Goodwill,1581160,1581146
13,Other long-term assets,91837,102633


Clean the dataframe

In [147]:
#convert accounting numbers into strings
B_sheet[B_sheet.columns[1:]] = B_sheet[B_sheet.columns[1:]].astype(str)

#change ( into a negative number
B_sheet[B_sheet.columns[1]] = B_sheet[B_sheet.columns[1]].map(lambda x: x.replace('(', '-'))
B_sheet[B_sheet.columns[2]] = B_sheet[B_sheet.columns[2]].map(lambda x: x.replace('(', '-'))


#get rid of commas in large numbers
B_sheet[B_sheet.columns[1]] = B_sheet[B_sheet.columns[1]].map(lambda x: x.replace(',', ''))
B_sheet[B_sheet.columns[2]] = B_sheet[B_sheet.columns[2]].map(lambda x: x.replace(',', ''))

#more cleaning
B_sheet[B_sheet.columns[1]] = B_sheet[B_sheet.columns[1]].map(lambda x: x.replace('-', '0'))
B_sheet[B_sheet.columns[2]] = B_sheet[B_sheet.columns[2]].map(lambda x: x.replace('-', '0'))


#convert to float
B_sheet[B_sheet.columns[1:]] = B_sheet[B_sheet.columns[1:]].astype(float)
B_sheet

2,Item,"March 31, 2019","March 31, 2018"
4,ASSETS,,
5,Current assets:,,
6,Cash and cash equivalents,283990.0,286958.0
7,"Accounts receivable, net of allowance",1330364.0,1133705.0
8,Prepaid expenses and other current assets,84986.0,71309.0
9,Total current assets,1699340.0,1491972.0
10,"Property and equipment, net of accumulated depreciation",172453.0,152364.0
11,"Intangible assets, net of accumulated amortization",287051.0,278504.0
12,Goodwill,1581160.0,1581146.0
13,Other long-term assets,91837.0,102633.0
