# Data Collection
This notebook contains the code for all of the data collection for this project. The data was collected from the SEC EDGAR database.

# Load Libraries

In [1]:
import requests
import urllib
from bs4 import BeautifulSoup
import time
from multiprocessing import Pool as ThreadPool
import pandas as pd
from json import JSONDecodeError
import time
from functools import reduce
import numpy as np
from datetime import datetime

# Functions
These functions will be used later in the notebook 

In [2]:
#This function creates the url for the daily index files that will be pulled
def make_url(base_url, comp):
  """Append every element of `comp` to `base_url`, separated by '/'.

  base_url: the url to start from (returned untouched when `comp` is empty)
  comp: iterable of path components, appended in the order given
  """
  pieces = ['{}'.format(part) for part in comp]
  if not pieces:
    return base_url
  return '/'.join(['{}'.format(base_url)] + pieces)

In [None]:
#This function uses a pool of worker threads to scrape webpages faster than a loop would
#It outputs the time it takes to pull the specified number of items
#Make sure this number stays above 1 second per 10 items or edgar will block you from scraping
def main(func, lst, num_workers):
    """Apply `func` to every element of `lst` with `num_workers` worker threads.

    func: callable taking one element of `lst`
    lst: sequence of inputs (its length is reported in the timing message)
    num_workers: number of worker threads in the pool

    Prints the elapsed wall-clock time and returns the list of results,
    in the same order as `lst`.
    """
    # Imported locally on purpose: the notebook-level name `ThreadPool` is an
    # alias for multiprocessing.Pool, which starts worker *processes* and has
    # to pickle `func`. The work here is network-bound, so real threads match
    # the intent stated above and also accept functions defined in a notebook.
    from multiprocessing.pool import ThreadPool

    t0 = time.time()
    with ThreadPool(num_workers) as pool:
        df_lst = pool.map(func, lst)
    t1 = time.time()
    print(f"{(t1-t0)} seconds to read {len(lst)} items.")
    return df_lst

In [3]:
#grabbing the daily index files
#the daily index files contain information on all documents that were filed on a particular day
file_lst = []
year_lst = []
years = range(1994, 2022, 1)
base_url = r'https://www.sec.gov/Archives/edgar/daily-index'

#walk every year directory, then every quarter listed inside it
for year in map(str, years):
  year_url = make_url(base_url, [year, 'index.json'])
  year_listing = requests.get(year_url, headers={'User-Agent': 'Mozilla/5.0'}).json()

  for quarter in year_listing['directory']['item']:
    qtr_url = make_url(base_url, [year, quarter['name'], 'index.json'])
    qtr_listing = requests.get(qtr_url, headers={'User-Agent': 'Mozilla/5.0'}).json()

    #one url per file listed for the quarter
    file_lst.extend(
      make_url(base_url, [year, quarter['name'], entry['name']])
      for entry in qtr_listing['directory']['item']
    )

In [4]:
#keeps only the urls of the master files, in their original order
master_idx = [file_url for file_url in file_lst if 'master' in file_url]

In [47]:
#this cell creates a list of dictionaries for forms that were filed for the quarters that were previously pulled
def _parse_master_index(text):
  """Extract the filing records from the decoded text of one master index file.

  The text is split on double spaces; everything before the first chunk that
  contains 'data/' is discarded. In the remaining chunks newlines are treated
  as field separators alongside '|'. Every field containing '.txt' is taken as
  a file name and the four fields in front of it as cik, company, form and date.

  Returns a list of dicts with the keys 'cik', 'company', 'form', 'date' and
  'url', or None when no chunk contains 'data/'.
  """
  chunks = text.split('  ')
  start_idx = next((pos for pos, chunk in enumerate(chunks) if 'data/' in chunk), None)
  if start_idx is None:
    return None

  records = []
  for chunk in chunks[start_idx:]:
    #chunks with a length of zero carry no file info
    if len(chunk) == 0:
      continue
    fields = chunk.replace('\n', '|').split('|')
    for pos, field in enumerate(fields):
      if '.txt' not in field:
        continue
      #a complete record needs four fields in front of the file name;
      #a shorter run cannot be mapped onto the five keys below
      if pos < 4:
        continue
      cik, company, form, date, path = fields[pos-4:pos+1]
      records.append({
        'cik': cik,
        'company': company,
        'form': form,
        'date': date,
        'url': 'https://www.sec.gov/Archives/' + path,
      })
  return records


#positions (in master_idx) of the index files that could not be decoded or parsed
error_idx=[]
master_lst=[]
for file_idx, master_url in enumerate(master_idx):
  #download each file in the master list (the url is requested without the .gz suffix)
  file_url = master_url.replace('.gz', '')
  content = requests.get(file_url, headers={'User-Agent': 'Mozilla/5.0'}).content

  #There were errors in decoding some of the files so those are skipped
  try:
    text = content.decode('utf-8')
  except UnicodeDecodeError:
    error_idx.append(file_idx)
    continue

  master_data = _parse_master_index(text)
  #no 'data/' entry at all: nothing to extract from this file
  if master_data is None:
    error_idx.append(file_idx)
    continue
  master_lst.append(master_data)

In [73]:
#pull 10-k and 10-q files from master list
#flatten the per-file lists first, then pick the two form types by exact match
all_documents = [document for master_data in master_lst for document in master_data]
quarterly_list = [document for document in all_documents if document['form'] == '10-Q']
yearly_list = [document for document in all_documents if document['form'] == '10-K']

In [49]:
#convert both to dataframes to save for later
#to_csv also writes the row index; the loading cell drops it again as "Unnamed: 0"
quarterly_df = pd.DataFrame(quarterly_list)
quarterly_df.to_csv('10-Q.csv')

#same for the 10-K records
yearly_df = pd.DataFrame(yearly_list)
yearly_df.to_csv('10-K.csv')

In [3]:
#load the saved 10-Q file info, without the index column that to_csv wrote
quarterly_df = pd.read_csv('10-Q.csv').drop("Unnamed: 0", axis=1)
quarterly_df.head()

Unnamed: 0,cik,company,date,form,url
0,275605,GENERAL HOST CORP,19940701,10-Q,https://www.sec.gov/Archives/data/40638/000095...
1,61425,GIANT FOOD INC,19940701,10-Q,https://www.sec.gov/Archives/data/41289/000004...
2,763043,UNIVAR CORP,19940628,10-Q,https://www.sec.gov/Archives/data/101929/00001...
3,800042,HARNISCHFEGER INDUSTRIES INC,19940614,10-Q,https://www.sec.gov/Archives/data/801898/00008...
4,841289,RITE AID CORP,19940628,10-Q,https://www.sec.gov/Archives/data/84129/000089...


In [4]:
#turn the dataframe back into one dictionary per filing
quarterly_list = quarterly_df.to_dict(orient='records')
quarterly_list[:5]

[{'cik': 275605,
  'company': 'GENERAL HOST CORP',
  'date': 19940701,
  'form': '10-Q',
  'url': 'https://www.sec.gov/Archives/data/40638/0000950124-94-001209.txt'},
 {'cik': 61425,
  'company': 'GIANT FOOD INC',
  'date': 19940701,
  'form': '10-Q',
  'url': 'https://www.sec.gov/Archives/data/41289/0000041289-94-000003.txt'},
 {'cik': 763043,
  'company': 'UNIVAR CORP',
  'date': 19940628,
  'form': '10-Q',
  'url': 'https://www.sec.gov/Archives/data/101929/0000101929-94-000016.txt'},
 {'cik': 800042,
  'company': 'HARNISCHFEGER INDUSTRIES INC',
  'date': 19940614,
  'form': '10-Q',
  'url': 'https://www.sec.gov/Archives/data/801898/0000801898-94-000010.txt'},
 {'cik': 841289,
  'company': 'RITE AID CORP',
  'date': 19940628,
  'form': '10-Q',
  'url': 'https://www.sec.gov/Archives/data/84129/0000893220-94-000311.txt'}]

In [7]:
#grab 1/8 of list for time purposes
#the slice bounds are hard-coded; the length check in the next cell shows 80394 records
#NOTE(review): presumably this is the chunk the "4" in later file names refers to - confirm
quarterly_4=quarterly_list[482004:562398]

In [8]:
#number of filings in this chunk
len(quarterly_4)

80394

In [18]:
#This function grabs the xml link that has a summary of all of the financial statements in the report
def get_xml(dictionary):
  """Look up the FilingSummary.xml link of one filing.

  dictionary: filing record with the keys 'cik', 'date' and 'url' (link to the
    filing's .txt file)

  Returns {'cik', 'date', 'xml'} when the filing's index.json lists a
  FilingSummary.xml. In every other case (url without 'data/', failed request,
  unreadable or unexpected JSON, no summary file) only {'cik'} is returned;
  the calling cell tells the two apart by the number of keys.
  """
  #pause before every lookup so the pooled workers stay under the request rate limit
  time.sleep(0.7)
  base_url = 'https://www.sec.gov'
  base_dict = {'cik': dictionary['cik']}
  if 'data/' not in dictionary['url']:
    return base_dict

  #the filing folder is the .txt url without dashes; index.json lists its files
  documents_url = dictionary['url'].replace('-','').replace('.txt', '/index.json')
  try:
    content = requests.get(documents_url, headers={'User-Agent': 'Mozilla/5.0'}).json()
    directory = content['directory']
    items = directory['item']
  except (requests.RequestException, ValueError, KeyError, TypeError):
    #a single bad response must not abort the whole pooled run;
    #ValueError covers the JSON decode errors raised by .json()
    return base_dict

  #loop through the documents_url listing to find the xml summary link
  for file in items:
    if file['name'] == 'FilingSummary.xml':
      return {
        'cik': dictionary['cik'],
        'date': dictionary['date'],
        'xml': base_url + directory['name'] + '/' + file['name'],
      }
  return base_dict

In [20]:
#run main function
#9 pooled workers; every get_xml call sleeps 0.7 seconds before its request
xml_list = main(get_xml, quarterly_4, 9)

10593.458915233612 seconds to read 80394 items.


In [22]:
#sometimes the url won't read correctly; only complete results carry all three keys
new_xml = [entry for entry in xml_list if len(entry) == 3]
len(new_xml)

76391

In [None]:
#save as csv for later
#the unfiltered list is saved; entries without an xml link are removed again after loading
#the file name matches the one the loading cell reads
xml_qtr = pd.DataFrame(xml_list)
xml_qtr.to_csv('10-Q_xml_4.csv')

In [4]:
#load the saved xml links, without the index column that to_csv wrote
xml_qtr_4 = pd.read_csv('10-Q_xml_4.csv').drop("Unnamed: 0", axis=1)
xml_qtr_4.head()

Unnamed: 0,cik,date,xml
0,1084048,20161109.0,https://www.sec.gov/Archives/edgar/data/108404...
1,1084961,20161109.0,https://www.sec.gov/Archives/edgar/data/108496...
2,1087934,20161109.0,https://www.sec.gov/Archives/edgar/data/108793...
3,1091171,20161109.0,https://www.sec.gov/Archives/edgar/data/109117...
4,1092289,20161109.0,https://www.sec.gov/Archives/edgar/data/109228...


In [6]:
#turn the dataframe back into one dictionary per filing
xml_list_4 = xml_qtr_4.to_dict(orient='records')
xml_list_4[:5]

[{'cik': 1084048,
  'date': 20161109.0,
  'xml': 'https://www.sec.gov/Archives/edgar/data/1084048/000108404816000026/FilingSummary.xml'},
 {'cik': 1084961,
  'date': 20161109.0,
  'xml': 'https://www.sec.gov/Archives/edgar/data/1084961/000108496116000226/FilingSummary.xml'},
 {'cik': 1087934,
  'date': 20161109.0,
  'xml': 'https://www.sec.gov/Archives/edgar/data/1087934/000156459016028621/FilingSummary.xml'},
 {'cik': 1091171,
  'date': 20161109.0,
  'xml': 'https://www.sec.gov/Archives/edgar/data/1091171/000109117116000307/FilingSummary.xml'},
 {'cik': 1092289,
  'date': 20161109.0,
  'xml': 'https://www.sec.gov/Archives/edgar/data/1092289/000156459016028702/FilingSummary.xml'}]

In [7]:
#Same filter as before saving, for data loaded from csv: rows without a link have a non-string 'xml' value
xml_new = [entry for entry in xml_list_4 if type(entry['xml']) is str]
len(xml_new)

77541

In [25]:
#this function creates a master reports list
#it uses keywords to look for balance sheets, income statements, and statements of cashflows
def get_reports(dictionary):
    """Collect the statement reports of interest from one filing summary.

    dictionary: record with the keys 'cik', 'date' and 'xml' (link to the
      filing's FilingSummary.xml)

    Returns a list of dicts with the keys 'cik', 'date', 'short_name',
    'long_name' and 'url', one per report whose short name matches a wanted
    keyword and none of the ignored ones. When the summary cannot be fetched
    or has no report section, the dict {'cik': ...} is returned instead; the
    calling cell tells the two apart by type.
    """
    #short names containing one of these are kept ...
    keyword_list= ['balance sheet', 'balance sheets', 'income statement','statement of income',
                   'statement of comprehensive income', 'statements of income',
                   'statements of comprehensive income','cash flows']
    #... unless they also contain one of these
    keyword_list1=['derivative', 'parenthetical', 'note', 'notes', 'detail']

    time.sleep(0.5)
    base_dict = {'cik': dictionary['cik']}
    #statement links live in the same folder as the xml summary
    base_url = dictionary['xml'].replace('FilingSummary.xml', '')

    try:
        content = requests.get(dictionary['xml'], headers={'User-Agent': 'Chrome/39.0.2171.95'}).content
    except requests.RequestException:
        #treated like a summary that could not be read; one failure must not abort the pooled run
        return base_dict
    soup = BeautifulSoup(content, 'lxml')

    reports = soup.find('myreports')
    #to account for the link not reading correctly
    if reports is None:
        return base_dict

    master_reports = []
    #the last report entry is deliberately left out
    #NOTE(review): presumably it is an aggregate "all reports" entry - confirm
    for report in reports.find_all('report')[:-1]:
        if report.shortname is None:
            continue
        short_name = report.shortname.text.lower()
        if any(ele in short_name for ele in keyword_list1):
            continue
        if not any(ele in short_name for ele in keyword_list):
            continue
        #a record needs both the long name and the html file name
        if report.longname is None or report.htmlfilename is None:
            continue
        master_reports.append({
            'cik': dictionary['cik'],
            'date': dictionary['date'],
            'short_name': short_name,
            'long_name': report.longname.text.lower(),
            'url': base_url + report.htmlfilename.text,
        })
    return master_reports

In [26]:
#collect the statement links for every filing that has an xml summary
#new_xml is the list filtered right after the get_xml run; the list re-loaded from csv is named xml_new
master_reports = main(get_reports, new_xml, 9)

8129.779483556747 seconds to read 76391 items.


In [39]:
#similar to the xml list, this drops results that were not read in correctly:
#only list results are kept, and their reports are flattened into one list
new_master = [
    report
    for result in master_reports
    if type(result) is list
    for report in result
]
new_master[0:5]

[{'cik': 814586,
  'date': 20121114,
  'long_name': '00200 - statement - consolidated statements of income and comprehensive income',
  'short_name': 'consolidated statements of income and comprehensive income',
  'url': 'https://www.sec.gov/Archives/edgar/data/814586/000107261312000663/R4.htm'},
 {'cik': 814586,
  'date': 20121114,
  'long_name': '00400 - statement - consolidated statements of cash flows',
  'short_name': 'consolidated statements of cash flows',
  'url': 'https://www.sec.gov/Archives/edgar/data/814586/000107261312000663/R7.htm'},
 {'cik': 814926,
  'date': 20121114,
  'long_name': '000020 - statement - consolidated balance sheets',
  'short_name': 'consolidated balance sheets',
  'url': 'https://www.sec.gov/Archives/edgar/data/814926/000093980212000267/R2.htm'},
 {'cik': 814926,
  'date': 20121114,
  'long_name': '000050 - statement - consolidated statements of cash flows',
  'short_name': 'consolidated statements of cash flows',
  'url': 'https://www.sec.gov/Archives

In [41]:
#save for later
#one row per selected report (cik, date, names and url)
reports_qtr = pd.DataFrame(new_master)
reports_qtr.to_csv('10-Q_reports.csv')

In [60]:
#this function uses the links in the master reports list to get the data in the reports
def get_data(dictionary):
    """Download one statement page and sort the rows of its first table.

    dictionary: report record with the keys 'cik' and 'url'

    Returns a dict with the keys
      'cik'      - copied from the input
      'headers'  - one list of cell texts per row that has <th> cells
                   (document title, scale of reporting, and dates)
      'sections' - text of the first cell of every row that has no <th> but
                   contains <strong> (the main sections of the report)
      'data'     - one list of cell texts per remaining row
                   (numerical values and subsections)
    All three lists stay empty when the page cannot be fetched or has no table.
    """
    time.sleep(0.7)
    statement_data = {
        'cik': dictionary['cik'],
        'headers': [],
        'sections': [],
        'data': [],
    }

    try:
        content = requests.get(dictionary['url'], headers={'User-Agent': 'Mozilla/5.0'}).content
    except requests.RequestException:
        #treated like a page without a table; one failure must not abort the pooled run
        return statement_data

    table = BeautifulSoup(content, 'lxml').table
    if table is None:
        return statement_data

    for row in table.find_all('tr'):
        header_cells = row.find_all('th')
        cols = row.find_all('td')
        #these branches put the rows of the report in the right place in the dictionary
        if header_cells:
            statement_data['headers'].append([ele.text.strip() for ele in header_cells])
        elif row.find_all('strong'):
            #a bold row without any <td> has no section title to record
            if cols:
                statement_data['sections'].append(cols[0].text.strip())
        else:
            statement_data['data'].append([ele.text.strip() for ele in cols])
    return statement_data

In [79]:
#making it shorter for time purposes
#hard-coded slice of the report list; its length is displayed below
new_master1 = new_master[101786:203571]
len(new_master1)

101785

In [None]:
#download and split every selected report page (9 pooled workers, each get_data call sleeps 0.7 seconds)
statements_data=main(get_data, new_master1, 9)

In [83]:
#setting the scale based on the values listed in headers
for statement in statements_data:
    #statements without any header row get no 'scale' key
    if len(statement['headers'])==0:
        continue
    #compared case-insensitively: the report-sorting cell lowercases this same header
    #cell in place, so a case-sensitive match would label everything 'ones' if this
    #cell were re-run afterwards
    title = statement['headers'][0][0].lower()
    if 'thousands' in title:
        statement['scale']='thousands'
    elif 'millions' in title:
        statement['scale']='millions'
    else:
        statement['scale']='ones'

In [84]:
#sorting the reports by report type
#the first header cell is lowercased in place and matched against one keyword per statement type
bs_data=[]
is_data=[]
cf_data=[]
for statement in statements_data:
    if not statement['headers']:
        continue
    title = statement['headers'][0][0].lower()
    statement['headers'][0][0] = title
    #first matching keyword wins, in this order
    for keyword, bucket in (('balance', bs_data), ('income', is_data), ('cash', cf_data)):
        if keyword in title:
            bucket.append(statement)
            break
print('Number of balance sheets: '+ str(len(bs_data)))
print('Number of income statements: '+ str(len(is_data)))
print('Number of statements of cash flows: ' +str(len(cf_data)))

Number of balance sheets: 39116
Number of income statements: 24544
Number of statements of cash flows: 38122


In [85]:
#creating a list of two row datasets for each individual balance sheet
bs_lst=[]
for statement in bs_data:
    headers = statement['headers'][0]
    #only statements whose first header row has exactly three cells (title plus two dates) are used
    if len(headers) != 3:
        continue

    totals=[]
    for row in statement['data']:
        #rows without any cell have no label to inspect
        if len(row) == 0:
            continue
        #manipulating the label to make the sections more uniform
        label = row[0].lower().replace("'", "").replace("’", '')
        label = label.replace('stockholders', 'shareholders').replace('deficit', 'equity')
        row[0] = label
        #searching for the three main categories and renaming them for uniformity sake
        if 'total assets' in label:
            row[0] = 'total assets'
        elif 'total liabilities' in label:
            #the combined "total liabilities and ... equity" line is not total liabilities;
            #relabelling it would record a wrong value when no separate liabilities line exists
            if 'equity' in label or 'total liabilities and' in label:
                continue
            row[0] = 'total liabilities'
        elif ('total' in label) and ('shareholders equity' in label):
            row[0] = 'total shareholders equity'
        else:
            continue
        totals.append(row)

    #all three categories must be present (repeated labels do not count twice)
    if len({row[0] for row in totals}) < 3:
        continue
    #the first total needs a label and two values
    if len(totals[0]) < 3:
        continue

    df = pd.DataFrame(totals)
    df = df.set_index(0).T
    #more than two value columns do not fit the two header dates
    if len(df) > 2:
        continue
    #get dates, scale and cik for dataframe
    df.insert(0, 'dates', headers[1:3])
    df.insert(0, 'scale', [statement['scale']]*2)
    df.insert(0, 'cik', [statement['cik']]*2)
    #when a category was matched more than once, keep its first occurrence
    df = df.loc[:,~df.columns.duplicated()]
    bs_lst.append(df)

In [86]:
def concat_df(df1, df2):
    """Stack `df2` below `df1` with pd.concat and return the result (row labels are kept)."""
    stacked = pd.concat((df1, df2))
    return stacked

In [87]:
#concatenate all dataframes in the list
#a single concat call avoids re-copying the growing frame once per list element;
#ignore_index gives the same fresh 0..n-1 row index as resetting and dropping the old one
bs_df = pd.concat(bs_lst, ignore_index=True)
bs_df.head()

Unnamed: 0,cik,dates,scale,total assets,total liabilities,total shareholders equity
0,8858,"Dec. 27, 2014",thousands,"11,491,094us-gaap_Assets","6,786,164us-gaap_Liabilities","4,704,930us-gaap_StockholdersEquity"
1,8858,"Jun. 28, 2014",thousands,"11,255,517us-gaap_Assets","6,365,324us-gaap_Liabilities","4,890,193us-gaap_StockholdersEquity"
2,1023731,"Dec. 31, 2014",thousands,"308,018us-gaap_Assets","22,237us-gaap_Liabilities","285,781us-gaap_StockholdersEquity"
3,1023731,"Mar. 31, 2014",thousands,"299,203us-gaap_Assets","21,025us-gaap_Liabilities","278,178us-gaap_StockholdersEquity"
4,1040130,"Dec. 31, 2014",thousands,"79,842us-gaap_Assets","6,295us-gaap_Liabilities","73,547us-gaap_StockholdersEquity"


In [88]:
#save the balance sheet totals as csv
bs_df.to_csv('bs_df4.csv')

In [94]:
#this loop puts all income statements into a list of dataframes
#the format is the same as for the balance sheets but has code specific for the information found in an income statement
is_lst=[]
for statement in is_data:
    #only statements whose first header row has exactly three cells are used
    if len(statement['headers'][0]) != 3:
        continue

    totals=[]
    #NOTE(review): only the first three data rows are searched - confirm this is intended
    for row in statement['data'][0:3]:
        #rows without any cell have no label to inspect
        if len(row) == 0:
            continue
        row[0] = row[0].lower().replace("loss", "income")
        #looking for net income and renaming
        if ('net' in row[0]) and ('income' in row[0]):
            row[0] = 'net income'
            totals.append(row[0:3])

    if len(totals) == 0:
        continue
    #the first match needs a label and two values
    if len(totals[0]) < 3:
        continue
    #the dates are read from a second header row
    if len(statement['headers']) != 2:
        continue
    dates = statement['headers'][1][0:2]
    #two dates are needed for the two value rows (same guard as the cash flow cell)
    if len(dates) != 2:
        continue

    df = pd.DataFrame(totals)
    df = df.set_index(0).T
    if len(df) > 2:
        continue
    df.insert(0, 'dates', dates)
    df.insert(0, 'scale', [statement['scale']]*2)
    df.insert(0, 'cik', [statement['cik']]*2)
    #when several rows matched, keep the first 'net income' column
    df = df.loc[:,~df.columns.duplicated()]
    is_lst.append(df)

In [95]:
#concatenating dataframes
#a single concat call avoids re-copying the growing frame once per list element;
#ignore_index gives the same fresh 0..n-1 row index as resetting and dropping the old one
is_df = pd.concat(is_lst, ignore_index=True)
is_df.head()

Unnamed: 0,cik,scale,dates,net income
0,8858,thousands,"Dec. 27, 2014","$ 163,706us-gaap_NetIncomeLoss"
1,8858,thousands,"Dec. 28, 2013","$ 124,864us-gaap_NetIncomeLoss"
2,1023731,thousands,"Dec. 31, 2014",$ 444us-gaap_NetIncomeLoss
3,1023731,thousands,"Dec. 31, 2013",$ 89us-gaap_NetIncomeLoss
4,789019,millions,"Dec. 31, 2014","$ 5,863us-gaap_NetIncomeLoss"


In [96]:
#save the income statement data as csv
is_df.to_csv('is_df4.csv')

In [98]:
#this loop is the same as the previous two in format but specific to statements of cashflows
cf_lst=[]
for statement in cf_data:
    totals=[]
    for row in statement['data']:
        #rows without any cell have no label to inspect
        if len(row) == 0:
            continue
        row[0] = row[0].lower()
        #looking for the cash equivalents line whose label mentions 'end' and renaming it
        if ('cash equivalents' in row[0]) and ('end' in row[0]):
            row[0] = 'cash equivalents'
            totals.append(row)

    if len(totals) == 0:
        continue
    #the first match needs a label and two values
    if len(totals[0]) < 3:
        continue
    #the dates are read from a second header row
    if len(statement['headers']) != 2:
        continue
    dates = statement['headers'][1][0:2]
    if len(dates) != 2:
        continue

    df = pd.DataFrame(totals)
    df = df.set_index(0).T
    #more than two value columns do not fit the two dates
    if len(df) > 2:
        continue
    df.insert(0, 'dates', dates)
    df.insert(0, 'scale', [statement['scale']]*2)
    df.insert(0, 'cik', [statement['cik']]*2)
    #when several rows matched, keep the first 'cash equivalents' column
    df = df.loc[:,~df.columns.duplicated()]
    cf_lst.append(df)

In [99]:
#concatenating dataframes
#a single concat call avoids re-copying the growing frame once per list element;
#ignore_index gives the same fresh 0..n-1 row index as resetting and dropping the old one
cf_df = pd.concat(cf_lst, ignore_index=True)
cf_df.head()

Unnamed: 0,cik,scale,dates,cash equivalents
0,319201,thousands,"Dec. 31, 2014","584,865us-gaap_CashAndCashEquivalentsAtCarryin..."
1,319201,thousands,"Dec. 31, 2013","793,382us-gaap_CashAndCashEquivalentsAtCarryin..."
2,1023731,thousands,"Dec. 31, 2014","52,598us-gaap_CashAndCashEquivalentsAtCarrying..."
3,1023731,thousands,"Dec. 31, 2013","172,114us-gaap_CashAndCashEquivalentsAtCarryin..."
4,1040130,thousands,"Dec. 31, 2014","37,068us-gaap_CashAndCashEquivalentsAtCarrying..."


In [100]:
#save the cash flow data as csv
cf_df.to_csv('cf_df4.csv')