# PHASE 1: DATA ACQUISITION AND INGESTION
## DATA ACQUISITION

The purpose of this module is to  
1. Download the Financial Statement Data Sets from the SEC website: https://www.sec.gov/dera/data/financial-statement-data-sets.html  
2. Unzip the data and store locally.

In [1]:
# to scrape the SEC page
from bs4 import BeautifulSoup

# to download the zip file data
import requests

# to extract the zip files
import zipfile

# to interpret the zip file stream from requests
# https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url
import io

# to work with paths
import os

In [2]:
# Resolve hrefs against the page URL (handles both relative and absolute links).
from urllib.parse import urljoin

# Open the SEC website
page_url = 'https://www.sec.gov/dera/data/financial-statement-data-sets.html'
# NOTE(review): sec.gov may reject requests that lack a descriptive User-Agent
# header -- confirm, and pass headers=... here and below if required.
r = requests.get(page_url, timeout=60)
# Fail loudly on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
html_doc = r.text

# Local folder holding one sub-folder per year-quarter. It is created up front
# so that the duplicate check below also works on the very first run.
datasets_dir = './SEC_Datasets'
os.makedirs(datasets_dir, exist_ok=True)
# Read the folder once rather than once per hyperlink.
already_downloaded = set(os.listdir(datasets_dir))

# To keep track of all year-quarters (eg 2014q3) discovered
list_of_year_quarters = []

# Find all hyperlinks pointing to .zip files.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(html_doc, 'html.parser')
for a in soup.find_all('a', href=True):
    if '.zip' in a['href']:
        zip_file_url = urljoin(page_url, a['href'])
        # The six characters before '.zip' are used as the year-quarter label,
        # eg '.../2014q3.zip' -> '2014q3'.
        year_quarter = zip_file_url[-10:-4]

        # A link repeated on the page must not be ingested twice later on.
        if year_quarter in list_of_year_quarters:
            continue
        list_of_year_quarters.append(year_quarter)

        # Avoid duplicate downloading
        if year_quarter not in already_downloaded:
            # Download the archive into memory, then extract it.
            zip_data = requests.get(zip_file_url, timeout=60)
            # Surface a failed download as an HTTP error rather than a BadZipFile.
            zip_data.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(zip_data.content), 'r') as zip_data_file:
                zip_data_file.extractall('./SEC_Datasets/'+year_quarter)

## DATA INGESTION

The purpose of this module is to  
1. Read the raw .txt data files, acquired from SEC.gov, into Pandas DataFrames.
2. Merge all quarterly files into a dictionary of DataFrames.
3. Pickle this dictionary for future use.
4. Generate and pickle a list of the company names and states present in this dataset. This will form the input for the data gathering module.

In [3]:
# to work with data
import pandas as pd

# to write dictionary to disk
import pickle

In [4]:
# Read files into Pandas
# The desired object is a DICTIONARY of DATAFRAMES. Each DATAFRAME consists of all quarterly
# files appended. eg. DataFrames['NUM'] yields a large dataframe with all NUM data across
# all datasets downloaded.

data_file_names = ['NUM','PRE','SUB','TAG']

# to avoid duplicate effort during editing.
# NOTE: the pickle is a local cache written by this notebook; never load a
# pickle that comes from an untrusted source.
if 'dict_of_dfs_num_pre_sub_tag.p' in os.listdir('.'):
    with open('dict_of_dfs_num_pre_sub_tag.p', 'rb') as picklefile:
        DataFrames = pickle.load(picklefile)
else:
    DataFrames = {}


def append_to_df_dict(txt_filepath, filename, dict_of_df=DataFrames):
    """Read one SEC data txt-file and add its rows to a dictionary of dataframes.

    Parameters
    ----------
    txt_filepath : str
        Path to a tab-separated SEC data file whose first line is the header.
    filename : str
        Key in ``dict_of_df`` under which the rows are accumulated
        (eg 'NUM', 'PRE', 'SUB' or 'TAG').
    dict_of_df : dict
        Dictionary of dataframes, updated in place. Defaults to the
        module-level ``DataFrames`` dictionary.

    Returns
    -------
    None
        The result is stored in ``dict_of_df[filename]``.
    """
    df = pd.read_csv(txt_filepath,
                     sep='\t',
                     # This analyzes the entire file in 1 pass, improving accuracy of formatting etc.
                     low_memory=False,
                     header=0,
                     # Although the docs indicate utf-8 encoding, there exist non-compliant characters.
                     # Stack Overflow recommended trying latin-1, which works.
                     encoding='latin-1')

    if filename in dict_of_df:
        # Concatenation returns a NEW dataframe, so the result must be stored
        # back into the dictionary; otherwise only the first file is kept.
        # ignore_index gives the combined frame one continuous row index.
        dict_of_df[filename] = pd.concat([dict_of_df[filename], df],
                                         ignore_index=True)
    else:
        dict_of_df[filename] = df
    print('processed ' + txt_filepath)

# Skip the expensive parsing step when every dataset is already in memory.
if len(DataFrames) == len(data_file_names):
    print('Files already processed')
else:
    # Load each dataset of each discovered year-quarter.
    for qtr in list_of_year_quarters:
        for name in data_file_names:
            filepath = "./SEC_Datasets/{}/{}.txt".format(qtr, name)
            append_to_df_dict(filepath, name)

Files already processed


In [5]:
# Preview the first 10 rows of the SUB dataframe.
DataFrames['SUB'].head(10)

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks
0,0000002178-18-000067,2178,"ADAMS RESOURCES & ENERGY, INC.",5172.0,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,,...,20180930,2018.0,Q3,20181107,2018-11-07 16:28:00.0,0,1,ae-20180930_htm.xml,1,
1,0000002488-18-000189,2488,ADVANCED MICRO DEVICES INC,3674.0,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,,...,20180930,2018.0,Q3,20181031,2018-10-31 16:15:00.0,0,1,amd-20180929.xml,1,
2,0000002969-18-000044,2969,AIR PRODUCTS & CHEMICALS INC /DE/,2810.0,US,PA,ALLENTOWN,18195-1501,7201 HAMILTON BLVD,,...,20180930,2018.0,FY,20181120,2018-11-20 14:48:00.0,0,1,apd-10xkx30sep2018_htm.xml,1,
3,0000003499-18-000023,3499,ALEXANDERS INC,6798.0,US,NJ,PARAMUS,07652,210 ROUTE 4 EAST,,...,20180930,2018.0,Q3,20181029,2018-10-29 08:23:00.0,0,1,alx-20180930.xml,1,
4,0000003545-18-000108,3545,ALICO INC,100.0,US,FL,"FT. MYERS,",33913,10070 DANIELS INTERSTATE COURT STE. 100,,...,20180930,2018.0,FY,20181206,2018-12-06 16:59:00.0,0,1,alco-20180930.xml,1,
5,0000003570-18-000160,3570,CHENIERE ENERGY INC,4924.0,US,TX,HOUSTON,77002,700 MILAM ST.,SUITE 1900,...,20180930,2018.0,Q3,20181108,2018-11-07 17:34:00.0,0,1,lng-20180930.xml,1,
6,0000004127-18-000046,4127,"SKYWORKS SOLUTIONS, INC.",3674.0,US,MA,WOBURN,01801,20 SYLVAN ROAD,,...,20180930,2018.0,FY,20181115,2018-11-14 18:45:00.0,0,1,swks-20180928.xml,1,
7,0000004281-18-000127,4281,ARCONIC INC.,3350.0,US,NY,NEW YORK,10022-4608,390 PARK AVENUE,,...,20180930,2018.0,Q3,20181101,2018-11-01 10:09:00.0,0,1,arnc-20180930.xml,1,
8,0000004457-18-000054,4457,AMERCO /NV/,7510.0,US,NV,RENO,89511,5555 KIETZKE LANE STE 100,,...,20180930,2019.0,Q2,20181107,2018-11-07 16:06:00.0,0,1,uhal-20180930.xml,1,
9,0000004904-18-000055,4904,AMERICAN ELECTRIC POWER CO INC,4911.0,US,OH,COLUMBUS,43215,1 RIVERSIDE PLAZA,,...,20180930,2018.0,Q3,20181025,2018-10-25 17:17:00.0,0,1,aep-20180930.xml,8,81027 73986 92487 1702494 50172 6879 1721781


In [6]:
# Write the dictionary of dataframes to disk, unless a pickle with this
# name is already present in the working directory.
pickle_name = 'dict_of_dfs_num_pre_sub_tag.p'
if pickle_name in os.listdir('.'):
    print('file already pickled')
else:
    with open(pickle_name, 'wb') as picklefile:
        pickle.dump(DataFrames, picklefile)

file already pickled


In [7]:
# Unique (name, stprba) pairs found in the SUB dataframe.
name_and_state = DataFrames['SUB'][['name', 'stprba']]
companies = name_and_state.drop_duplicates()

In [8]:
# Preview up to the first 400 rows of the companies dataframe.
companies.head(400)

Unnamed: 0,name,stprba
0,"ADAMS RESOURCES & ENERGY, INC.",TX
1,ADVANCED MICRO DEVICES INC,CA
2,AIR PRODUCTS & CHEMICALS INC /DE/,PA
3,ALEXANDERS INC,NJ
4,ALICO INC,FL
5,CHENIERE ENERGY INC,TX
6,"SKYWORKS SOLUTIONS, INC.",MA
7,ARCONIC INC.,NY
8,AMERCO /NV/,NV
9,AMERICAN ELECTRIC POWER CO INC,OH


In [10]:
# Write the companies dataframe to disk, unless a pickle with this name
# is already present in the working directory.
if 'companies_df.p' in os.listdir('.'):
    print('file already pickled')
else:
    with open('companies_df.p', 'wb') as picklefile:
        pickle.dump(companies, picklefile)

file already pickled


## Outcome

The working directory now contains the following data files:
1. A folder 'SEC_Datasets' containing the .txt datafiles acquired from the SEC website.
2. A pickle file 'dict_of_dfs_num_pre_sub_tag.p' containing the DICTIONARY of 4 DATAFRAMES of SEC data.  
This will form the input for PHASE II - financial analysis.
3. A pickle file 'companies_df.p' containing the unique pairs of companies and their state present in the SEC data.  
This will form the input for the rest of PHASE I - feature generation.