# PHASE 1: DATA ACQUISITION AND INGESTION
## DATA ACQUISITION

The purpose of this module is to  
1. Download the Financial Statement Data Sets from the SEC website: https://www.sec.gov/dera/data/financial-statement-data-sets.html  
2. Unzip the data and store locally.

In [13]:
# to scrape the SEC page
from bs4 import BeautifulSoup

# to download the zip file data
import requests

# to extract the zip files
import zipfile

# to interpret the zip file stream from requests
# https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url
import io

# to work with paths
import os

In [14]:
# Fetch the SEC landing page that links to the quarterly data-set archives.
page_url = 'https://www.sec.gov/dera/data/financial-statement-data-sets.html'
r = requests.get(page_url)
# Fail loudly on an HTTP error instead of parsing an error page: such a page
# contains no .zip links, so the download loop would silently do nothing.
r.raise_for_status()
html_doc = r.text

def SEC_files_stored(yr_qtr):
    """Check whether a quarter's four data files have all been extracted.

    Looks for num.txt, pre.txt, sub.txt and tag.txt inside the directory
    ./<yr_qtr>/ (relative to the current working directory), for example
    ./2015q3/num.txt.

    Args:
        yr_qtr: Name of the quarter directory, e.g. '2015q3'.

    Returns:
        True if all four files exist as regular files, False otherwise
        (including when the directory itself does not exist).
    """
    data_files = ('num.txt', 'pre.txt', 'sub.txt', 'tag.txt')
    # Every expected file is tested; a single missing one fails the check.
    # (Collecting only the hits and then calling all() on them would be
    # vacuously true, so the check must cover the misses as well.)
    return all(
        os.path.isfile(os.path.join('.', yr_qtr, data_file))
        for data_file in data_files
    )
    
# Parse the landing page. Naming the parser explicitly keeps the parse result
# the same on every machine and avoids bs4's "no parser specified" warning.
soup = BeautifulSoup(html_doc, 'html.parser')
# Find all hyperlinks pointing to .zip files
for a in soup.find_all('a', href=True):
    href = a['href']
    if not href.lower().endswith('.zip'):
        continue

    # The hrefs are relative to the domain: add domain.
    zip_file_url = 'https://www.sec.gov' + href
    # The target directory is the archive's base name without its extension,
    # e.g. '2018q4' for '.../2018q4.zip'.
    year_quarter = os.path.splitext(os.path.basename(href))[0]

    # Download the whole archive into memory (zipfile needs a seekable
    # stream). Stop on an HTTP error so an error page is never handed to
    # zipfile, where it would surface as a confusing BadZipFile.
    zip_data = requests.get(zip_file_url)
    zip_data.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(zip_data.content), 'r') as zip_data_file:
        zip_data_file.extractall('./' + year_quarter)
    if not SEC_files_stored(year_quarter):
        print("WARNING: Not all files have been stored for "+year_quarter)
    print('Extracted '+zip_file_url)

        
# This now works as expected. 
# However, worthwhile to write to Pandas directly?
# Else, generate and write dict receipts for all TXT files written for easy import in next module.

Opened stream to https://www.sec.gov/files/dera/data/financial-statement-data-sets/2018q4.zip
Extracted https://www.sec.gov/files/dera/data/financial-statement-data-sets/2018q4.zip
Opened stream to https://www.sec.gov/files/dera/data/financial-statement-data-sets/2018q3.zip
Extracted https://www.sec.gov/files/dera/data/financial-statement-data-sets/2018q3.zip
Opened stream to https://www.sec.gov/files/dera/data/financial-statement-data-sets/2018q2.zip
Extracted https://www.sec.gov/files/dera/data/financial-statement-data-sets/2018q2.zip
Opened stream to https://www.sec.gov/files/dera/data/financial-statement-data-sets/2018q1.zip


KeyboardInterrupt: 