In [None]:
# libraries
import requests
import urllib
from bs4 import BeautifulSoup
import json
import numpy as np

#URL maker (standardized)
def make_url(base_url, comp):
    """Append every element of *comp* to *base_url*, separated by '/'.

    Each component is rendered with default string formatting, so
    non-string values such as integer years are accepted. When *comp*
    is empty, *base_url* is returned unchanged.
    """
    joined = base_url

    # Grow the URL one path segment at a time.
    for segment in comp:
        joined = f'{joined}/{segment}'

    return joined

# Root of the EDGAR daily-index directory tree.
base_url = r"https://www.sec.gov/Archives/edgar/daily-index"

# Years to crawl. np.arange excludes the stop value, so this is just [2018].
years = np.arange(2018, 2019, 1).tolist()

# Running total of "master." index files found.
count = 0

# Walk year -> quarter -> file listings and record the URL of every
# "master." index file, one per line, in SECmasterURLs.txt.
# The `with` block guarantees the output file is flushed and closed even
# if a request or JSON decode fails part-way through.
# NOTE(review): no User-Agent header is sent with these requests; confirm
# against SEC's access guidelines whether one is required.
with open('SECmasterURLs.txt', 'w') as FILE:
    for year in years:
        year_url = make_url(base_url, [year, 'index.json'])

        # Display the new year URL.
        print('-'*100)
        print('Building the URL for Year: {}'.format(year))
        print("URL Link: " + year_url)

        # Request the year listing; fail loudly on an HTTP error instead of
        # trying to JSON-decode an error page.
        content = requests.get(year_url)
        content.raise_for_status()
        decoded_content = content.json()

        # Only the first four entries of the listing are visited
        # (presumably the quarter folders -- verify against the live listing).
        for item in decoded_content['directory']['item'][0:4]:
            print('-'*100)
            print('Pulling url for Quarter: {} for Year: {}'.format(item['name'], year))

            qtr_url = make_url(base_url, [year, item['name'], 'index.json'])
            print("URL Link: " + qtr_url)

            # The quarter listing gets its own name so it does not clobber
            # the year listing that the enclosing loop is iterating over.
            file_content = requests.get(qtr_url)
            file_content.raise_for_status()
            quarter_listing = file_content.json()

            print('-'*100)
            print('Pulling files for Year: {} '.format(year) + 'Quarter: {}'.format(item['name']))

            # Keep only entries whose name contains "master." and write
            # each resulting file URL on its own line.
            for file in quarter_listing['directory']['item']:
                if 'master.' in file['name']:
                    file_url = make_url(base_url, [year, item['name'], file['name']])
                    print("File URL Link: " + file_url)
                    FILE.write(file_url)
                    FILE.write("\n")
                    count = count + 1

print(count)
               
        

In [None]:
file_url = r"https://www.sec.gov/Archives/edgar/daily-index/2019/QTR2/master.20190402.idx"

# Build the local file name from the last URL segment:
# 'master.20190402.idx' -> 'master20190402.txt'.
urlsplit = file_url.split('/')
urlsplit2 = urlsplit[-1].split('.')
filenamebuilder = '{}{}'.format(urlsplit2[0], urlsplit2[1] + '.txt')

# Download the index and stop on an HTTP error, so an error page is never
# saved and parsed as if it were the index.
response = requests.get(file_url)
response.raise_for_status()
content = response.content

# Keep a local copy of the raw download.
with open(filenamebuilder, 'wb') as f:
    f.write(content)

# The bytes just written are exactly what reading the file back would
# return, so reuse them directly as the byte stream to parse.
byte_data = content

# Output file for the filtered filings. It is opened only after the
# download succeeded, and is left open here because the filtering loops
# further down write to this handle.
MasterFiles = open('parsed' + filenamebuilder, 'w')

# Decode the raw bytes and cut the text into chunks on every run of four
# dashes ('----').
data = byte_data.decode("utf-8").split('----')

# The header is taken to end at the last chunk that mentions the FTP
# address; remember that chunk's position.
start_ind = None
for position, chunk in enumerate(data):
    if "ftp://ftp.sec.gov/edgar/" in chunk:
        start_ind = position

# Without the marker there is no way to tell header from data; fail with a
# clear message rather than a NameError on the next line.
if start_ind is None:
    raise ValueError("header marker 'ftp://ftp.sec.gov/edgar/' not found in index data")

# Everything after the header chunk.
data_format = data[start_ind + 1:]

# One 5-element list per filing, ending with the full document URL.
master_data = []

for chunk_number, chunk in enumerate(data_format):

    # Flatten the chunk into a single list of '|'-separated fields.
    clean_item_data = chunk.replace('\n', '|').split('|')

    # The first chunk drops its first 8 fields
    # (presumably leftover header / column-title text -- TODO confirm).
    if chunk_number == 0:
        clean_item_data = clean_item_data[8:]

    for field_pos, field in enumerate(clean_item_data):

        # A field containing '.txt' is treated as the last field of a row.
        if '.txt' not in field:
            continue

        # A row needs four fields in front of the file name. Without this
        # guard a negative slice start would wrap around to the end of the
        # list and yield a wrong or too-short row.
        if field_pos < 4:
            continue

        # The four preceding fields plus the file-name field itself.
        mini_list = clean_item_data[field_pos - 4: field_pos + 1]
        mini_list[4] = "https://www.sec.gov/Archives/" + mini_list[4]
        master_data.append(mini_list)
                


# Replace every 5-field row in master_data, in place, with a dictionary
# keyed by field name. The key order mirrors the field order of a row.
record_keys = ('cik_number', 'company_name', 'form_id', 'date', 'file_url')

for slot, fields in enumerate(master_data):
    master_data[slot] = dict(zip(record_keys, fields))


# Form types to report, in the order their sections appear in the output.
wanted_forms = ('10-K', '10-Q', 'NT 10-K', 'NT 10-Q')

try:
    # One full pass over master_data per form type, so all filings of one
    # type are printed and written before the next type starts.
    for wanted_form in wanted_forms:
        for document_dict in master_data:

            if document_dict['form_id'] != wanted_form:
                continue

            # get the components
            comp_name = document_dict['company_name']
            docu_url = document_dict['file_url']
            form_type = document_dict['form_id']

            print('-'*100)
            print(comp_name)
            print(docu_url)
            print('Form Type is: {}'.format(form_type))

            # Each entry: a 75-dash separator, then company name, document
            # URL and form type, one per line.
            MasterFiles.write('\n'.join(('-'*75, comp_name, docu_url, form_type)) + '\n')
finally:
    # All output is written at this point; close the handle so buffered
    # data reaches disk and the file descriptor is released.
    MasterFiles.close()
    

        
