In [32]:
import re
import requests
import unicodedata
from bs4 import BeautifulSoup

In [33]:
# define the EDGAR company-browse endpoint
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

# SEC.gov asks automated clients to declare who they are in the User-Agent
# header; requests that do not may be refused.
# TODO: replace the placeholder below with your own name and contact e-mail.
sec_request_headers = {"User-Agent": "Sample Company Name admin@example.com"}

# define parameters: 10-K filings for AMZN, returned as an Atom feed
param_dict = {"action": "getcompany",
             "CIK": "AMZN",
             "type": "10-k",
             "owner": "exclude",
             "output": "atom"}

# request the feed and fail loudly on an HTTP error instead of parsing an error page
response = requests.get(url = endpoint,
                        params = param_dict,
                        headers = sec_request_headers,
                        timeout = 30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "lxml")

# take the first <entry> of the feed (assumed to be the most recent 10-K filing)
latest_entry = soup.find("entry")
filing_href_tag = latest_entry.find("filing-href") if latest_entry is not None else None

if filing_href_tag is None:
    raise ValueError("EDGAR returned no filing entry with a filing-href for the requested company.")

# turn the filing index page URL into the URL of the complete submission text file
new_url = filing_href_tag.text.strip().replace("-index.htm", ".txt")

# NOTE: the filing itself is downloaded from new_url in a later cell, so it is
# not fetched here a second time.

In [34]:
# Function to normalize the text
def restore_windows_1252_characters(restore_string):
    """Replace C1 control code points with their Windows-1252 characters.

    Every character in the range U+0080-U+009F is reinterpreted as a single
    Windows-1252 byte and decoded (for example U+0093 becomes a left curly
    quote). Byte values that Windows-1252 leaves undefined are replaced with a
    single space. All other characters are returned unchanged.

    Parameters
    ----------
    restore_string : str
        The text to repair.

    Returns
    -------
    str
        The text with the C1 code points replaced.
    """
    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode("windows-1252")
        except UnicodeDecodeError:
            # this byte has no Windows-1252 mapping
            return " "

    # the C1 block runs through U+009F; stopping at U+0099 would skip U+009A-U+009F
    return re.sub(r'[\u0080-\u009f]', to_windows_1252, restore_string)

In [35]:
# use the url to go to the specific text file
new_html_text = new_url

# get the accession number: the file name at the end of the URL without ".txt"
url_split = new_html_text.split("/")
accession_number = url_split[-1].replace(".txt", "")

# SEC.gov asks automated clients to declare who they are in the User-Agent
# header; requests that do not may be refused.
# TODO: replace the placeholder below with your own name and contact e-mail.
sec_request_headers = {"User-Agent": "Sample Company Name admin@example.com"}

# grab the response and stop on an HTTP error rather than parsing an error page
response = requests.get(new_html_text, headers = sec_request_headers, timeout = 60)
response.raise_for_status()

# parse the response
soup = BeautifulSoup(response.content, "lxml")

In [36]:
# define a new dictionary to house filings, keyed by accession number.
# Each filing starts with an empty header section and a "filing_documents"
# placeholder that is filled in once the documents have been parsed.
# (The key was previously misspelled "filing_documtents", which left a stray
# None entry next to the real "filing_documents" key used everywhere else.)
master_filings_dict = {
    accession_number: {
        "sec_header_content": {},
        "filing_documents": None,
    }
}

In [37]:
# locate the <sec-header> element in the parsed filing
sec_header_tag = soup.find('sec-header')

# keep the header tag itself under the filing's header section
master_filings_dict[accession_number]["sec_header_content"].update(
    {"sec_header_code": sec_header_tag}
)

In [38]:
def _first_direct_text(tag):
    """Return the stripped first text node that is a direct child of ``tag``.

    Returns None when ``tag`` is None or has no direct text child, so that a
    missing optional field does not raise AttributeError.
    """
    if tag is None:
        return None
    direct_text = tag.find(string=True, recursive=False)
    if direct_text is None:
        return None
    return direct_text.strip()


# initialize master document dictionary
master_document_dict = {}

# loop through each document in the filing
for filing_document in soup.find_all('document'):

    # define my document id (the value of the document's <type> tag)
    document_id = _first_direct_text(filing_document.type)
    print(document_id)

    # only "10-K" documents are kept; the loop stops at the first other type
    if document_id != "10-K":
        break

    # document sequence, filename and description (None when a tag is absent)
    document_sequence = _first_direct_text(filing_document.sequence)
    document_filename = _first_direct_text(filing_document.filename)
    document_description = _first_direct_text(filing_document.description)

    # insert the key and add the different parts of the document
    master_document_dict[document_id] = {
        "document_sequence": document_sequence,
        "document_filename": document_filename,
        "document_description": document_description,
        # detach the document from the filing tree and keep the tag itself;
        # note the <text> element is removed from this same tag just below
        "document_code": filing_document.extract(),
    }

    # get all the text in the document
    filing_doc_text = filing_document.find("text").extract()

    # get all thematic breaks (the <hr> tags used as page breaks)
    all_thematic_breaks = filing_doc_text.find_all("hr",{"style": "page-break-after:always"})

    # convert the breaks into strings, dropping duplicates (first-seen order is
    # kept) so the split pattern does not repeat the same alternative many times
    all_thematic_breaks = list(dict.fromkeys(str(thematic_break) for thematic_break in all_thematic_breaks))

    # prep the document for being split
    filing_doc_string = str(filing_doc_text)

    if all_thematic_breaks:

        # creates our pattern: any one of the literal break strings
        regex_delimited_pattern = "|".join(map(re.escape, all_thematic_breaks))

        # split the document along the thematic breaks
        split_filing_string = re.split(regex_delimited_pattern, filing_doc_string)

        # store the pages in the dictionary
        master_document_dict[document_id]["pages_code"] = split_filing_string

    else:
        # no page breaks found: the whole document is a single page
        master_document_dict[document_id]["pages_code"] = [filing_doc_string]

# store the documents in the master_filing_dictionary
master_filings_dict[accession_number]["filing_documents"] = master_document_dict

10-K
EX-4.6


In [39]:
# start from the documents stored for this filing
filing_documents = master_filings_dict[accession_number]["filing_documents"]

# rebuild the pages of every stored document
for document_id, document_entry in filing_documents.items():

    # the page strings stored for this document, and how many there are
    document_pages = document_entry["pages_code"]
    pages_length = len(document_pages)

    # page number -> repaired soup, and page number -> cleaned text
    repaired_pages = {}
    normalized_text = {}

    # pages are numbered starting at 1
    for page_number, page in enumerate(document_pages, start=1):

        # re-parse the page fragment so the parser repairs its markup
        page_soup = BeautifulSoup(page, "html5")

        # pull the visible text out of the page body
        page_text = page_soup.html.body.get_text(" ", strip = True)

        # unicode-normalize first, then restore the Windows-1252 characters
        page_text_norm = unicodedata.normalize("NFKD", page_text)
        page_text_norm = restore_windows_1252_characters(page_text_norm)

        # additional cleaning of double spaces and newlines
        page_text_norm = page_text_norm.replace("  ", " ").replace("\n", " ")

        # file both versions of the page under its page number
        normalized_text[page_number] = page_text_norm
        repaired_pages[page_number] = page_soup

    # the page numbers we generated
    gen_page_numbers = list(repaired_pages)

    # write the results back onto the document: the cleaned text, the repaired
    # pages (replacing the raw page strings) and the generated page numbers
    document_entry.update({
        "page_normalized_text": normalized_text,
        "pages_code": repaired_pages,
        "page_numbers_generated": gen_page_numbers,
    })

master_filings_dict[accession_number]["filing_documents"] = filing_documents

In [40]:
# preview the normalized text stored for generated page 45 of the "10-K" document
master_filings_dict[accession_number]["filing_documents"]["10-K"]["page_normalized_text"][45]

'Table of Contents Other Income (Expense), Net Other income (expense), net, consists primarily of adjustments to and gains on equity securities of $ 18 million , $ 145 million , and $ 231 million in 2017 , 2018 , and 2019 , equity warrant valuation gains (losses) of $ 109 million , $( 131 ) million , and $ 11 million in 2017 , 2018 , and 2019 , and foreign currency gains (losses) of $ 247 million , $( 206 ) million , and $( 20 ) million in 2017 , 2018 , and 2019 . Income Taxes Income tax expense includes U.S. (federal and state) and foreign income taxes. Certain foreign subsidiary earnings are subject to U.S. taxation under the U.S. Tax Act, which also repeals U.S. taxation on the subsequent repatriation of those earnings. We intend to invest substantially all of our foreign subsidiary earnings, as well as our capital in our foreign subsidiaries, indefinitely outside of the U.S. in those jurisdictions in which we would incur significant, additional costs upon repatriation of such amoun

In [41]:
# display the filing text URL that was derived from the EDGAR feed
new_url

'https://www.sec.gov/Archives/edgar/data/1018724/000101872420000004/0001018724-20-000004.txt'