In [74]:
import re
import requests
import unicodedata
from bs4 import BeautifulSoup

In [75]:
# define the endpoint
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

# define parameters: 10-K filings for ticker MSFT, returned as an Atom feed
param_dict = {"action": "getcompany",
             "CIK": "MSFT",
             "type": "10-k",
             #"dateb": "20190101",
             "owner": "exclude",
             "start": "",
             "output": "atom",
             "count": "100"}

# SEC's fair-access guidance asks automated clients to declare who they are.
# Placeholder value -- replace with your own organisation name and contact e-mail.
sec_request_headers = {"User-Agent": "Sample Company Name admin@example.com"}

# define response; bound the wait so a stalled connection cannot hang the kernel
response = requests.get(url = endpoint,
                        params = param_dict,
                        headers = sec_request_headers,
                        timeout = 30)

# fail here on a 4xx/5xx reply instead of parsing an error page as if it were the feed
response.raise_for_status()

soup = BeautifulSoup(response.content, "lxml")

<?xml version="1.0" encoding="ISO-8859-1" ?><html><body><feed xmlns="http://www.w3.org/2005/Atom">
<author>
<email>webmaster@sec.gov</email>
<name>Webmaster</name>
</author>
<company-info>
<addresses>
<address type="mailing">
<city>REDMOND</city>
<state>WA</state>
<street1>ONE MICROSOFT WAY</street1>
<zip>98052-6399</zip>
</address>
<address type="business">
<city>REDMOND</city>
<phone>425-882-8080</phone>
<state>WA</state>
<street1>ONE MICROSOFT WAY</street1>
<zip>98052-6399</zip>
</address>
</addresses>
<assigned-sic>7372</assigned-sic>
<assigned-sic-desc>SERVICES-PREPACKAGED SOFTWARE</assigned-sic-desc>
<assigned-sic-href>https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&amp;SIC=7372&amp;owner=exclude&amp;count=100</assigned-sic-href>
<cik>0000789019</cik>
<cik-href>https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&amp;CIK=0000789019&amp;owner=exclude&amp;count=100</cik-href>
<conformed-name>MICROSOFT CORP</conformed-name>
<fiscal-year-end>0630</fiscal-year-end>
<o

In [76]:
# Function to normalize the text
def restore_windows_1252_characters(restore_string):
    """Replace C1 control characters with their Windows-1252 meaning.

    Every character in the range U+0080-U+009F is re-read as a single
    Windows-1252 byte, which turns it into the printable character that byte
    stands for in that code page (curly quotes, dashes, the euro sign, ...).

    Args:
        restore_string: The text to repair.

    Returns:
        A copy of ``restore_string`` in which each character in
        U+0080-U+009F has been replaced by its Windows-1252 equivalent, or by
        a single space when the byte has no Windows-1252 mapping. All other
        characters are returned unchanged.
    """
    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode("windows-1252")
        except UnicodeDecodeError:
            # the byte is unassigned in Windows-1252 (for example 0x81)
            return " "

    # Cover the whole C1 block (U+0080-U+009F) so that 0x9A-0x9F are restored too.
    return re.sub(r'[\u0080-\u009f]', to_windows_1252, restore_string)

In [77]:
# define the url to a specific text file
new_html_text = r"https://www.sec.gov/Archives/edgar/data/789019/000156459020034944/0001564590-20-034944.txt"

# get the accession number: the last path segment without its ".txt" extension
url_split = new_html_text.split("/")
accession_number = url_split[-1].replace(".txt", "")

# SEC's fair-access guidance asks automated clients to declare who they are.
# Placeholder value -- replace with your own organisation name and contact e-mail.
sec_request_headers = {"User-Agent": "Sample Company Name admin@example.com"}

# grab the response; bound the wait so a stalled download cannot hang the kernel
response = requests.get(new_html_text, headers = sec_request_headers, timeout = 60)

# fail here on a 4xx/5xx reply instead of parsing an error page as if it were the filing
response.raise_for_status()

# parse the response
soup = BeautifulSoup(response.content, "lxml")

'0001564590-20-034944'

In [78]:
# define a new dictionary to house filings
master_filings_dict = {}

# add the filing, keyed by its accession number, together with the next levels:
#   "sec_header_content" - dictionary that receives the parsed header
#   "filing_documents"   - placeholder until the documents are parsed
# The second key is spelled "filing_documents" so it is the same entry the
# document-parsing cell fills in, rather than a stray misspelled key left at None.
master_filings_dict[accession_number] = {
    "sec_header_content": {},
    "filing_documents": None,
}

In [79]:
# locate the <sec-header> element in the parsed filing
sec_header_tag = soup.find('sec-header')

# record the raw header tag in this filing's header section
master_filings_dict[accession_number]["sec_header_content"].update(sec_header_code = sec_header_tag)

In [80]:
def _direct_tag_text(tag):
    """Return the stripped text found directly inside ``tag``.

    Returns None when ``tag`` is None (the element is absent from the
    document) or when the tag has no direct text node of its own.
    """
    if tag is None:
        return None
    text_node = tag.find(text=True, recursive = False)
    if text_node is None:
        return None
    return text_node.strip()


# initialize master document dictionary
master_document_dict = {}

# loop through each document in the filing
for filing_document in soup.find_all('document'):

    # define my document id; it is used as the dictionary key below, so a later
    # document with the same type replaces an earlier one
    document_id = filing_document.type.find(text=True, recursive = False).strip()

    # stop at the first document of type "XML"; nothing after it is collected
    if document_id == "XML":
        break

    # document sequence, filename and description; each is None when its tag is
    # missing instead of raising AttributeError
    document_sequence = _direct_tag_text(filing_document.sequence)
    document_filename = _direct_tag_text(filing_document.filename)
    document_description = _direct_tag_text(filing_document.description)

    # insert the key and add the different parts of the document
    master_document_dict[document_id] = {
        "document_sequence": document_sequence,
        "document_filename": document_filename,
        "document_description": document_description,
    }

    # add the document content itself
    # NOTE(review): this stores the same tag object that the <text> element is
    # removed from just below -- confirm that is intended.
    master_document_dict[document_id]["document_code"] = filing_document.extract()

    # get all the text in the document
    filing_doc_text = filing_document.find("text").extract()

    # get all thematic breaks
    all_thematic_breaks = filing_doc_text.find_all("hr",{"style": "page-break-after:always"})

    # convert all the breaks into strings
    all_thematic_breaks = [str(thematic_break) for thematic_break in all_thematic_breaks]

    # prep the document for being split
    filing_doc_string = str(filing_doc_text)

    if all_thematic_breaks:

        # creates our pattern; repeated break strings are dropped (first-seen order
        # kept) because duplicates add nothing to an alternation
        unique_thematic_breaks = dict.fromkeys(all_thematic_breaks)
        regex_delimited_pattern = "|".join(map(re.escape, unique_thematic_breaks))

        # split the document along the thematic breaks
        split_filing_string = re.split(regex_delimited_pattern, filing_doc_string)

        # store the document in the dictionary
        master_document_dict[document_id]["pages_code"] = split_filing_string

    else:
        # no page breaks: the whole document is a single page
        master_document_dict[document_id]["pages_code"] = [filing_doc_string]

# store the documents in the master_filing_dictionary
master_filings_dict[accession_number]["filing_documents"] = master_document_dict

10-K
EX-10.25
EX-10.26
EX-21
EX-23.1
EX-31.1
EX-31.2
EX-32.1
EX-32.2
EX-101.SCH
EX-101.CAL
EX-101.DEF
EX-101.LAB
EX-101.PRE
XML


In [81]:
# first grab all the documents
filing_documents = master_filings_dict[accession_number]["filing_documents"]

# loop through each document
for document_id in filing_documents:

    # grab all the pages for each document
    document_pages = filing_documents[document_id]["pages_code"]

    # "pages_code" is replaced at the end of this loop by a {page_number: soup}
    # dictionary; when this cell is run again, take that dictionary's values so the
    # re-run processes pages rather than integer keys
    if isinstance(document_pages, dict):
        document_pages = list(document_pages.values())

    # page length
    pages_length = len(document_pages)

    # initialize some dictionaries, both keyed by the 1-based page number
    repaired_pages = {}
    normalized_text = {}

    for page_number, page in enumerate(document_pages, start = 1):

        # pass it through the parser to repair it (str() is a no-op for the raw
        # page strings and serialises an already-parsed page on a re-run)
        page_soup = BeautifulSoup(str(page), "html5")

        # grab the text from each page
        page_text = page_soup.html.body.get_text(" ", strip = True)

        # normalize the text
        page_text_norm = restore_windows_1252_characters(unicodedata.normalize("NFKD", page_text))

        # additional cleaning: collapse every run of spaces/newlines into one space
        # (chained str.replace left doubles behind for runs longer than two)
        page_text_norm = re.sub(r"[ \n]+", " ", page_text_norm)

        # add normalized text to the dictionary
        normalized_text[page_number] = page_text_norm

        # add the repaired html code to the dictionary
        repaired_pages[page_number] = page_soup

    # add the normalized text dictionary to the master filing dictionary
    filing_documents[document_id]["page_normalized_text"] = normalized_text

    # add the repaired pages to the master filing dictionary
    filing_documents[document_id]["pages_code"] = repaired_pages

    # add the page numbers we generate
    gen_page_numbers = list(repaired_pages.keys())

    filing_documents[document_id]["page_numbers_generated"] = gen_page_numbers

master_filings_dict[accession_number]["filing_documents"] = filing_documents

In [94]:
# show the normalized text stored for generated page 45 of the 10-K document
ten_k_document = master_filings_dict[accession_number]["filing_documents"]["10-K"]
ten_k_document["page_normalized_text"][45]

'PART II Item 7 Fiscal Year 2019 Compared with Fiscal Year 2018 Interest and dividends income increased primarily due to higher yields on fixed-income securities. Interest expense decreased primarily driven by a decrease in outstanding long-term debt due to debt maturities, offset in part by higher finance lease expense . Net recognized gains on investments decreased primarily due to lower gains on sales of equity investments. Net gains on derivatives includes gains on foreign exchange and interest rate derivatives in the current period as compared to losses in the prior period . INCOME TAXES Effective Tax Rate Fiscal Year 2020 Compared with Fiscal Year 2019 Our effective tax rate for fiscal years 2020 and 2019 was 17% and 10%, respectively. The increase in our effective tax rate for fiscal year 2020 compared to fiscal year 2019 was primarily due to a $2.6 billion net income tax benefit in the fourth quarter of fiscal year 2019 related to intangible property transfers. Our effective ta