In [127]:
import re
import requests
import unicodedata
from bs4 import BeautifulSoup

In [137]:
# define the endpoint
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

# define parameters: 10-K filings of the company, returned as an Atom feed
param_dict = {"action": "getcompany",
             "CIK": "ULTA",
             "type": "10-k",
             "owner": "exclude",
             "output": "atom"}

# SEC EDGAR asks automated clients to identify themselves through the
# User-Agent header and may reject requests that do not.
# TODO: replace the placeholder with your own name and contact e-mail.
sec_request_headers = {"User-Agent": "Sample Company Name admin@example.com"}

# seconds to wait for the server so a stalled connection cannot hang the kernel
request_timeout = 30

# define response; fail loudly on an HTTP error instead of parsing an error page
response = requests.get(url = endpoint, params = param_dict,
                        headers = sec_request_headers, timeout = request_timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, "lxml")

# the first <entry> of the feed is taken as the most recent 10-K filing
first_entry = soup.find("entry")
filing_href_tag = first_entry.find("filing-href") if first_entry is not None else None
if filing_href_tag is None:
    raise ValueError("no filing entry with a <filing-href> found in the EDGAR response")

# turn the link to the filing index page into the link to the full-text submission
new_url = filing_href_tag.text.strip().replace("-index.htm", ".txt")

# download and parse the full-text submission
response = requests.get(url = new_url,
                        headers = sec_request_headers, timeout = request_timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, "lxml")

In [138]:
# Function to normalize the text
def restore_windows_1252_characters(restore_string):
    """Replace C1 control characters with the Windows-1252 characters they stand for.

    Text that was written as Windows-1252 but decoded as Latin-1 ends up with
    code points U+0080-U+009F where curly quotes, dashes, etc. were intended.
    Each such code point is re-read as a single Windows-1252 byte.

    Parameters
    ----------
    restore_string : str
        Text that may contain characters in the range U+0080-U+009F.

    Returns
    -------
    str
        The text with every character in that range replaced by its
        Windows-1252 equivalent, or by a single space when the byte has no
        Windows-1252 mapping. All other characters are left untouched.
    """
    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode("windows-1252")
        except UnicodeDecodeError:
            # the byte is unassigned in Windows-1252
            return " "

    # the C1 block ends at U+009F; stopping at U+0099 would leave the
    # characters for bytes 0x9A-0x9F unrepaired
    return re.sub(r'[\u0080-\u009f]', to_windows_1252, restore_string)

In [139]:
# use the url to go to the specific text file
new_html_text = new_url

# get the accession number: the last path segment of the url without ".txt"
url_split = new_html_text.split("/")
accession_number = url_split[-1].replace(".txt", "")

# grab the response
# SEC EDGAR asks automated clients to identify themselves through the
# User-Agent header and may reject requests that do not.
# TODO: replace the placeholder with your own name and contact e-mail.
# The timeout (seconds) keeps a stalled connection from hanging the kernel.
response = requests.get(
    new_html_text,
    headers={"User-Agent": "Sample Company Name admin@example.com"},
    timeout=30,
)

# fail loudly on an HTTP error instead of parsing an error page
response.raise_for_status()

# parse the response
soup = BeautifulSoup(response.content, "lxml")

In [140]:
# define a new dictionary to house filings, keyed by accession number.
# Each filing starts with an empty header section and a placeholder for its
# documents; the placeholder key must be spelled "filing_documents" because
# that is the key the later cells assign to and read from.
master_filings_dict = {
    accession_number: {
        "sec_header_content": {},
        "filing_documents": None,
    }
}

In [141]:
# locate the <sec-header> element of the parsed filing
sec_header_tag = soup.find('sec-header')

# file the header element under this filing's "sec_header_content" section
master_filings_dict[accession_number]["sec_header_content"].update(
    {"sec_header_code": sec_header_tag}
)

In [142]:
def _direct_text(tag):
    """Return the stripped text node sitting directly inside ``tag``, or None.

    None is returned when ``tag`` is None (the element is absent from the
    document) or when the element has no direct text node.
    """
    if tag is None:
        return None
    own_text = tag.find(text=True, recursive = False)
    return own_text.strip() if own_text is not None else None


def _find_thematic_breaks(text_tag):
    """Return the page-break elements of ``text_tag`` for the first format that matches.

    The formats are tried in a fixed order: <hr> tags, three spellings of a
    ``page-break-after`` style attribute, and finally "Table of Contents"
    anchors. An empty list is returned when none of them is present.
    """
    page_break_searches = (
        lambda: text_tag.find_all("hr"),
        lambda: text_tag.find_all(attrs={"style": "page-break-after:always"}),
        lambda: text_tag.find_all(attrs={"style": "page-break-after: always"}),
        lambda: text_tag.find_all(attrs={"style": "page-break-after: always;"}),
        lambda: text_tag.find_all("a", text = re.compile('Table of Contents')),
    )
    for search in page_break_searches:
        found_breaks = search()
        if found_breaks:
            return found_breaks
    return []


# initialize master document dictionary
master_document_dict = {}

# loop through each document in the filing
for filing_document in soup.find_all('document'):

    # define my document id (the text directly inside the <type> element)
    document_id = filing_document.type.find(text=True, recursive = False).strip()
    print(document_id)

    # NOTE(review): this stops at the first document whose type does not start
    # with "10-K", so any 10-K-type document listed after it is skipped —
    # confirm that is intended.
    if document_id[:4] != "10-K":
        break

    # sequence, filename and description; None when the element is missing
    document_sequence = _direct_text(filing_document.sequence)
    document_filename = _direct_text(filing_document.filename)
    document_description = _direct_text(filing_document.description)

    # insert the key with the different parts of the document; the document
    # element itself is detached from the filing tree by extract()
    master_document_dict[document_id] = {
        "document_sequence": document_sequence,
        "document_filename": document_filename,
        "document_description": document_description,
        "document_code": filing_document.extract(),
    }

    # get all the text in the document (this detaches <text> from the
    # element stored under "document_code" above)
    filing_doc_text = filing_document.find("text").extract()

    # get all thematic breaks, checking several possible formats
    all_thematic_breaks = _find_thematic_breaks(filing_doc_text)
    if all_thematic_breaks:
        print(all_thematic_breaks[0])
    else:
        # no known page-break markup: report it and keep the document as a
        # single page so later cells still find a "pages_code" entry
        print("weird page break format, check HTML")

    # convert all the breaks into strings; identical markup is kept once so
    # the split pattern does not repeat the same alternative
    all_thematic_breaks = list(dict.fromkeys(
        str(thematic_break) for thematic_break in all_thematic_breaks
    ))

    # prep the document for being split
    filing_doc_string = str(filing_doc_text)

    if all_thematic_breaks:

        # creates our pattern: any one of the break strings, taken literally
        regex_delimited_pattern = "|".join(map(re.escape, all_thematic_breaks))

        # split the document along the thematic breaks
        split_filing_string = re.split(regex_delimited_pattern, filing_doc_string)

        # store the document in the dictionary
        master_document_dict[document_id]["pages_code"] = split_filing_string

    else:
        master_document_dict[document_id]["pages_code"] = [filing_doc_string]

# store the documents in the master_filing_dictionary
master_filings_dict[accession_number]["filing_documents"] = master_document_dict

10-K
<a href="#Toc"><span style="font-family:'Times New Roman';font-size:10pt;font-style:normal;font-weight:normal;text-align:left;">Table of Contents</span></a>
EX-4


In [143]:
# first grab all the documents
filing_documents = master_filings_dict[accession_number]["filing_documents"]

# loop through each document
for document_id in filing_documents:

    # grab all the pages for each document
    document_pages = filing_documents[document_id]["pages_code"]

    # "pages_code" is replaced at the end of this loop by a
    # {page_number: soup} dict; when the cell is run again, iterate over the
    # stored pages rather than over the dict's integer keys
    if isinstance(document_pages, dict):
        document_pages = list(document_pages.values())

    # page length
    pages_length = len(document_pages)

    # initialize some dictionaries
    repaired_pages = {}
    normalized_text = {}

    # page numbers are generated here, starting at 1
    for page_number, page in enumerate(document_pages, start=1):

        # pass it through the parser to repair it (str() is a no-op for the
        # page strings of a first run and serialises the soups of a re-run)
        page_soup = BeautifulSoup(str(page), "html5")

        # grab the text from each page
        page_text = page_soup.html.body.get_text(" ", strip = True)

        # normalize the text
        page_text_norm = restore_windows_1252_characters(unicodedata.normalize("NFKD", page_text))

        # additional cleaning: turn newlines into spaces first, then collapse
        # every run of spaces, so no double spaces are left behind
        page_text_norm = re.sub(r" {2,}", " ", page_text_norm.replace("\n", " "))

        # add normalized text to the dictionary
        normalized_text[page_number] = page_text_norm

        # add the repaired html code to the dictionary
        repaired_pages[page_number] = page_soup

    # add the normalized text dictionary to the master filing dictionary
    filing_documents[document_id]["page_normalized_text"] = normalized_text

    # add the repaired pages to the master filing dictionary
    filing_documents[document_id]["pages_code"] = repaired_pages

    # add the page numbers we generate
    gen_page_numbers = list(repaired_pages.keys())

    filing_documents[document_id]["page_numbers_generated"] = gen_page_numbers

master_filings_dict[accession_number]["filing_documents"] = filing_documents

In [149]:
# show the normalized text stored for generated page 4 of the "10-K" document
ten_k_document = master_filings_dict[accession_number]["filing_documents"]["10-K"]
ten_k_document["page_normalized_text"][4]

'FORWARD-LOOKING STATEMENTS References in this Annual Report on Form 10-K to “we,” “us,” “our,” “Ulta Beauty,” the “Company” and similar references mean Ulta Beauty, Inc. and its consolidated subsidiaries, unless otherwise expressly stated or the context otherwise requires. This Annual Report on Form 10-K contains forward-looking statements within the meaning of Section 21E of the Securities Exchange Act of 1934, as amended, and the safe harbor provisions of the Private Securities Litigation Reform Act of 1995, which reflect our current views with respect to, among other things, future events and financial performance. You can identify these forward-looking statements by the use of forward-looking words such as “outlook,” “believes,” “expects,” “plans,” “estimates,” “targets,” “strategies” or other comparable words. Any forward-looking statements contained in this Form 10-K are based upon our historical performance and on current plans, estimates, and expectations. The inclusion of thi

In [136]:
# display the URL held in new_url (the ".txt" link derived from the filing's index page)
# NOTE(review): this cell's recorded execution count (136) is lower than that of
# the cell that assigns new_url (137) — confirm the notebook runs top to bottom.
new_url

'https://www.sec.gov/Archives/edgar/data/1403568/000155837020003272/0001558370-20-003272.txt'