## 2 - Extract Documents from Each Filing
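> Download the complete submission text file of each filing, extract and normalize the text of the documents it contains, and record the filing's acceptance date.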

### 2.0 - Import Libraries

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
import datetime
import unicodedata
import re
import time
import sys
import pandas_datareader.data as web
import datetime as dt
import math
import gc
import os

In [2]:
# Clear future and deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()
# Jupyter magic command for progress bar with tqdm module

In [4]:
# Set max columns and width with pandas.
# None means "do not truncate cell contents"; the former sentinel -1 is
# deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 500)

#### 2.2 - Read In Data
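> Load the table of filing links (`filing_links.csv`) and join it with the S&P 500 company table (`sp500.csv`) to add each company's security name, GICS sector, and GICS sub-industry.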

In [5]:
# Load the table of filing links (one row per filing).
df = pd.read_csv('../filing_links.csv')

In [6]:
# Row and column counts of the loaded table.
df.shape

(126058, 6)

In [7]:
# Load the S&P 500 company table; the merge in the next cell reads its
# symbol, security, gics_sector and gics_sub_industry columns.
sp_df = pd.read_csv('../clean_data/sp500.csv')

In [8]:
# Attach the security name and GICS classification to every filing row.
# A left join keeps filings whose ticker has no match in sp_df; the join key
# 'symbol' duplicates 'company', so it is dropped afterwards.
df = (
    df.merge(
        sp_df[['symbol', 'security', 'gics_sector', 'gics_sub_industry']],
        how='left',
        left_on='company',
        right_on='symbol',
    )
    .drop(columns=['symbol'])
)

In [9]:
# Preview the first row of the merged table.
df.head(1)

Unnamed: 0,company,date,doc_link,filing_doc,complete_file_link,file_description,security,gics_sector,gics_sub_industry
0,MMM,2019-11-06,http://sec.gov/Archives/edgar/data/66740/000110465919060593/0001104659-19-060593-index.htm,8-K,https://sec.gov/Archives/edgar/data/66740/000110465919060593/0001104659-19-060593.txt,Complete submission text file,3M Company,Industrials,Industrial Conglomerates


#### 2.3 - Drop Data Before 2011
> Keep only filings dated after 2010-12-31; all earlier filings are dropped.

In [10]:
# 'date' holds ISO-formatted strings (YYYY-MM-DD), which sort chronologically,
# so a plain string comparison selects everything from 2011 onwards.
df = df[df['date'] > '2010-12-31']
df.shape

(63659, 9)

#### 2.3 - Define Functions to Clean and Extract Text from Filings
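> Helper functions that download a filing, split it into documents and pages, normalize the text, and return the combined text together with the filing's acceptance date.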

In [11]:
def restore_windows_1252_characters(restore_string):
    """
        Replace C1 control characters (U+0080 to U+009F) in the Unicode
        string `restore_string` by the characters at the corresponding
        code points in Windows-1252, where possible.

        Text that was encoded as Windows-1252 but decoded as Latin-1 ends up
        with these control characters in place of curly quotes, dashes, the
        euro sign, etc.

        Parameters
        ----------
        restore_string : str
            Text that may contain C1 control characters.

        Returns
        -------
        str
            The text with every C1 control character replaced by its
            Windows-1252 counterpart, or removed when Windows-1252 defines no
            character at that code point.
    """

    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''

    # The C1 block ends at U+009F; stopping at U+0099 would leave the
    # characters mapped from 0x9A-0x9F unrestored.
    return re.sub(r'[\u0080-\u009f]', to_windows_1252, restore_string)

In [12]:
# Document types that are left out of the corpus.
_DISCARDED_DOCUMENT_TYPES = ('GRAPHIC', 'XML', 'JSON', 'EXCEL', 'ZIP')

# Seconds to wait on the server before giving up on a request.
_REQUEST_TIMEOUT_SECONDS = 60


def _direct_text(tag):
    """Return the stripped text sitting directly inside `tag`, or '' if there is none."""
    if tag is None:
        return ''
    # Only the tag's own text is wanted (recursive=False), not the text of
    # any tags nested inside it.
    own_text = tag.find(text=True, recursive=False)
    return own_text.strip() if own_text is not None else ''


def _split_into_pages(text_tag):
    """Split a document's <text> section into a list of per-page HTML strings.

    Pages are delimited by thematic breaks (<hr> tags with width="100%").
    A document without such breaks is returned as a single page.
    """
    # Identical break strings are collapsed (order preserved) so the regex
    # alternation does not repeat the same alternative for every page.
    page_breaks = list(dict.fromkeys(
        str(hr_tag) for hr_tag in text_tag.find_all('hr', {'width': '100%'})
    ))
    document_html = str(text_tag)

    if not page_breaks:
        return [document_html]

    break_pattern = '|'.join(map(re.escape, page_breaks))
    return re.split(break_pattern, document_html)


def _normalize_page_text(page_html):
    """Return the cleaned, normalized text of one page of HTML."""
    # The html5lib parser is used because it repairs broken tags.
    page_soup = BeautifulSoup(page_html, 'html5lib')
    page_text = page_soup.find('html').find('body').get_text(' ', strip=True)

    # Normalize the text and restore Windows-1252 characters.
    page_text = restore_windows_1252_characters(unicodedata.normalize('NFKD', page_text))

    # Remove double spaces and new line breaks.
    return page_text.replace('  ', ' ').replace('\n', ' ')


def _build_corpus(soup):
    """Concatenate the normalized text of every kept document in the filing.

    Each page is prefixed with '|'. Every document is processed on its own,
    so several documents sharing the same <type> all contribute to the corpus.
    """
    page_texts = []
    for filing_document in soup.find_all('document'):

        # Skip unwanted document types before the expensive html5lib re-parse.
        if _direct_text(filing_document.find('type')) in _DISCARDED_DOCUMENT_TYPES:
            continue

        text_tag = filing_document.find('text')
        if text_tag is None:
            # Nothing to extract from a document without a <text> section.
            continue

        for page_html in _split_into_pages(text_tag):
            page_texts.append(_normalize_page_text(page_html))

    return ''.join('|' + page_text for page_text in page_texts)


def _extract_release_date(soup):
    """Return the filing's acceptance datetime, or None if it cannot be read."""
    sec_header_tag = soup.find('sec-header')
    if sec_header_tag is None:
        return None

    # The string form of the tag starts with '<acceptance-datetime>'
    # (21 characters); the next 14 characters are read as YYYYMMDDHHMMSS.
    acceptance_date = str(sec_header_tag.find('acceptance-datetime'))[21:35]
    try:
        return datetime.datetime.strptime(acceptance_date, "%Y%m%d%H%M%S")
    except ValueError:
        # Tag missing or timestamp malformed.
        return None


def extract_corpus_date(url):
    """
        Download one complete submission text file and extract its text.

        Parameters
        ----------
        url : str
            Link to the complete submission text file of a filing.

        Returns
        -------
        (release_date, corpus) : tuple
            release_date is a datetime.datetime parsed from the
            <acceptance-datetime> entry of the SEC header, or None when it is
            missing or unreadable. corpus is a str holding the normalized text
            of every page of every kept document, each page prefixed with '|'.
            Both are None when the download fails or parsing hits the
            recursion limit.

        NOTE(review): no User-Agent header is sent with the request - confirm
        that the server accepts such requests.
    """
    # Defaults so that every path below returns a well-defined pair.
    release_date, corpus = None, None

    try:
        # The request is made inside the try block so that the network
        # error handlers below can actually catch its failures.
        res = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        soup = BeautifulSoup(res.content, 'lxml')

        corpus = _build_corpus(soup)
        release_date = _extract_release_date(soup)

    except RecursionError:
        # Parsing hit the recursion limit: give up on this filing.
        release_date, corpus = None, None

    except (requests.exceptions.ConnectionError,
            requests.exceptions.ChunkedEncodingError,
            requests.exceptions.Timeout):
        # Network trouble: report no data and back off before the next request.
        release_date, corpus = None, None
        time.sleep(50)

    # Short pause between consecutive requests.
    time.sleep(.5)

    return release_date, corpus

# Adapted from:
# https://github.com/areed1192/sigma_coding_youtube/tree/master/python/python-finance/sec-web-scraping

#### 2.4 - Extract Clean Text and Release Date from each Filing
> Run the extraction for every filing of each selected ticker and save the results to one CSV file per company.

In [13]:
# Tickers processed by the extraction loop in this run.
list(['ARE', 'AAPL'])

['ARE', 'AAPL']

In [14]:
# Tickers to process in this run
tickers = ['ARE', 'AAPL']

# Folder that receives one CSV file per company; created if it does not exist
output_dir = '../data/'
os.makedirs(output_dir, exist_ok=True)

# Loop through all selected tickers
for i, ticker in enumerate(tqdm(tickers)):

    # Select a subset DataFrame (per company)
    ticker_df = df.loc[df['company'] == ticker].copy()

    # Print company being analyzed
    print(ticker)

    # A ticker without filings would make zip(*...) yield nothing to unpack,
    # so there is nothing to extract or save for it.
    if not ticker_df.empty:

        # Download and parse every filing of this company (one request per row)
        extracted = ticker_df['complete_file_link'].progress_map(extract_corpus_date)

        # Create 2 new columns with release date and extracted text of the document
        ticker_df['release_date'], ticker_df['corpus'] = zip(*extracted)

        # Save files per company
        ticker_df.to_csv(os.path.join(output_dir, f'{ticker}.csv'), index_label=False)

    # Delete subset DataFrame
    del ticker_df

    # Clear memory every 50 tickers with Garbage Collector
    if i % 50 == 0:
        gc.collect()

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

ARE


HBox(children=(IntProgress(value=0, max=129), HTML(value='')))


AAPL


HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

KeyboardInterrupt: 

#### 2.5 - Concatenate File Chunks per Company
> Read every per-company CSV file written in the previous step and combine them into a single file.

In [5]:
file_path = '../data/'

# Per-company CSV files, sorted so the row order of the result is reproducible
# (os.listdir returns entries in arbitrary order).
csv_files = sorted(file for file in os.listdir(file_path) if file.endswith('.csv'))

# Read all files first and concatenate once; calling pd.concat inside the loop
# re-copies the accumulated rows on every iteration.
frames = [pd.read_csv(os.path.join(file_path, file)) for file in csv_files]

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

# Move the per-file index into a regular 'index' column and renumber the rows
df.reset_index(inplace=True)
df.to_csv('../clean_data/concat.csv', index_label=False)