# Part II. Text Preprocessing 

#### **Stage 1:** Identifying MD&A Sections
- Constructing regex patterns to locate the start and end position of the Management's Discussion and Analysis (MD&A) sections in the financial reports
- Testing these on the selected samples 
- Storing these positions associated with each file in a data structure that makes sense
- Applying on all data collected

#### **Stage 2:** Preprocessing texts in MD&A 
- Splitting into sentences - EDA statistics: sentence length, most frequent words, word clouds, etc. 
- Filtering sentences containing key words (revenue, growth, price, cost, profit / driven by, due to, etc.)
- Storing the relevant sentences (?) in a separate data file with position pointers linking to the original reports (?)

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import time
import os

pd.set_option('max_colwidth',None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 0)


In [132]:
path = "C:/Users/clair/Desktop/Thesis/masterThesis2022/Data/Samples/"

file_name = "GOOG/10-Q/0001652044-21-000057/filing-details.html" 
file_html = path+file_name

# Use Beautiful Soup to process the HTML docs

with open(file_html) as f:
    soup_html = BeautifulSoup(f, 'html.parser')


soup_text = soup_html.get_text(strip=True)
soup_text =soup_text.replace('\n',' ').replace('\xa0',' ')

print(soup_html.title)
print('Raw len = {}, vs clean len = {} / {}'.format(len(soup_html.text), len(soup_html.get_text(strip=True)),len(soup_text)))


# Patterns to locate the MD&A Section - 1st Iteration

pattern_start = re.compile(r"Item\s?\d?.?\s*Management[’|']s Discussion and Analysis of Financial Condition and Results of Operations",flags=re.IGNORECASE)#Management's Discussion and Analysis of")
pattern_end = re.compile(r"[Item\s?\d[A-Z]?.?\s*]?Quantitative and Qualitative Disclosures about Market",flags=re.IGNORECASE)

print("Starting matches:")
starts = re.finditer(pattern_start, soup_text)
for i in starts:
    print(i)

print("\nEnding matches:")
ends = re.finditer(pattern_end, soup_text)
for i in ends:
    print(i)

starts = re.finditer(pattern_start, soup_text)
start = [*starts][1].start()

ends = re.finditer(pattern_end, soup_text)
end = [*ends][1].start()

print('\n\nStarting position = {}\n {} \n...\n {}\nEnding posisition ={}\n\n{}'.format(start, soup_text[start:end][:500],soup_text[start:end][-500:],end,soup_text[end:end+100]))

<title>goog-20210930</title>
Raw len = 182810, vs clean len = 180910 / 180910
Starting matches:
<re.Match object; span=(37834, 37925), match='Item 2Management’s Discussion and Analysis of Fin>
<re.Match object; span=(117959, 118051), match='ITEM 2.MANAGEMENT’S DISCUSSION AND ANALYSIS OF FI>

Ending matches:
<re.Match object; span=(37931, 37986), match=' 3Quantitative and Qualitative Disclosures About >
<re.Match object; span=(170658, 170713), match='3.QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT >
<re.Match object; span=(170939, 170995), match='or quantitative and qualitative disclosures about>
<re.Match object; span=(171026, 171082), match='A, Quantitative and Qualitative Disclosures About>


Starting position = 117959
 ITEM 2.MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONSPlease read the following discussion and analysis of our financial condition and results of operations together with our consolidated financial statements and related notes in

In [187]:
file_name = "YUM/10-K/0001041061-01-500003/filing-details.html" 
#file_name = "GOOG/10-Q/0001652044-21-000057/filing-details.html" 
#file_name = "GOOG/10-K/0001652044-16-000012/filing-details.html" 

file_html = path+file_name

# Modified Patterns to locate the MD&A Section - 2nd Iteration

pattern_start = re.compile(r"Item\s?\d?.?\s*Management[’|']s Discussion and Analysis of Financial Condition and Results of Operations",flags=re.IGNORECASE)#Management's Discussion and Analysis of")
pattern_end = re.compile(r"Item\s?\d[A-Z]?.?\s*?Quantitative and Qualitative Disclosures about Market",flags=re.IGNORECASE)


with open(file_html) as f:
    soup_html = BeautifulSoup(f, 'html.parser')

soup_text = soup_html.get_text(strip=True)
soup_text =soup_text.replace('\n',' ').replace('\xa0',' ')

print(soup_html.title)
#print('Raw len = {}, vs clean len = {} / {}'.format(len(soup_html.text), len(soup_html.get_text(strip=True)),len(soup_text)))


print("Starting matches:")
starts = re.finditer(pattern_start, soup_text)
for i in starts:
    print(i)

print("\nEnding matches:")
ends = re.finditer(pattern_end, soup_text)
for i in ends:
    print(i)

starts = [*re.finditer(pattern_start, soup_text)]
print('len(starts)=',len(starts))

if len(starts)>1:
    start = starts[1].start()
elif len(starts) ==1:
    start = starts[0].start()
else:
    pattern_start = re.compile(r"Management[’|']s Discussion and Analysis of Financial Condition and Results of Operations",flags=re.IGNORECASE)#Management's Discussion and Analysis of")
    
    print("Starting matches:")
    starts = re.finditer(pattern_start, soup_text)
    for i in starts:
        print(i)

    starts = [*re.finditer(pattern_start, soup_text)]
    print('len(starts)=',len(starts))

    if len(starts) ==1:
        start = starts[0].start()


ends = [*re.finditer(pattern_end, soup_text)]
print('len(ends)=',len(ends))

if len(ends)>1:
    end = ends[1].start()
elif len(ends) ==1:
    end = ends[0].start()
else:
    pattern_end = re.compile(r"Quantitative and Qualitative Disclosures about Market",flags=re.IGNORECASE)

    print("\nEnding matches:")
    ends = re.finditer(pattern_end, soup_text)
    
    for i in ends:
        print(i)

    ends = [*re.finditer(pattern_end, soup_text)]
    print('len(ends)=',len(ends))

    if len(ends) ==1:
        end = ends[0].start()
    
    
print('\n\nStarting position = {}\n {} \n...\n {}\nEnding posisition ={}\n\n{}'.format(start, soup_text[start:end][:500],soup_text[start:end][-500:],end,soup_text[end:end+100]))

<title>Tricon Form 10K 2000</title>
Starting matches:
<re.Match object; span=(59381, 59482), match="Item 7.         Management's Discussion and Analy>

Ending matches:
len(starts)= 1
len(ends)= 0

Ending matches:
<re.Match object; span=(126174, 126227), match='Quantitative and Qualitative Disclosures About Ma>
len(ends)= 1


Starting position = 59381
 Item 7.         Management's Discussion and Analysis of Financial Condition and Results of Operations.IntroductionTRICON Global Restaurants, Inc. and Subsidiaries (collectively referred to as "TRICON" or the "Company") is comprised of the worldwide operations of KFC, Pizza Hut and Taco Bell ("the Concepts") and is the world's largest quick service restaurant ("QSR") company based on the number of system units. Separately, each brand ranks in the top ten among QSR chains in U.S. system sales and  
...
 lieve that the most critical activity regarding the conversion for our businesses is the completion of the rollout of Euro-ready point-of-s

In [313]:
# Verbose version

def identify_MDA(file_name):

    file_html = path+file_name
    with open(file_html) as f:
        soup_html = BeautifulSoup(f, 'html.parser')

    soup_text = soup_html.get_text(strip=True)
    soup_text =soup_text.replace('\xa0','').replace('\n','')
    
    
    pattern_toc = re.compile(r"(table of contents)|(index)", flags=re.IGNORECASE) 
    toc = re.search(pattern_toc, soup_text)
    ix = len(soup_html.find_all('ix:header'))


    pattern_start = re.compile(r"(?<![\"|“|'])Item\s?\d?.?\s*Management[’|']s[\s]*Discussion[\s]*and[\s]*Analysis[\s]*of[\s]*Financial[\s]*Condition[\s]*and[\s]*Results[\s]*of[\s]*Operations",flags=re.IGNORECASE)#Management's Discussion and Analysis of")
    starts = [*re.finditer(pattern_start, soup_text)]
    if len(starts) == 0:
        pattern_start = re.compile(r"(?<![\"|“|'])Management[’|']s[\s]*Discussion[\s]*and[\s]*Analysis[\s]*of[\s]*Financial[\s]*Condition[\s]*and Results[\s]*of[\s]*Operations",flags=re.IGNORECASE)#Management's Discussion and Analysis of")
        starts = [*re.finditer(pattern_start, soup_text)]

    if len(starts) == 1:
        start = starts[0].start()
    
    elif len(starts) > 1:        
        if toc:
            start = starts[1].start()
        else:
            start = 0
            print('\n>>>>>>NO TOC and MORE THAN 1 START POSITIONS!!!<<<<<<\n')
            pass #TODO
    else: 
        start = 0
        print('\n>>>>>>COULD NOT FIND ANY START POSITION!!!<<<<<<\n')
        pass #TODO

    
    pattern_end = re.compile(r"(?<![\"|“|'])Item\s?\d[A-Z]?.?\s*Quantitative[\s]*and[\s]*Qualitative[\s]*Disclosure[s]?[\s]*about[\s]*Market",flags=re.IGNORECASE)
    ends = [*re.finditer(pattern_end, soup_text)]
    if len(ends) == 0:
        pattern_end = re.compile(r"(?<![\"|“|'])Quantitative[\s]*and[\s]*Qualitative[\s]*Disclosure[s]?[\s]*about[\s]*Market",flags=re.IGNORECASE)
        ends = [*re.finditer(pattern_end, soup_text)]

    if len(ends) == 1:
        end = ends[0].start()
    
    elif len(ends) > 1:
        if toc:
            end = ends[1].start()
        else:
            for i in len(ends):
                if ends[i].start() > start + 10000:
                    end = ends[i].start()
                    break
            print('\n>>>>>>NO TOC and MORE THAN 1 END POSITIONS!!!<<<<<<\n')
            pass #TODO
        
    else:
        end = min(start+50000, len(soup_text))
        print('\n>>>>>>COULD NOT FIND ANY END POSITION!!!<<<<<<\n')
        pass #TODO
        
        # if end position is not found (some companies did not have Quantitiative and Qualitative section in earlier reports) 
        # then end = start + 50000
    
    
    print(soup_html.title.text,'IX=',ix,'size=',len(soup_text),'toc=',toc,'\n')

    print("Starting matches:")
    for i in starts:
        print(i,'\n...',soup_text[i.start()-100:i.start()+100])

    print("\nEnding matches:")
    for i in ends:
        print(i,'\n...',soup_text[i.start()-100:i.start()+100])

    print('\n\n{}\n----------------------->>> Starting position = {} <<<---------------------\n {} \n...\n {}\n----------------------->>> Ending posisition = {} <<<---------------------\n{}'.format(soup_text[start-100:start], start, soup_text[start:start+500],soup_text[end-500:end],end,soup_text[end:end+100]))
    if end<start:
        print("END > START !!!")
    print('\n============================================================================\n\n\n')
    
    return start, end



In [312]:

file_names = ["MRK/10-Q/0000310158-00-500003/filing-details.html",
              "MRK/10-Q/0000950123-10-102135/filing-details.html",
              "MRK/10-Q/0000310158-21-000009/filing-details.html"
]

for file_name in file_names:
    identify_MDA(file_name)
    



>>>>>>COULD NOT FIND ANY END POSITION!!!<<<<<<

             UNITED STATES SECURITIES AND EXCHANGE COMMISSION IX= 0 size= 42681 toc= None 

Starting matches:
<re.Match object; span=(22410, 22501), match="Item 2.Management's Discussion and Analysis of Fi> 
...  sheet from which it has been derived./s/Deloitte & Touche LLPParsippany, New JerseyNovember 7, 2000Item 2.Management's Discussion and Analysis of Financial Condition and Results ofOperationsResults o

Ending matches:


 sheet from which it has been derived./s/Deloitte & Touche LLPParsippany, New JerseyNovember 7, 2000
----------------------->>> Starting position = 22410 <<<---------------------
 Item 2.Management's Discussion and Analysis of Financial Condition and Results ofOperationsResults of Operations - three and nine months ended September 30, 2000 compared with the corresponding periods in 1999.Net SalesConsolidated net sales for the third quarter totaled $2.4 billion, an increase of $158 million or 7 percent compared wit

In [306]:
file_names = ["YUM/10-K/0001041061-01-500003/filing-details.html", 
              "YUM/10-Q/0001564590-16-029416/filing-details.html",
              "YUM/10-K/0001564590-21-009460/filing-details.html",
              
              "GOOG/10-Q/0001652044-21-000057/filing-details.html",
              "GOOG/10-K/0001652044-19-000004/filing-details.html",
              "GOOG/10-K/0001652044-21-000010/filing-details.html",

              "MRK/10-Q/0000310158-00-500003/filing-details.html",
              "MRK/10-K/0000310158-18-000005/filing-details.html",
              "MRK/10-K/0000310158-21-000004/filing-details.html",
              
              'D/10-Q/0000215466-12-000006/filing-details.html',
              'D/10-Q/0001564590-21-054856/filing-details.html',
              'D/10-K/0000882184-17-000103/filing-details.html'
             ]
record={}
for file_name in file_names:
    record[file_name] = identify_MDA(file_name)
    


Tricon Form 10K 2000 IX= 0 size= 348904 toc= <re.Match object; span=(129121, 129126), match='INDEX'> 

Starting matches:
<re.Match object; span=(58694, 58786), match="Item 7. Management's Discussion andAnalysis of Fi> 
...  and 1996 as our capital  structure as an  independent,  publicly owned company     did not exist.17Item 7. Management's Discussion andAnalysis of Financial Condition and Results of Operations.Introdu

Ending matches:
<re.Match object; span=(124716, 124769), match='Quantitative and Qualitative Disclosures About Ma> 
... uld have a material adverse impact on our results of operations,financial condition or cash flows.35Quantitative and Qualitative Disclosures About Market Risk of FinancialInstrumentsMarket Risk of Fin


 and 1996 as our capital  structure as an  independent,  publicly owned company     did not exist.17
----------------------->>> Starting position = 58694 <<<---------------------
 Item 7. Management's Discussion andAnalysis of Financial Condition and R

Document IX= 0 size= 318909 toc= <re.Match object; span=(5404, 5421), match='Table of Contents'> 

Starting matches:
<re.Match object; span=(5861, 5952), match='Item7.Management’s Discussion and Analysis of Fin> 
... lated Stockholder Matters and Issuer Purchases of Equity Securities21Item6.Selected Financial Data24Item7.Management’s Discussion and Analysis of Financial Condition and Results of Operations25Item7A.
<re.Match object; span=(110018, 110110), match='ITEM 7.MANAGEMENT’S DISCUSSION AND ANALYSIS OF FI> 
... 44Total stockholders’ equity$103,860$120,331$139,036$152,502$177,62824Table of ContentsAlphabet Inc.ITEM 7.MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONSPlease r

Ending matches:
<re.Match object; span=(5954, 6014), match='Item7A.Quantitative and Qualitative Disclosures A> 
...  Data24Item7.Management’s Discussion and Analysis of Financial Condition and Results of Operations25Item7A.Quantitative and Qualitative Disclosures About Market

mrk-20201231 IX= 1 size= 732205 toc= <re.Match object; span=(166430, 166447), match='Table of Contents'> 

Starting matches:
<re.Match object; span=(166835, 166927), match='Item 7.Management’s Discussion and Analysis of Fi> 
...  Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities45Item 7.Management’s Discussion and Analysis of Financial Condition and Results of Operations47Item 7
<re.Match object; span=(360757, 360849), match='Item 7.Management’s Discussion and Analysis of Fi> 
... rates it by reference into a filing under the Securities Act or the Exchange Act.46Table of ContentsItem 7.Management’s Discussion and Analysis of Financial Condition and Results of Operations.The fol

Ending matches:
<re.Match object; span=(166929, 166990), match='Item 7A.Quantitative and Qualitative Disclosures > 
... ties45Item 7.Management’s Discussion and Analysis of Financial Condition and Results of Operations47Item 7A.Quantitative and Qualitative Disclos

Document IX= 0 size= 341493 toc= <re.Match object; span=(3932, 3949), match='TABLE OF CONTENTS'> 

Starting matches:
<re.Match object; span=(4262, 4354), match='ITEM 7.Management’s Discussion and Analysis of Fi> 
... lated Stockholder Mattersand Issuer Purchases of Equity Securities23ITEM 6.Selected Financial Data25ITEM 7.Management’s Discussion and Analysis of Financial Condition and Results of Operations26ITEM 7
<re.Match object; span=(89795, 89887), match='ITEM 7.MANAGEMENT’S DISCUSSION AND ANALYSIS OF FI> 
... ludes both homebuilding notes payable and amounts outstanding on our mortgage repurchase facility.25ITEM 7.MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONSResults 

Ending matches:
<re.Match object; span=(4356, 4417), match='ITEM 7A.Quantitative and Qualitative Disclosures > 
... Data25ITEM 7.Management’s Discussion and Analysis of Financial Condition and Results of Operations26ITEM 7A.Quantitative and Qualitative Disclosures About Market 

In [355]:
# Clean version

def identify_MDA_save(file_name):
    
    with open(file_name) as f:
        soup_html = BeautifulSoup(f, 'html.parser')

    soup_text = soup_html.get_text(strip=True)
    soup_text = soup_text.replace('\xa0','').replace('\n','')
    
    pattern_toc = re.compile(r"(table of contents)|(index)", flags=re.IGNORECASE) 
    toc = re.search(pattern_toc, soup_text)
    ix = len(soup_html.find_all('ix:header'))

    pattern_start = re.compile(r"(?<![\"|“|'])Item\s?\d?.?\s*Management[’|']s[\s]*Discussion[\s]*and[\s]*Analysis[\s]*of[\s]*Financial[\s]*Condition[\s]*and[\s]*Results[\s]*of[\s]*Operations",flags=re.IGNORECASE)#Management's Discussion and Analysis of")
    starts = [*re.finditer(pattern_start, soup_text)]
    if len(starts) == 0:
        pattern_start = re.compile(r"(?<![\"|“|'])Management[’|']s[\s]*Discussion[\s]*and[\s]*Analysis[\s]*of[\s]*Financial[\s]*Condition[\s]*and Results[\s]*of[\s]*Operations",flags=re.IGNORECASE)#Management's Discussion and Analysis of")
        starts = [*re.finditer(pattern_start, soup_text)]

    if len(starts) == 1:
        start = starts[0].start()
    
    elif len(starts) > 1:        
        if toc:
            start = starts[1].start()
        else:
            start = 0
    else: 
        start = 0

    
    pattern_end = re.compile(r"(?<![\"|“|'])Item\s?\d[A-Z]?.?\s*Quantitative[\s]*and[\s]*Qualitative[\s]*Disclosure[s]?[\s]*about[\s]*Market",flags=re.IGNORECASE)
    ends = [*re.finditer(pattern_end, soup_text)]
    if len(ends) == 0:
        pattern_end = re.compile(r"(?<![\"|“|'])Quantitative[\s]*and[\s]*Qualitative[\s]*Disclosure[s]?[\s]*about[\s]*Market",flags=re.IGNORECASE)
        ends = [*re.finditer(pattern_end, soup_text)]

    if len(ends) == 1:
        end = ends[0].start()
    
    elif len(ends) > 1:
        if toc:
            end = ends[1].start()
        else:
            for i in len(ends):
                if ends[i].start() > start + 10000:
                    end = ends[i].start()
                    break
        
    else:
        end = min(start+50000, len(soup_text))
    
    return ix, start, end


In [333]:
# Add Google old files (before name change) to the sample 2

sample2 = pd.read_csv(path[:-8]+'statistics_samples2.csv')

ticker = 'GOOG_1288776'
K = os.listdir(path+ticker+"/10-K/")
Q = os.listdir(path+ticker+"/10-Q/")
add_google = pd.DataFrame({'ticker':[ticker], '10K_files':[K], '10Q_files':[Q], 'k_count':[len(K)], 'q_count':[len(Q)]})

sample2 = sample2.append(add_google)
sample2.set_index('ticker', inplace=True)

# Delete AVGO since it has been added to the IT sample
sample2.drop('AVGO',axis='index',inplace=True)

sample2.index

Index(['OGN', 'NI', 'GIS', 'D', 'GL', 'IEX', 'GOOG', 'SIVB', 'YUM',
       'GOOG_1288776'],
      dtype='object', name='ticker')

In [402]:
# Start Preprocessing

from ast import literal_eval 
# pandas store list as string; need to convert back

temp = []
for ticker in sample2.index:
    for doc in literal_eval(sample2.loc[ticker,'10K_files']):
        if len(doc)!=0:
            d = {}
            d['ticker'] = ticker
            d['type']='10K'
            d['file']= doc
            folder = path+ticker+"/10-K/"+doc+"/"
            file = os.listdir(folder)[0]
            file_name = folder+"/"+file
            d['ix'], d['start'], d['end'] = identify_MDA_save(file_name)
            temp.append(d)
    print(d)
    for doc in literal_eval(sample2.loc[ticker,'10Q_files']):
        d = {}
        d['ticker'] = ticker
        d['type']='10Q'
        d['file']= doc
#        file_name = path+ticker+"/10-Q/"+doc+'/filing-details.html'
        folder = path+ticker+"/10-Q/"+doc+"/"
        file = os.listdir(folder)[0]
        file_name = folder+"/"+file
        d['ix'], d['start'], d['end'] = identify_MDA_save(file_name)
        temp.append(d)
    print(d)
    
df = pd.DataFrame(temp)

df.to_pickle(path[:-8]+'/sample2_scan.pkl')


{'ticker': 'GOOG_1288776', 'type': '10Q', 'file': '0001288776-15-000046', 'ix': 0, 'start': 87124, 'end': 132338}
{'ticker': 'OGN', 'type': '10Q', 'file': '0001821825-21-000005', 'ix': 1, 'start': 104964, 'end': 134120}
{'ticker': 'NI', 'type': '10K', 'file': '0000893220-02-000181', 'ix': 0, 'start': 51419, 'end': 101419}
{'ticker': 'NI', 'type': '10Q', 'file': '0000893220-01-500163', 'ix': 0, 'start': 62250, 'end': 115571}
{'ticker': 'GIS', 'type': '10K', 'file': '0000897101-02-000567', 'ix': 0, 'start': 0, 'end': 41001}
{'ticker': 'GIS', 'type': '10Q', 'file': '0000897101-01-500115', 'ix': 0, 'start': 20026, 'end': 31816}
{'ticker': 'D', 'type': '10K', 'file': '0000929638-17-000398', 'ix': 0, 'start': 8785, 'end': 8878}
{'ticker': 'D', 'type': '10Q', 'file': '0001224952-11-000006', 'ix': 0, 'start': 41849, 'end': 89982}
{'ticker': 'GL', 'type': '10K', 'file': '0000931763-02-000748', 'ix': 0, 'start': 75679, 'end': 125679}
{'ticker': 'GL', 'type': '10Q', 'file': '0000931763-01-500605'

In [418]:
# Statistics Overview

# df = pd.read_pickle(path[:-8]+'/sample2_scan.pkl')

df['len'] = df['end'] - df['start']

df.describe()

Unnamed: 0,ix,start,end,len
count,738.0,738.0,738.0,738.0
mean,0.169377,86667.017615,155928.838753,69261.821138
std,0.375339,70979.879171,91618.922076,47968.202704
min,0.0,0.0,1745.0,-107727.0
25%,0.0,46657.75,98541.75,41239.5
50%,0.0,66810.0,139889.0,60193.0
75%,0.0,104968.5,193682.5,84251.0
max,1.0,513509.0,629820.0,266196.0


In [413]:
print('Number of financial documents by ticker')
df[['ticker','file']].groupby(by='ticker').count()

Number of financial documents by ticker


Unnamed: 0_level_0,file
ticker,Unnamed: 1_level_1
D,130
GIS,81
GL,83
GOOG,25
GOOG_1288776,47
IEX,82
NI,99
OGN,2
SIVB,85
YUM,104


In [424]:
print('% of documents with MD&A sections PROBABLY identified')
df[['ticker','file']][(df['start']>10000)&(df['len']>10000)].groupby(by='ticker').count()/df[['ticker','file']].groupby(by='ticker').count()

% of documents with MD&A sections PROBABLY identified


Unnamed: 0_level_0,file
ticker,Unnamed: 1_level_1
D,0.815385
GIS,0.925926
GL,1.0
GOOG,1.0
GOOG_1288776,0.914894
IEX,0.902439
NI,0.848485
OGN,1.0
SIVB,0.941176
YUM,0.932692


In [451]:
total = len(df)
start_NOT = sum(df['start']==0)
end_NOT = sum(df['len']==50000)
start_miss = sum((df['start']<10000)&(df['start']>0))
end_miss = sum((df['start']>10000)&(df['len']<10000))


print("Number of files with START position NOT identified: #{} ({:.1f}% of total)".format(start_NOT,start_NOT/total*100))
print("Number of files with START position mis-identified: #{} ({:.1f}% of total)".format(start_miss,start_miss/total*100))

print("Number of files with END position NOT identified: #{} ({:.1f}% of total)".format(end_NOT,end_NOT/total*100))
print("Number of files with END position mis-identified: #{} ({:.1f}% of total)".format(end_miss,end_miss/total*100))


Number of files with START position NOT identified: 6 (0.8% of total)
Number of files with START position mis-identified: 42 (5.7% of total)
Number of files with END position NOT identified: 12 (1.6% of total)
Number of files with END position mis-identified: 21 (2.8% of total)


In [453]:
df_miss = df[(df['start']<10000) | (df['len']<10000) | (df['len']==50000)]

print('Overview of mis-identifications: #{} ({:.1f}% of total)'.format(len(df_miss),len(df_miss)/total*100))

df_miss

Overview of mis-identifications: #78 (10.6% of total)


Unnamed: 0,ticker,type,file,ix,start,end,len
2,NI,10K,0001174947-21-000255,1,242357,292357,50000
3,NI,10K,0001111711-21-000010,1,190463,82736,-107727
4,NI,10K,0001174947-20-000354,0,161745,211745,50000
6,NI,10K,0001174947-19-000367,0,151584,201584,50000
8,NI,10K,0001174947-18-000312,0,138139,188139,50000
10,NI,10K,0001174947-17-000571,0,137284,187284,50000
24,NI,10K,0000893220-02-000181,0,51419,101419,50000
53,NI,10Q,0001174947-17-000865,0,2899,142989,140090
62,NI,10Q,0001111711-14-000052,0,2592,194485,191893
63,NI,10Q,0001111711-14-000036,0,2593,177544,174951


### Financial Numbers Extraction

Note: For most companies, only available for the last 2-3 years (from 2019 onwards) after adoption of Inline XBRL standard.

In [6]:
# Revenue Extraction

print('Revenue Extractoin')
for i in soup_html.find_all('ix:nonfraction',{'name':'us-gaap:Revenues'}):
    context_id = i['contextref']
    context = soup_html.find_all('xbrli:context', {'id': context_id})[0]
    period = context.find('xbrli:period')
    start_date = period.find('xbrli:startdate').text
    end_date = period.find('xbrli:enddate').text

    if context.find('xbrli:segment'):
        label = ['Segment']
        for l in context.find_all('xbrldi:explicitmember'):
            label.append(l.text)
    else:
        label = 'Total'

    print(i.text, start_date, end_date, label)


# Operating Profit Extracton

print("\n\nOperating Profit Extraction")
for i in soup_html.find_all('ix:nonfraction',{'name':'us-gaap:OperatingIncomeLoss'}):
    context_id = i['contextref']
    context = soup_html.find_all('xbrli:context', {'id': context_id})[0]
    period = context.find('xbrli:period')
    start_date = period.find('xbrli:startdate').text
    end_date = period.find('xbrli:enddate').text

    if context.find('xbrli:segment'):
        label = ['Segment']
        for l in context.find_all('xbrldi:explicitmember'):
            label.append(l.text)
    else:
        label = 'Total'

    print(i.text, start_date, end_date, label)


# us-gaap:CostOfRevenue - typically only at the company level; not disclosed for segments
# us-gaap:CostsAndExpenses - typically only at the company level
# us-gaap:NetIncomeLoss - typically only at the company level;

# Net Income/Loss Extraction only at the group level

print("\n\nNet Income/Loss Extraction")
for i in soup_html.find_all('ix:nonfraction',{'name':'us-gaap:NetIncomeLoss'}):
    context_id = i['contextref']
    context = soup_html.find_all('xbrli:context', {'id': context_id})[0]
    period = context.find('xbrli:period')
    start_date = period.find('xbrli:startdate').text
    end_date = period.find('xbrli:enddate').text

    if not context.find('xbrli:segment'):
        print(i.text, start_date, end_date)



Revenue Extractoin


Operating Profit Extraction
11,213 2020-07-01 2020-09-30 Total
21,031 2021-07-01 2021-09-30 Total
25,573 2020-01-01 2020-09-30 Total
56,829 2021-01-01 2021-09-30 Total
14,453 2020-07-01 2020-09-30 ['Segment', 'goog:GoogleServicesSegmentMember', 'us-gaap:OperatingSegmentsMember']
23,973 2021-07-01 2021-09-30 ['Segment', 'goog:GoogleServicesSegmentMember', 'us-gaap:OperatingSegmentsMember']
35,540 2020-01-01 2020-09-30 ['Segment', 'goog:GoogleServicesSegmentMember', 'us-gaap:OperatingSegmentsMember']
65,862 2021-01-01 2021-09-30 ['Segment', 'goog:GoogleServicesSegmentMember', 'us-gaap:OperatingSegmentsMember']
1,208 2020-07-01 2020-09-30 ['Segment', 'goog:GoogleCloudSegmentMember', 'us-gaap:OperatingSegmentsMember']
644 2021-07-01 2021-09-30 ['Segment', 'goog:GoogleCloudSegmentMember', 'us-gaap:OperatingSegmentsMember']
4,364 2020-01-01 2020-09-30 ['Segment', 'goog:GoogleCloudSegmentMember', 'us-gaap:OperatingSegmentsMember']
2,209 2021-01-01 2021-09-30 ['Segment', '