In [1137]:
import os
import pickle
import pprint
import re
import webbrowser
from datetime import datetime

import dask
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar
from tqdm import tqdm_notebook as tqdm


In [1138]:
# Filing year handled by the read / extract / export cells below.
which_year = 2017
# Doubled to set dask's `num_workers` in the compute cells.
cpu_core = 6

In [1139]:
# Root folder holding the master CSV, the filing text files and the tmp/ and Extracted/ outputs.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_fp = '/Users/dylan/Downloads/10Q_item2'

# load master table

In [1140]:
# Master table of 10-Q filings (one row per filing, see the preview below).
pd_10Q = pd.read_csv(f'{data_fp}/10Q_master_v1.csv')

In [1141]:
# Normalize every column name to lower case.
pd_10Q.columns = [col_name.lower() for col_name in pd_10Q.columns]

In [1142]:
pd_10Q.head(1)

Unnamed: 0,unnamed: 0,cik,filing_date,fye,sic,ffind,file_name,n_words,n_unique_words,org_index,company_name,file_hash,file_hash_count
0,0,763950,2016-01-04,20150930,3823,37,/Users/dylan/Downloads/10Q_item2/10-X_C_2016-2...,8127,1144,943316,UNIVERSAL DETECTION TECHNOLOGY,0001078782-16-002115_3,1


In [1143]:
# FILING_DATE ('YYYY-MM-DD' strings) to datetime.
# pd.to_datetime is vectorized, does not shadow the builtin `str`, and is safe to
# re-run on a column that has already been converted.
pd_10Q.filing_date = pd.to_datetime(pd_10Q.filing_date, format='%Y-%m-%d')

In [1144]:
# Strip the absolute prefix recorded in the CSV so file names are relative to the data folder.
# Literal (non-regex) replacement through the vectorized .str accessor; no lambda shadowing `str`.
pd_10Q.file_name = pd_10Q.file_name.str.replace('/Users/dylan/Downloads/10Q_item2/', "", regex=False)

In [1145]:
# Give the first column (the unnamed index column read from the CSV) an explicit name.
pd_10Q = pd_10Q.rename( columns={pd_10Q.columns[0]:'pd_10q_idx'})

# helper func

## get url

In [1146]:
# Assemble the EDGAR index-page link from a filing's file name.
def get_edgar_link(fname, open_in_browser=True):
    '''
    Build the EDGAR filing-index URL for a local filing file name.

    The file name must contain `edgar_data_<cik>_<accession>_<n>.txt`; the URL is
    `<base>/<cik>/<accession without '-' and '_'>/<accession>-index.htm`.

    Parameters
    ----------
    fname : str
        File name (or path) of the filing text file.
    open_in_browser : bool
        When True, also open the link with `webbrowser.open`.

    Returns
    -------
    str
        The assembled link.

    Raises
    ------
    ValueError
        If `fname` does not contain the expected `edgar_data_..._N.txt` pattern.
    '''
    base_url = 'https://www.sec.gov/Archives/edgar/data/'
    # Raw string avoids invalid escape sequences; the dot before "txt" is matched literally.
    re_result = re.search(r'edgar_data_(\d+)_(.+)_(\d+)\.txt', fname)
    if re_result is None:
        raise ValueError(f'cannot parse an EDGAR file name from {fname!r}')
    cik = re_result.group(1)
    fname_major = re_result.group(2)
    # The archive folder is the accession number with its separators removed.
    accession_dir = re.sub(r'-|_', '', fname_major)
    edgar_link = base_url + cik + '/' + accession_dir + '/' + fname_major + '-index.htm'
    if open_in_browser:
        webbrowser.open(edgar_link)
    return edgar_link

In [1147]:
# An example: the first filing of the master table.
# Expected url: https://www.sec.gov/Archives/edgar/data/763950/000107878216002115/0001078782-16-002115-index.htm
# The assignment must be live, otherwise `print(a_file_name)` raises NameError on a fresh kernel.
a_file_name = pd_10Q.file_name.iloc[0]
print(a_file_name)
# get_edgar_link(a_file_name, False)

10-X_C_2016-2018/2016/QTR1/20160104_10-Q_edgar_data_763950_0001078782-16-002115_3.txt


## check the content

In [1148]:
def print_raw_str(idx):
    '''
    Check an example filing by positional index in `pd_10Q`.

    Prints the (relative) file name stored in `pd_10Q`, then the full raw text
    of the file. `file_name` is stored relative to `data_fp`, so the folder is
    prepended before opening, the same way `get_filing_str_l` does.
    '''
    fname = pd_10Q.iloc[idx]['file_name']
    print(fname)
    with open(f"{data_fp}/{fname}", 'r') as f:
        print(f.read())

Substitution fixes applied to list of doc strings.

# read files

1. Read in files as string
2. Remove all the exhibits
3. Remove the header section
4. Other

In [1149]:
# Raw strings keep `\d` / `\.` as regex escapes instead of invalid Python string escapes.
# Exhibit count announced in the document.
re_m_exbibit = re.compile(r'<N_Exhibits>(\d+)</N_Exhibits>')
# Any exhibit opening tag, up to the end of its line.
re_f_exbibit = re.compile(r'<EX-.*')
# Same, capturing the exhibit number (e.g. "10.1").
re_f_exbibit_d = re.compile(r'<EX-(\d+\.?\d*).*')

In [1150]:
pd_10Q.columns

Index(['pd_10q_idx', 'cik', 'filing_date', 'fye', 'sic', 'ffind', 'file_name',
       'n_words', 'n_unique_words', 'org_index', 'company_name', 'file_hash',
       'file_hash_count'],
      dtype='object')

In [1151]:
def get_filing_str_l(year):
    '''
    Read and pre-clean every filing of `pd_10Q` whose filing date falls in `year`.

    For each filing: read the text file under `data_fp`, cut everything from the
    first exhibit tag onwards (when the document announces exhibits), drop the
    header up to and including `</Header>`, and replace tabs with spaces.

    Returns
    -------
    filing_str_l : list of str
        Cleaned document strings, in table order.
    pd_idx_l : list
        Index label in `pd_10Q` of each document.
    idx_d : dict
        Maps a `pd_10Q` index label to the document's position in `filing_str_l`.
    '''
    # Select the year's rows once instead of rebuilding the boolean mask three times.
    year_rows = pd_10Q[pd_10Q.filing_date.dt.year == year]
    n_rows = len(year_rows)
    print("n_row:", n_rows)
    header_end = '</Header>'
    filing_str_l = []
    pd_idx_l = []
    idx_d = {}
    for i, (pd_idx, row) in tqdm(enumerate(year_rows.iterrows()), total=n_rows):
        # read file
        one_10q_fname = f"{data_fp}/{row['file_name']}"
        with open(one_10q_fname, 'r') as f:
            doc_str = f.read()
        # remove exhibits
        # by construction, we know every doc has a N_Exhibits tag
        # so the search should not return none
        n_exb = int(re_m_exbibit.search(doc_str).group(1))
        if n_exb > 0:
            exb_l = list(re_f_exbibit.finditer(doc_str))
            assert len(exb_l) == n_exb, (
                f"{one_10q_fname}: found {len(exb_l)} exhibit tags, N_Exhibits says {n_exb}")
            # keep only the text before the first exhibit
            doc_str = doc_str[:exb_l[0].start()]
        # remove header (everything up to and including the closing tag)
        doc_str = doc_str[(doc_str.index(header_end) + len(header_end)):]
        # replace tab with space
        doc_str = doc_str.replace('\t', ' ')
        # output
        filing_str_l.append(doc_str)
        pd_idx_l.append(pd_idx)
        idx_d[pd_idx] = i

    return filing_str_l, pd_idx_l, idx_d

In [1152]:
# Read and pre-clean all filings of `which_year`.
filing_str_l, pd_idx_l, idx_d = get_filing_str_l(which_year)

n_row: 19541


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, (pd_idx, row) in tqdm(enumerate(pd_10Q[pd_10Q.filing_date.dt.year==year].iterrows()), total=len(pd_10Q[pd_10Q.filing_date.dt.year==year])):


HBox(children=(FloatProgress(value=0.0, max=19541.0), HTML(value='')))




# Normalization

In [1153]:
# (compiled pattern, replacement) pairs, applied in order by `apply_sub_fixs`.
# Raw strings keep `\s`, `\W` and `\g<n>` as regex syntax instead of invalid Python escapes.
spell_fixes = [
    # the first two rules keep capitalization of the leading letter
    (re.compile(r'(i)(\s*?t\s*?e\s*?m)', re.I|re.M), r'\g<1>tem'),  # "i t e   m" -> "item"
    (re.compile(r'(i)(tem\s+?s)([^\w\-])', re.I|re.M), r'\g<1>tems\g<3>'),  # "item   s" -> "items"
    # "b u s i n e s s" -> "business": `\s{,4}?` allows 0 to 4 whitespace characters between
    # letters; the trailing non-word character is kept, the original capitalization is not.
    (re.compile(r'(b?\s{,4}?u\s{,4}?s\s{,4}?i\s{,4}?n\s{,4}?e\s{,4}?s\s{,4}?s)(\W)', re.I|re.M), r'business\g<2>'),
]

In [1154]:
def apply_sub_fixs(str_l, fix_l, squeezed=False):
    '''
    Apply regex substitutions to every document string, in place.

    `fix_l` is a sequence of (compiled pattern, replacement) pairs applied in
    order; pass `squeezed=True` when `fix_l` is a single such pair.
    The input list `str_l` is overwritten element by element and returned.
    '''
    rules = [fix_l] if squeezed else fix_l
    progress = tqdm(enumerate(str_l), total=len(str_l))
    for pos, text in progress:
        for pattern, replacement in rules:
            text = pattern.sub(replacement, text)
        str_l[pos] = text
    return str_l

# save all processed filing_str into tmp

In [1206]:
# save temp results
def save_tmp(data_fp, year_start, year_end):
    '''
    Pre-process every year in [year_start, year_end] and pickle the result.

    using both:
        read: get_filing_str_l
        normalize: apply_sub_fixs
    save to tmp folder: `<data_fp>/tmp/filing_<year>.data` holds the list
    `[filing_str_l, pd_idx_l, idx_d]` for that year.

    NOTE(review): `get_filing_str_l` reads files from the notebook-level
    `data_fp`, not from this function's `data_fp` argument.
    '''
    tmp_dir = f"{data_fp}/tmp"
    # exist_ok avoids the check-then-create race and the `== False` comparison
    os.makedirs(tmp_dir, exist_ok=True)
    for y in tqdm(range(year_start, year_end+1)):  # tqdm 1
        print('-------- processing: ',y)
        filing_str_l, pd_idx_l, idx_d = get_filing_str_l(y)  # tqdm 2
        filing_str_l = apply_sub_fixs(filing_str_l, spell_fixes)  # tqdm 3
        with open(f"{tmp_dir}/filing_{y}.data", 'wb') as f:
            pickle.dump([filing_str_l, pd_idx_l, idx_d], f)

        # Sanity check on one randomly picked filing: it must belong to year `y`.
        # `pd_idx_l` holds index labels, so look the row up with .loc (not .iloc);
        # skip the check when the year has no filings.
        if len(pd_idx_l) > 0:
            sampled_row = pd_10Q.loc[np.random.choice(pd_idx_l)]
            assert sampled_row.filing_date.year == y
        

In [1207]:
# Pre-process and cache the 2016 and 2017 filings.
save_tmp(data_fp, 2016, 2017)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for y in tqdm(range(year_start, year_end+1)):     # tqdm 1


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

n_row: 20370


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, (pd_idx, row) in tqdm(enumerate(pd_10Q[pd_10Q.filing_date.dt.year==year].iterrows()), total=len(pd_10Q[pd_10Q.filing_date.dt.year==year])):


HBox(children=(FloatProgress(value=0.0, max=20370.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, doc_str in tqdm(enumerate(str_l), total=len(str_l)):


HBox(children=(FloatProgress(value=0.0, max=20370.0), HTML(value='')))


-------- processing:  2016
n_row: 19541


HBox(children=(FloatProgress(value=0.0, max=19541.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19541.0), HTML(value='')))


-------- processing:  2017



# Extract

Identify the start of the Item 2 section by searching for characteristic patterns. These patterns were discovered by trial and error.

load temp results:

In [1158]:
# Path of the temp pickle written by `save_tmp` for the year under study.
year = which_year
s1_fname = f'{data_fp}/tmp/filing_{year}.data'

In [1159]:
# Reload the cached documents for `year`.
# NOTE(review): pickle.load executes arbitrary code — only load files produced by `save_tmp`.
with open(s1_fname, 'rb') as f:
    filing_str_l, pd_idx_l, idx_d = pickle.load(f)

In [1160]:
# number of 10-Q documents loaded for this year; sizes span_pd / status_pd below
n_docs = len(filing_str_l)
#n_docs

In [1161]:
#filing_str_l[0][:1000]  # origin; \n (new line)

In [1162]:
# print one
#print(filing_str_l[3])

### Method

View a document as a character sequence. 

Given a document and a particular subsequence of characters in the document, the span of the subsequence is an integer tuple  (𝑥,𝑦)  where  𝑥  marks the start position of the subsequence in the document while  𝑦  marks the end position.

There are 2 interesting types of spans for each document:

+ The span of the title marking the start of Item 2(Part I).

+ The span of the title marking the end of the desired content. 

`span_pd`: So prepare 2 columns in `span_pd` which stores the span values. 

`status_pd`: For each type of spans, it is possible that we can't find any matched subsequence. So `status_pd` records detailed failure information for debugging.

In [1163]:
# One row per document, built from a dict {doc position: [column 0 value, column 1 value]}.
# span_pd: column 0 = span of the Item 2 start title, column 1 = span of the end title.
# status_pd: column 0 / 1 = status code of the start / end search; both start at 0.
span_pd = pd.DataFrame.from_dict({i: [(0, 0), (0, 0)] for i in range(n_docs)}, orient='index')
status_pd = pd.DataFrame.from_dict({i: [0, 0] for i in range(n_docs)}, orient='index')

status codes: 
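+ -1, no Item 2 title was matched by any pattern (set by `find_item2_start`; reported below as "confirmed no item 2");
+ 0, initial value: the doc has not been visited by the extract function yet, or it failed so far with no clue of the reason;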
+ 1, succeeded; 
+ 2, promising, but need further check; 
+ 3, need special treatment;
+ 4, has confirmed that there is no item 1 section;

In [1164]:
display(span_pd.head(1), status_pd.head(1))

Unnamed: 0,0,1
0,"(0, 0)","(0, 0)"


Unnamed: 0,0,1
0,0,0


## Start of Item2

Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations. 


For letter:

very common ones: `Item 2`, `ITEM 2`, `Item 2.`, `ITEM 2.`...
1. the number of spaces varies
2. it must be located at the head of a line
3. after `item 2` there may be a variable number of spaces, or `\n`



` MANAGEMENT S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS` (may have \n inside this title)

` Management s Discussion...`(may have \n inside this title)

`Management's...`



For position:

A document may contain 1 or 2 matches (one of them may be in the table of contents / index);

    + if 2 are found, use the later one;
    + if 0 are found, it is an error


+ rule1; match: 
    - 1.lower case; `item 2`, `item no.2`
    - 2.must at head of a line;
    - 3.could possible followed by `,` `:` `.` `\n` `\-`

Watch Out, not all 10-Q documents contain item 2 (part 1)

IDEA:

strict+general to strict+specific

1. item 2 + literal

2. literal

### estimate idx location

#### get loc of start of index

In [1165]:
# "item 2" (optionally "item no. 2") at a line start or after a wide gap, followed by a
# "Management ... Discussion ..." (or "trustee ...") title; case-insensitive.
# Raw string: regex escapes are no longer invalid Python string escapes.
pattern_idx_st = re.compile(r"(^\s*?|\s{4,}|\n+)((i)(tem)\s*?n?o?\s*?\.?\s*s?\s*?)(2)\s*?([,: .\n\-\s]+)([a-zA-Z\s\,\.]{0,50})\s{1,}(Management|trustee)([a-zA-Z\s\']{1,10})\s*?(Discussion)([a-zA-Z\s]{10,90})(\s{4,}|\n+|\.+)", re.I|re.M) # the first time show "item"


In [1166]:
def estimate_index_loc(max_i):
    '''
    Start offsets of the SECOND `pattern_idx_st` match in the first `max_i` documents.

    Documents with fewer than two matches (and positions beyond the end of
    `filing_str_l`) are skipped, so the result can be shorter than `max_i`.
    '''
    idx_l = []
    for i in tqdm(range(max_i)):
        try:
            second_match = list(pattern_idx_st.finditer(filing_str_l[i]))[1]
        except IndexError:
            # Only the expected "not enough matches / no such document" case is skipped;
            # a bare `except` would also swallow KeyboardInterrupt and real bugs.
            continue
        idx_l.append(second_match.span(0)[0])
    return idx_l
            

In [1167]:
#locs = np.array(estimate_index_loc(3000))

#print('idx_min = ',locs.min(),'\n',
#      '0.01q = ',np.quantile(locs,0.01) )
      

#### get loc of end of index

In [1168]:
# not necessary to be very strict
# using "PART I" followed by punctuation/whitespace (so "PART II" does not match)
# Raw string: regex escapes are no longer invalid Python string escapes.
pattern_idx_end = re.compile(r"(^\s*?|\s{4,}|\n+)(PART I)([,: .\n\-\s]+)(\s{4,}|\n+|\.+)", re.I|re.M)

In [1169]:
def estimate_index_loc2(max_i):
    '''
    Start offsets of the SECOND `pattern_idx_end` match in the first `max_i` documents.

    Documents with fewer than two matches (and positions beyond the end of
    `filing_str_l`) are skipped, so the result can be shorter than `max_i`.
    '''
    idx_l2 = []
    for i in tqdm(range(max_i)):
        try:
            second_match = list(pattern_idx_end.finditer(filing_str_l[i]))[1]
        except IndexError:
            # Only the expected "not enough matches / no such document" case is skipped;
            # a bare `except` would also swallow KeyboardInterrupt and real bugs.
            continue
        idx_l2.append(second_match.span(0)[0])
    return idx_l2


In [1170]:
#locs2 = np.array(estimate_index_loc2(3000))


In [1171]:

#print('idx_min = ',locs2.min(),'\n',
#      '0.9q = ',np.quantile(locs2,0.9) )
  

In 90% of the documents, the content begins before position 9273.

In 99% of the documents, Item 2 starts after position 7945.

So the threshold is set to 8000.

### begin of item 2 content

In [1172]:
# pattern 1 ("hybrid"): "item 2" followed by a title starting with "Management"/"trustee"
# or with "discussion"; case-insensitive.
# Raw string: regex escapes are no longer invalid Python string escapes.

pattern1 = re.compile(r"(^\s*?|\s{4,}|\n+)((i)(tem)\s*?n?o?\s*?\.?\s*s?\s*?)(2)\s*?([,: .\n\-\s]+)([a-zA-Z\s\,\.]{0,50})\s{1,}(((Management|trustee)([a-zA-Z\s\']{10,90}))|((discussion)([a-zA-Z\s]{10,80})))(\s{4,}|\n+|\.+)", re.I|re.M)

pattern1_1 = re.compile("item", re.I|re.M)


In [1173]:
# pure literal, match "management discussion and analysis of financial  CONDITION AND RESULTS OF OPERATIONS "
# case 1: do not write "item 2" before title
# case 2: item 1, 2, 3, and 4....like this
# Raw string: regex escapes are no longer invalid Python string escapes.
pattern2 = re.compile(r"(^\s*?|\s{4,}|\n+)(Management|trustee)([a-zA-Z\s\']{1,10})\s*?\n*?(Discussion)(\s+|\n+)(and)(\s+|\n+)(analysis)(\s+|\n+)(of)(\s+|\n+)(financial)(\s+|\n+)(condition)(\s+|\n+)(and)(\s+|\n+)(results)(\s+|\n+)(of)(\s+|\n+)(operations)(\s{2,}|\n+|\.+)", re.I|re.M)
pattern2_1 = re.compile("Management|trustee", re.I|re.M)



In [1174]:
def _record_item2_start(i, match, warn_msg, fail_msg):
    '''
    Store the outcome for one candidate Item 2 title match of document i.

    start > 8000  : span saved, status 1
    start > 2500  : span saved, status 2, `warn_msg` printed
    otherwise     : span left untouched, status 3, `fail_msg` printed
    '''
    start = match.span(0)[0]
    if start > 8000:
        span_pd.loc[i, 0] = match.span(0)
        status_pd.loc[i, 0] = 1
    elif start > 2500:
        span_pd.loc[i, 0] = match.span(0)  # kept for further check
        status_pd.loc[i, 0] = 2
        print(warn_msg)
    else:
        status_pd.loc[i, 0] = 3  # span is not updated
        print(fail_msg)


@dask.delayed
def find_item2_start(i):
    '''
    Locate the start of Item 2 in the i-th document.

    Returns nothing; writes span_pd[i, 0] and status_pd[i, 0].
    `pattern1` is tried first: with several matches the second one is used,
    with exactly one match that one is used. Without any `pattern1` match the
    first `pattern2` (title only) match is used. When nothing matches, the
    status is set to -1.
    '''
    target = filing_str_l[i]

    hybrid_matches = list(pattern1.finditer(target))

    # several "item 2 + title" matches: take the second one
    if len(hybrid_matches) > 1:
        _record_item2_start(
            i, hybrid_matches[1],
            f"WARNING: hybrid, more than 2 found, location of item 2 < 8000 in filing_str_l[{i}]",
            f"FAIL: hybrid, more than 2 found, location of item 2 < 2500 in filing_str_l[{i}]",
        )
        return

    # a single "item 2 + title" match
    if len(hybrid_matches) == 1:
        _record_item2_start(
            i, hybrid_matches[0],
            f"WARNING: hybrid, only 1 found, location of item 2 < 8000, in filing_str_l[{i}]",
            f"FAIL: hybrid, only 1 found, location of item 2 < 2500 in filing_str_l[{i}]",
        )
        return

    # fall back to the literal title; take its first match
    literal_matches = list(pattern2.finditer(target))
    if len(literal_matches) >= 1:
        _record_item2_start(
            i, literal_matches[0],
            f"WARNING: literally, location of letter < 8000 in filing_str_l[{i}]",
            f"FAIL: literally, location of letter < 2500 in filing_str_l[{i}]",
        )
        return

    status_pd.loc[i, 0] = -1
    print(f'FAIL: no "item2" was found in filing_str_l[{i}]')



In [1175]:
# refresh: rebuild both tables with their initial values ((0, 0) spans, status 0)
# so that a new search run does not see results of a previous one
span_pd = pd.DataFrame.from_dict({i: [(0, 0), (0, 0)] for i in range(n_docs)}, orient='index')
status_pd = pd.DataFrame.from_dict({i: [0, 0] for i in range(n_docs)}, orient='index')



In [1176]:
# Build (lazily, via dask.delayed) one start-search task per document.
# Only the first 300 documents are covered; all others keep status 0.
update_tasks_s = [find_item2_start(i) for i in tqdm(range(len(filing_str_l[:300])))]
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  update_tasks_s = [find_item2_start(i) for i in tqdm(range(len(filing_str_l[:300])))]


HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




In [1177]:
# Run the start-search tasks; they fill span_pd / status_pd column 0 as a side effect.
with ProgressBar():
    dask.compute(update_tasks_s, num_workers=cpu_core*2)

FAIL: no "item2" was found in filing_str_l[107]
[##                                      ] | 6% Completed |  0.4sFAIL: no "item2" was found in filing_str_l[88]
FAIL: no "item2" was found in filing_str_l[105]
[#########                               ] | 24% Completed |  1.1sFAIL: no "item2" was found in filing_str_l[258]
[#################                       ] | 43% Completed |  1.7sFAIL: hybrid, only 1 found, location of item 2 < 2500 in filing_str_l[252]
[#############################           ] | 74% Completed |  3.1sFAIL: no "item2" was found in filing_str_l[131]
[################################        ] | 81% Completed |  3.3sFAIL: no "item2" was found in filing_str_l[25]
[########################################] | 100% Completed |  4.0s


In [1178]:
# Count the documents per start-search status code (column 0 of status_pd).
status_labels = [
    ('no update recorded:          ', 0),
    ('succeeded:                   ', 1),
    ('updated,          loc < 8000 :', 2),
    ('marked as failed, loc < 2500: ', 3),
    ('confirmed no item 2:          ', -1),
]
print(''.join(f"{label}{sum(status_pd[0] == code)}\n" for label, code in status_labels))

no update recorded:          19241
succeeded:                   288
updated,          loc < 8000 :5
marked as failed, loc < 2500: 1
confirmed no item 2:          6



confirmed no item 2:

3 , 2654 , 62, 177, 355, 414,

11 has been split

355 no item 2

#### manually check

In [1179]:
def manual_check(check_l):
    '''
    Generator over documents to inspect by hand.

    For every position in `check_l` yields the tuple
    (position, newline character, document text).
    '''
    yield from ((doc_pos, '\n', filing_str_l[doc_pos]) for doc_pos in check_l)

In [1180]:
#check_size = 5
#check_list = np.random.choice(np.array(status_pd.loc[:,0][status_pd[0] == 3].index), size = check_size, replace = True)

In [1181]:
#check = manual_check(check_list)

In [1182]:
# run this
#print(check.__next__())

## End of Match

In [1183]:
# item 3: "item 3" followed by a title starting with "quantitative" or "qualitative".
# Raw string: regex escapes are no longer invalid Python string escapes.
rule1 = re.compile(r"(^\s*?|\s{2,}|\n+)((i)(tem)\s*?n?o?\s*?\.?\s*s?\s*?)(3)\s*?([,: .\n\-])([\s\-]*?)([a-zA-Z\s\,\.]{0,50})\s{1,}(((quantitative)([a-zA-Z\s\']{10,90}))|((qualitative)([a-zA-Z\s]{10,80})))(\s{4,}|\n+|\.+)", re.I|re.M)



In [1184]:
# item 4: "item 4" followed by "controls ..." or "procedures".
# filings sometimes carry a wrong item number here, so 3 and 4T are accepted as well
# Raw string: regex escapes are no longer invalid Python string escapes.
rule2 = re.compile(r"(^\s*?|\s{2,}|\n+)((i)(tem)\s*?n?o?\s*?\.?\s*s?\s*?)(4|3|4T)\s*?([,: .\n\-])([a-zA-Z\s\,\.]{0,50})([\s|\-]*?)(((controls)([a-zA-Z\s]{10,20}))|((procedures)))(\s{4,}|\n+|\.+)", re.I|re.M)


In [1185]:
# part 2: the "PART II ... OTHER INFORMATION" heading
# Raw string: regex escapes are no longer invalid Python string escapes.
rule3 = re.compile(r"(^\s*?|\s{2,}|\n+)((part)\s*?(ii))\s*?([,: .\n\-])([\s|\-]*?)(other)\s*?([,: .\n\-])(information)(\s{4,}|\n+|\.+)", re.I|re.M)



In [1186]:
# title only (no "item" prefix), item 3
# Raw strings: regex escapes are no longer invalid Python string escapes.
rule4a = re.compile(r"(^\s*?|\s{4,}|\n+)(quantitative)(\s+|\n+)(and)(\s+|\n+)(qualitative)(\s+|\n+)(disclosures)(\s+|\n+)(about)(\s+|\n+)(market)(\s+|\n+)(risk)(\s{2,}|\n+|\.+)", re.I|re.M)
# title only, item 4
rule4b = re.compile(r"(^\s*?|\s{4,}|\n+)(controls)(\s+|\n+)(and)(\s+|\n+)(procedures)(\s{2,}|\n+|\.+)", re.I|re.M)

In [1187]:
# End-of-Item-2 patterns, in the order `find_item2_end` tries them.
rules = [rule1, rule2, rule3, rule4a, rule4b]

In [1188]:
def estimate_item3_loc(max_i):
    '''
    Start offsets of the SECOND `rule1` (item 3) match in the first `max_i` documents.

    Documents with fewer than two matches (and positions beyond the end of
    `filing_str_l`) are skipped, so the result can be shorter than `max_i`.
    '''
    idx_l2 = []
    for i in tqdm(range(max_i)):
        try:
            second_match = list(rule1.finditer(filing_str_l[i]))[1]
        except IndexError:
            # Only the expected "not enough matches / no such document" case is skipped;
            # a bare `except` would also swallow KeyboardInterrupt and real bugs.
            continue
        idx_l2.append(second_match.span(0)[0])
    return idx_l2


In [1189]:
#locs_it3 = np.array(estimate_item3_loc(3000))


In [1190]:
#print('idx_min = ',locs_it3.min(),'\n',
#      '0.01q = ',np.quantile(locs_it3,0.01) )

In [1191]:
@dask.delayed
def find_item2_end(i):
    '''
    Locate the end of Item 2 in the i-th document.

    Skipped (with an "ignore" message) unless the start search ended with
    status 1 or 2. Otherwise the patterns in `rules` are tried in order and
    the first match that starts after the Item 2 start is stored in
    span_pd[i, 1] with status_pd[i, 1] = 1.
    If no such match exists and the last rule matched at least once,
    status_pd[i, 1] is set to 2; if the last rule did not match at all, the
    status is left unchanged.
    '''
    target = filing_str_l[i]

    if status_pd.loc[i, 0] not in [1, 2]:
        print(f"ignore{i}")
        return

    last_rule = len(rules) - 1
    for n_rule, rule in enumerate(rules):
        candidates = list(rule.finditer(target))
        for mo in candidates:  # mo means match object
            # first candidate located after the start title wins
            if mo.span(0)[0] > span_pd.loc[i, 0][0]:
                span_pd.loc[i, 1] = mo.span(0)
                status_pd.loc[i, 1] = 1  # success
                return
        # every candidate of this rule lies before the start title
        if candidates and n_rule == last_rule:
            status_pd.loc[i, 1] = 2  # further checking
            print(f'all match fail in filing_str_l[{i}]')


In [1192]:
# Build (lazily, via dask.delayed) one end-search task per document; first 300 documents only.
update_tasks_e = [find_item2_end(i) for i in range(len(filing_str_l[:300]))]

In [1193]:
# Run the end-search tasks; they fill span_pd / status_pd column 1 as a side effect.
with ProgressBar():
    dask.compute(update_tasks_e, num_workers=cpu_core*2)

[                                        ] | 0% Completed |  0.4signore258
ignore25
[#############                           ] | 34% Completed |  1.6signore107
[###############                         ] | 39% Completed |  1.8signore131
[####################                    ] | 50% Completed |  2.5signore105
[########################                ] | 60% Completed |  3.1signore252
[############################            ] | 70% Completed |  3.4signore88
[########################################] | 100% Completed |  4.4s


In [1194]:
span_pd.head(10)

Unnamed: 0,0,1
0,"(28769, 28883)","(45462, 45542)"
1,"(48635, 48732)","(75427, 75512)"
2,"(28666, 28768)","(42607, 42683)"
3,"(20665, 20758)","(35146, 35222)"
4,"(10159, 10257)","(25283, 25354)"
5,"(88751, 88856)","(185870, 185948)"
6,"(72565, 72662)","(134006, 134077)"
7,"(47692, 47805)","(89147, 89233)"
8,"(30082, 30183)","(58277, 58351)"
9,"(26482, 26589)","(55222, 55320)"


In [1195]:
status_pd.head(10)

Unnamed: 0,0,1
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


# Export Results  

In [1196]:
# Attach each document's index label in pd_10Q (same order as filing_str_l).
status_pd['pd_idx'] = pd_idx_l

In [1197]:
pd_10Q.head(2)

Unnamed: 0,pd_10q_idx,cik,filing_date,fye,sic,ffind,file_name,n_words,n_unique_words,org_index,company_name,file_hash,file_hash_count
0,0,763950,2016-01-04,20150930,3823,37,10-X_C_2016-2018/2016/QTR1/20160104_10-Q_edgar...,8127,1144,943316,UNIVERSAL DETECTION TECHNOLOGY,0001078782-16-002115_3,1
1,1,62234,2016-01-05,20151126,7830,7,10-X_C_2016-2018/2016/QTR1/20160105_10-Q_edgar...,10023,1343,943326,MARCUS CORP,0001144204-16-074058_1,1


In [1198]:
# get filing url
# One column-wise apply instead of an iterrows loop with a .loc write per row.
pd_10Q['edgar_url'] = pd_10Q['file_name'].apply(lambda fname: get_edgar_link(fname, False))

In [1199]:
pd_10Q[pd_10Q.filing_date.dt.year==2018].head(3)

Unnamed: 0,pd_10q_idx,cik,filing_date,fye,sic,ffind,file_name,n_words,n_unique_words,org_index,company_name,file_hash,file_hash_count,edgar_url
39911,39911,1392694,2018-01-02,20170331,7310,34,10-X_C_2016-2018/2018/QTR1/20180102_10-Q_edgar...,7864,1110,1001185,"Surge Holdings, Inc.",0001493152-18-000030_1,1,https://www.sec.gov/Archives/edgar/data/139269...
39912,39912,1606364,2018-01-02,20171031,1000,28,10-X_C_2016-2018/2018/QTR1/20180102_10-Q_edgar...,10664,1547,1001186,GARMATEX HOLDINGS LTD.,0001062993-17-005393_1,1,https://www.sec.gov/Archives/edgar/data/160636...
39913,39913,1619227,2018-01-02,20170930,7370,34,10-X_C_2016-2018/2018/QTR1/20180102_10-Q_edgar...,6963,1009,1001187,"Cloudweb, Inc.",0001640334-18-000029_1,1,https://www.sec.gov/Archives/edgar/data/161922...


In [1200]:
# Output locations for the year under study.
# it2_file_init is the folder prefix `save_item2` writes its item2_<file_hash>.txt files to.
# NOTE(review): meta_fname is not used by any cell visible here.
meta_fname = f'{data_fp}/Extracted/{which_year}/Extracted_Meta.csv'
it2_file_init = f'{data_fp}/Extracted/{which_year}/'

In [1237]:
def save_item2(fp, pd_10Q, filing_str_l):
    '''
    Write the extracted Item 2 text of each `which_year` filing to its own file.

    For every filing whose start and end status are both 1 or 2, the text
    between the start of the start title and the start of the end title is
    taken, runs of whitespace are collapsed to single spaces, and the result is
    written to `<it2_file_init>item2_<file_hash>.txt` when longer than 200 characters.

    NOTE(review): `fp` is never used; the output folder comes from the
    notebook-level `data_fp`, `which_year` and `it2_file_init`, and
    `idx_d`, `status_pd`, `span_pd` are also read from notebook scope.
    NOTE(review): `os` is used here but is not imported in the visible import cell.
    '''
    # make sure the output folder exists
    if os.path.exists(f'{data_fp}/Extracted/{which_year}/'):
        pass
    else:
        os.makedirs(f'{data_fp}/Extracted/{which_year}/')
    print("saving item2 txt files .......")
    subpd = pd_10Q[pd_10Q.filing_date.dt.year == which_year].reset_index()
    # NOTE(review): the range below assumes the year's `pd_10q_idx` values are
    # consecutive and equal to the index labels of `pd_10Q` — confirm.
    idx_start = subpd.loc[0, 'pd_10q_idx']
    idx_end = subpd.shape[0] + idx_start
    for i in tqdm(range(idx_start, idx_end)): # number of 10q for a year
        # j: position of this filing in filing_str_l / span_pd / status_pd
        j = idx_d[pd_10Q.loc[i, 'pd_10q_idx']]
        if (status_pd.iloc[j, 0] in [1, 2]) & (status_pd.iloc[j, 1] in [1, 2]): # start and end both success
            file_hash = pd_10Q['file_hash'][i]
            start = span_pd.iloc[j, 0][0]
            end = span_pd.iloc[j, 1][0]
            doc_str = filing_str_l[j][start:end]
            # collapse every whitespace run (including newlines) into one space
            doc_str = re.sub('\s+', ' ', doc_str)
            # very short extractions are not written
            if len(doc_str) > 200:
                with open(it2_file_init+f"item2_{file_hash}.txt", "w") as f:
                    f.write(doc_str)
                