# Connect to an IPython parallel cluster and obtain a load-balanced view.
# NOTE(review): `IPython.parallel` was split out into the separate
# `ipyparallel` package in IPython 4.0, so this import fails on newer
# installations. `IP_view` is not referenced in the visible part of this
# notebook -- confirm it is still needed before migrating.
from IPython.parallel import Client
IP_client = Client()
IP_view = IP_client.load_balanced_view()

In [None]:
# Import 
%run header.ipynb
%run datascience.ipynb
%run D1A.ipynb
%matplotlib inline

import shutil
from bs4 import BeautifulSoup

from pyedgar.utilities import plaintext
from pyedgar.utilities import forms
from pyedgar.utilities import edgarweb
from pyedgar.utilities import localstore
from pyedgar.utilities import htmlparse
from pyedgar.exceptions import *

import panda_cub
panda_cub.monkey_patch_pandas()
panda_cub.monkey_patch_seaborn()

In [None]:
# Make FEED_ROOT the working directory for the rest of the session.
# NOTE(review): this is a process-wide side effect; any relative path used
# later resolves against FEED_ROOT.
os.chdir(FEED_ROOT)

In [2]:
# Load one specific dated scrape checkpoint for inspection.
# NOTE(review): the path is hard-coded to a single date, and `df` is rebuilt
# from `tens` in a later cell, so this value does not feed the main pipeline.
df = pd.read_csv('/data/D1A/blobs.k.2016-07-24.csv.scrape')

In [3]:
len(df.cik.unique())

6259

# Output methods

In [4]:
# Form type processed by this run; selects the entry of ROOT that is used.
FORM = 'K'

# Output directory per form type: DATA_ROOT/K1A and DATA_ROOT/Q1A.
ROOT = dict((form_code, os.path.join(DATA_ROOT, form_code + '1A'))
            for form_code in 'K Q'.split())

# Template for dated result files; filled with the lower-cased form and a date.
FILENAME_TMPLT = os.path.join(DATA_ROOT, 'blobs.{}.{:%Y-%m-%d}.csv')


def FILENAME_TMP():
    """Return the dated result-file path for the current FORM and today's date."""
    return FILENAME_TMPLT.format(FORM.lower(), dt.date.today())

In [5]:
def statout(tmp):
    """Print row-level and CIK-level match statistics for a results frame.

    Parameters
    ----------
    tmp : DataFrame
        Must expose ``match_score`` and ``cik`` columns. Rows with
        ``match_score > 0`` count as found, rows with ``match_score <= 0``
        as missing (rows with a NaN score fall in neither group).

    Prints six lines (total / found / missing, first for rows and then for
    unique CIKs), each with a count and a percentage. Returns ``None``.
    An empty frame prints 0% for the found/missing lines instead of raising
    ``ZeroDivisionError``.
    """
    def _pct(part, whole):
        # Guard the empty-frame case; the original divided by len(tmp)/100.
        return 100.0 * part / whole if whole else 0.0

    line = "{:<23}{: >7,d}  {:2.0f}%"

    # Slice once instead of re-filtering the frame for every printed number.
    found = tmp[tmp.match_score > 0]
    missing = tmp[tmp.match_score <= 0]

    n_rows = len(tmp)
    print(line.format("Total:", n_rows, 100))
    print(line.format("Number found orders:", len(found), _pct(len(found), n_rows)))
    print(line.format("Number missing orders:", len(missing), _pct(len(missing), n_rows)))

    n_ciks = len(tmp.cik.unique())
    n_ciks_found = len(found.cik.unique())
    n_ciks_missing = len(missing.cik.unique())
    print(line.format("Number of CIKs:", n_ciks, 100))
    print(line.format("Number of CIKs found:", n_ciks_found, _pct(n_ciks_found, n_ciks)))
    print(line.format("Number of CIKs missing:", n_ciks_missing, _pct(n_ciks_missing, n_ciks)))

# Load 10-K list

In [8]:
# --- Compustat ---
# Read the fundamentals file, parse its date columns, derive firm age,
# fill industry codes within each firm, winsorize, and print a summary.
df_comp = pd.read_csv(os.path.join(SHARED_DATA_ROOT, 'funda.csv'))
for date_col in 'datadate rdq comp_start'.split():
    df_comp[date_col] = pd.to_datetime(df_comp[date_col])

# Time elapsed between comp_start and datadate, expressed in units of TD_DAY.
df_comp['comp_age'] = (df_comp['datadate'] - df_comp['comp_start']) / TD_DAY

# Fill missing `sich` within each gvkey: forward first, then backward.
for fill_direction in ('ffill', 'bfill'):
    df_comp['sich'] = df_comp.groupby('gvkey').sich.fillna(method=fill_direction)

# `winsor` is not a stock pandas method; presumably it is added by
# panda_cub.monkey_patch_pandas() -- verify.
winsor_columns = """at lt prcc_f cshpri txditc invt ppent pi ni sale
                   re act lct csho xrd ajex oibdp oancf wcap dp ib
                   xint pstk teq dlc dltt""".split()
df_comp.winsor(winsor_columns, inplace=True)

comp_post_2005 = df_comp[df_comp.fyear >= 2005]
print("Num: {:,}\tStart: {:%Y-%m-%d}\tEnd: {:%Y-%m-%d}".format(
      len(df_comp), df_comp.datadate.min(), df_comp.datadate.max()))
print("Num 10-Ks post 05: {:,d}\tCIKs: {:,d}".format(len(comp_post_2005),
                                                     len(comp_post_2005.cik.unique())))

Num: 237,720	Start: 1991-06-30	End: 2015-09-30
Num 10-Ks post 05: 89,687	CIKs: 13,541


In [9]:
# Load the index of 10-K/10-Q filings, keep those filed from 2005 onwards,
# and report how many K-type forms (and distinct CIKs) remain before and
# after restricting to CIKs that appear in Compustat with fyear >= 2005.
tens = pd.read_csv(os.path.join(IDX_ROOT, 'form_10.periods.csv'))
for c in 'filedate period'.split():
    tens[c] = pd.to_datetime(tens[c])

# Compare against a Timestamp rather than datetime.date: ordering comparisons
# between a datetime64 column and a plain date are rejected by newer pandas.
tens = (tens[tens.filedate >= pd.Timestamp('2005-01-01')]
            .sort_values('cik filedate'.split())
            .reset_index(drop=True))

sel = tens.form.str.contains('K')
_k_forms = tens[sel]
print('Forms: {}\tCIKs: {}'.format(_k_forms.cik.count(), len(_k_forms.cik.unique())))

keepciks = df_comp[df_comp.fyear >= 2005].cik.unique()
sel &= tens.cik.isin(keepciks)
_k_forms = tens[sel]
print('Forms: {}\tCIKs: {}'.format(_k_forms.cik.count(), len(_k_forms.cik.unique())))
linkhead(tens)

Forms: 101982	CIKs: 23252
Forms: 67648	CIKs: 9632


cik,name,form,filedate,accession,period,sic,ff12_name,ff12,ff48,ff48_name,links
20,K TRON INTERNATIONAL INC,10-K,2005-03-31,0000893220-05-000728,2005-01-01,3823,BusEq,6,37,LabEq,FTPHTML
20,K TRON INTERNATIONAL INC,10-Q,2005-05-05,0000893220-05-001038,2005-04-02,3823,BusEq,6,37,LabEq,FTPHTML
20,K TRON INTERNATIONAL INC,10-Q,2005-08-03,0000893220-05-001794,2005-07-02,3823,BusEq,6,37,LabEq,FTPHTML
20,K TRON INTERNATIONAL INC,10-Q,2005-10-31,0000893220-05-002440,2005-10-01,3823,BusEq,6,37,LabEq,FTPHTML
20,K TRON INTERNATIONAL INC,10-K,2006-03-23,0000893220-06-000650,2005-12-31,3823,BusEq,6,37,LabEq,FTPHTML


# Load 10-K/Q dataframes

In [8]:
# Base names of every extract already written for the current form type.
existing_files = [os.path.basename(found_path)
                  for found_path in localstore.walk_files(ROOT[FORM])]

# Each name is split on '_' after dropping its last five characters (the
# '.html' suffix in the names shown in this notebook); the untouched file
# name is kept as `path`.
_file_rows = []
for _fname in existing_files:
    _file_rows.append(_fname[:-5].split('_') + [_fname])

df_files = pd.DataFrame(_file_rows, columns='cik fdate accession path'.split())
df_files['cik'] = df_files.cik.astype(int)
df_files = df_files.drop('fdate', axis=1)
linkhead(df_files, n=1)

cik,accession,path,links
1053691,0001104659-15-025785,0001053691_2015-04-03_0001104659-15-025785.html,FTPHTML


In [15]:
# Build the work list for the current FORM: filings from `tens`, joined with
# the previously saved start/end counts and with the extracts found on disk.
df = tens[tens.form.str.contains(FORM)]
df = df.merge(pd.read_csv(os.path.join(DATA_ROOT, 'blobs.{}.csv'.format(FORM.lower())),
                          usecols='cik accession starts ends'.split()),
              on='cik accession'.split(), how='outer', suffixes=('', '_old'))
df = df.merge(df_files, on='cik accession'.split(), how='left')

# A saved count without a file on disk is not trusted: reset it so the filing
# is searched again. (`.loc` replaces `.ix`, which newer pandas has removed.)
df.loc[df.path.isnull(), 'starts ends'.split()] = np.nan
# NOTE(review): the label below says "10-Qs" regardless of FORM.
print("# of 10-Qs to search:", len(df))

# Default to "keep every CIK" so `_sel` is always defined; the summary below
# (and a later cell) read it even when the merge file does not exist.
_sel = df.cik.unique()
_merge_path = os.path.join(DATA_ROOT, 'all_data_merge.csv')
if os.path.exists(_merge_path):
    _tmpdf = pd.read_csv(_merge_path,
                         usecols='gvkey cik permno num_headers atq mcap'.split())
    # Keep only CIKs that have all three identifiers present.
    _has_ids = _tmpdf['gvkey cik permno'.split()].notnull().all(axis=1)
    _sel = _tmpdf[_has_ids].cik.unique()

    df = df[df.cik.isin(_sel)]
    print("# of filtered 10-Qs to search:", len(df))


def _pct(part, whole):
    """Percentage of `part` in `whole`; NaN instead of ZeroDivisionError."""
    return part / whole * 100 if whole else float('nan')


tmp = len(tens[tens.cik.isin(_sel)]), len(df), sum((df.starts==1) & (df.ends==1))
print("10s: {}\nKs:  {} ({:2.0f}%)\nFound: {} ({:2.0f}%)\nMissing: {} ({:2.0f}%)"
      .format(tmp[0], tmp[1], _pct(tmp[1], tmp[0]),
              tmp[2], _pct(tmp[2], tmp[1]), tmp[1]-tmp[2], _pct(tmp[1]-tmp[2], tmp[1])))
linkhead(df, n=1)

# of 10-Qs to search: 101982
# of filtered 10-Qs to search: 54447
10s: 212659
Ks:  54447 (26%)
Found: 38511 (71%)
Missing: 15936 (29%)


cik,name,form,filedate,accession,period,sic,ff12_name,ff12,ff48,ff48_name,starts,ends,path,links
20,K TRON INTERNATIONAL INC,10-K,2005-03-31,0000893220-05-000728,2005-01-01,3823,BusEq,6,37,LabEq,,,,FTPHTML


# Helper functions

In [16]:
# All patterns are raw strings so that regex escapes such as \s, \d and \D are
# not parsed as (invalid) Python string escapes.

# Text node that begins -- after optional whitespace, tags or &nbsp; -- with
# "item" followed by 1A, 1B or 2. The item number is captured in group 1.
RE_item = re.compile(r'^(?:\s|</?[a-z][^>]*>|&nbsp;)*'
                     r'item(?:[^a-z0-9]|&nbsp;)*(1[AB]|2)', re.I)

# "Item 1A ... Risk Factor(s)" at the start of the text (prefix match).
RE_1A = re.compile(r'^[^a-z0-9\r\n]*item[^a-z0-9]*1A[^a-z]*risk[^a-z]*factors?', re.I)
# Same heading with nothing but whitespace around it.
RE_1A_clean = re.compile(r'^\s*item[^a-z0-9]*1A[^a-z]*risk[^a-z]*factors?\s*$', re.I)
# Same heading followed by a number, as in a table-of-contents line.
RE_1A_toc = re.compile(r'^\s*item[^a-z0-9]*1A[^a-z]*risk[^a-z]*factors?[^a-z0-9]*\d+[^a-z0-9]*$', re.I)
# RE_1A_clean = re.compile(r'^\s*item\s+1A\s+risk\s*factors?\s*$', re.I)
# RE_1A_toc = re.compile(r'^\s*item\s+1A\s+risk\s*factors?\s*\d+\s*$', re.I)
# "Item 1B" followed only by the words unresolved / sec / staff / comments.
RE_1B = re.compile(r'^[^a-z0-9\r\n]*item[^a-z0-9]*1B[^a-z]*(?:(?:unresolved|sec|staff|comments)[^a-z]*)+$', re.I)
# "Item 2 ... Property/Properties" with at most 35 trailing non-digit characters.
RE_2 = re.compile(r'^\s*item[^a-z0-9]*2.{,20}propert(?:y|ies)\D{,35}$', re.I)
# Item line (optionally numbered 1B or 2) that ends in a number.
RE_1B2_toc = re.compile(r'^\s*item\s+(?:1B|2)?\s+.{1,40}\d+\s*$', re.I)

# Compiled once here instead of on every re_clean() call.
_RE_NON_ALNUM = re.compile(r'[^a-z0-9]+', re.I)


def re_clean(x):
    """Return `x` with every run of non-alphanumeric characters replaced by one space."""
    return _RE_NON_ALNUM.sub(' ', x)

In [18]:
# Tag names that by themselves mark their content as emphasized.
EMPH_TAGS = "b em strong h1 h2 h3 h4 h5 h6 u".split()
# Tag names that count as emphasized only when their style attribute says so.
FACE_TAGS = "p font div span li".split()
# Style fragments treated as emphasis.
EMPH_STYLE = re.compile(r"bold|underline", re.I)
# Tag names accepted as the container of a heading.
DIVIDER_STYLES = 'h1 h2 h3 h4 h5 h6 p div ul ol tr table'.split()


def find_items(soup, debug=False):
    """Mark the Item 1A heading and the heading that follows it inside `soup`.

    For every text node matching RE_item that has an emphasized ancestor, the
    nearest ancestor listed in DIVIDER_STYLES is taken as the heading
    container and its cleaned text is classified:

    * RE_1A match: a ``STARTHERE`` tag is inserted before the container.
    * RE_1B match, or RE_2 match while no end has been collected yet: an
      ``ENDHERE`` tag is inserted after the container.

    `soup` is modified in place. Returns ``(collect_1as, collect_next)``, the
    lists of start and end containers. With ``debug`` set, progress is printed.

    NOTE(review): a later cell of this notebook defines `find_items` again;
    whichever definition was executed last is the one callers get.
    """
    def _marks_emphasis(node):
        node_name = node.name.lower()
        if node_name in EMPH_TAGS:
            return True
        return bool(node_name in FACE_TAGS
                    and EMPH_STYLE.search(node.attrs.get('style', '')))

    collect_1as, collect_next = [], []
    for tag in soup.find_all(text=RE_item):
        # Only emphasized "Item ..." text is considered a heading candidate.
        if not any(_marks_emphasis(ancestor) for ancestor in tag.parents):
            continue
        if debug: print('Emphasized {!r}'.format(tag))

        # Nearest enclosing block-like element, if there is one.
        container = next((ancestor for ancestor in tag.parents
                          if ancestor.name.lower() in DIVIDER_STYLES), None)
        if container is None:
            if debug: print("Parent tag wasn't container of {!r}".format(tag))
            continue
        if debug: print('Container {} for {!r}'.format(container.name, tag))

        full_text = re_clean(container.get_text())
        if debug: print("Full text of {!r} is {!r}".format(tag, full_text))

        if RE_1A.search(full_text):
            collect_1as.append(container)
            container.insert_before(soup.new_tag('STARTHERE'))
            if debug: print("Inserted start before 1A: {}".format(str(container)[:50]))
            continue

        is_end = RE_1B.search(full_text) or (RE_2.search(full_text) and not collect_next)
        if is_end:
            collect_next.append(container)
            container.insert_after(soup.new_tag('ENDHERE'))
            if debug: print("Inserted end after {}".format(str(container)[:50]))
    return collect_1as, collect_next

In [27]:
def get_parent_names(tag):
    """Return the lower-cased names of `tag`'s ancestors, nearest first.

    The document-level wrappers ``body``, ``html`` and ``[document]`` are left
    out. The exclusion is applied to the lower-cased name, so it agrees with
    the lower-casing of the returned names.
    """
    wrapper_names = ('body', 'html', '[document]')
    lowered = (ancestor.name.lower() for ancestor in tag.parents)
    return [name for name in lowered if name not in wrapper_names]

In [19]:
def find_items(soup, debug=False):
    """Mark the Item 1A heading and the heading that follows it inside `soup`.

    Every text node matching RE_item that has an emphasized ancestor is a
    heading candidate. Its container is the nearest ancestor listed in
    DIVIDER_STYLES; when the candidate sits inside a table row, the enclosing
    ``tr`` is examined as well so that table-of-contents rows can be rejected
    and headings split across cells can still be recognised.

    * Container text matching RE_1A (and not RE_1A_toc): a ``STARTHERE`` tag
      is inserted before the container.
    * Container text matching RE_1B, or RE_2 while no end has been collected
      yet: an ``ENDHERE`` tag is inserted after the container.

    `soup` is modified in place. Returns ``(collect_1as, collect_next)``, the
    lists of start and end containers. With ``debug`` set, progress is printed.
    """
    heading_patterns = (RE_1A_clean, RE_1B, RE_2)

    collect_1as = []
    collect_next = []
    for tag in soup.find_all(text=RE_item):
        if debug: print(tag)
        for parent_tag in tag.parents:
            if parent_tag.name.lower() in EMPH_TAGS \
               or (parent_tag.name.lower() in FACE_TAGS
                   and EMPH_STYLE.search(parent_tag.attrs.get('style', ''))):
                if debug: print('\tEmphasized {!r}'.format(tag))
                # Then ITEM is emphasized. Break and skip the continue at the end of this loop.
                break
        else: continue

        # True when the candidate has a TR among its ancestors.
        has_tr = 'tr' in get_parent_names(tag)
        first_parent, row_parent = None, None
        for parent_tag in tag.parents:
            if parent_tag.name.lower() in DIVIDER_STYLES:
                if debug: print('\tContainer {} for {!r}'.format(parent_tag.name, tag))
                if first_parent is None:
                    first_parent = parent_tag
                    # Done unless a separate enclosing row still has to be found.
                    if not has_tr or parent_tag.name.lower() == 'tr':
                        break
                elif parent_tag.name.lower() == 'tr':
                    row_parent = parent_tag
                    break
        else:
            # No break, no dividers found. Mostly an error case.
            if debug: print("\tParent tag wasn't container of {!r}".format(tag))
            continue

        # At this point parent_tag ISN'T necessarily correct.
        full_text = re_clean(first_parent.get_text())
        if row_parent:
            row_full_text = re_clean(row_parent.get_text())
            if any(r.search(full_text) for r in heading_patterns):
                # Parent matches a heading; reject it if its row reads like a TOC entry.
                if RE_1A_toc.search(row_full_text) or RE_1B2_toc.search(row_full_text):
                    # Then we're in a table of contents. Continue
                    if debug: print("\t>Parent matched item but row matched TOC: {!r}"
                                    .format(row_full_text))
                    continue
                # otherwise the row parent doesn't matter. Set parent tag to first_parent
                parent_tag = first_parent
            # If parent doesn't match and row does, use row
            elif any(r.search(row_full_text) for r in heading_patterns):
                # Then parent tag is row_parent
                if debug: print("\t>Parent didn't match item but row did: {!r}"
                                .format(row_full_text))
                parent_tag = row_parent
                full_text = row_full_text
            # Otherwise continue to next tag.
            else:
                if debug: print("\t>Neither parent nor row matched")
                # Previously this branch fell through with parent_tag still
                # bound to the row and full_text taken from first_parent.
                continue
        else:
            parent_tag = first_parent

        # Now parent_tag should be the correct parent (if not, the code above continued)
        if debug: print("\tFull text of {!r} is {!r}".format(tag, full_text))

        if RE_1A.search(full_text) and not RE_1A_toc.search(full_text):
            collect_1as.append(parent_tag)
            parent_tag.insert_before(soup.new_tag('STARTHERE'))
            if debug: print("!\tInserted start before 1A: {!r}".format(str(parent_tag)[:50]))
        elif RE_1B.search(full_text) or (RE_2.search(full_text) and not collect_next):
            # Item 2 only serves as the end marker when no end was found before it.
            collect_next.append(parent_tag)
            parent_tag.insert_after(soup.new_tag('ENDHERE'))
            if debug: print("!\tInserted end after {!r}".format(str(parent_tag)[:50]))
    return collect_1as, collect_next

# Iterate and search 10-Ks

In [21]:
# Destination path template for one extract: zero-padded 10-digit CIK,
# filing date and accession number, stored under the current form's directory.
OUT_FPATTERN = os.path.join(ROOT[FORM], '{cik:010d}_{filedate:%Y-%m-%d}_{accession}.html')
OUT_FPATTERN

'/data/D1A/K1A/{cik:010d}_{filedate:%Y-%m-%d}_{accession}.html'

# Start a fresh log of newly written extract files ('w' truncates any
# previous log); the main loop appends one destination path per line.
# NOTE(review): the path is hard-coded to one user's desktop -- consider
# moving it into the configuration cell.
with open("/home/gaulinmp/Desktop/new_files.txt", 'w') as fh:
    fh.write('Start!\n')

In [22]:
def res_to_df(results):
    """Convert the scrape `results` mapping into a DataFrame.

    Parameters
    ----------
    results : dict
        Maps a destination file path to a dict that may hold ``'starts'`` and
        ``'ends'`` counts. An entry may be an empty dict.

    Returns
    -------
    DataFrame
        Columns ``index`` (the path), ``starts``, ``ends``, ``filename`` (last
        '/'-separated component of the path), ``cik`` (integer parsed from the
        part of the file name before the first '_') and ``accession`` (part
        after the last '_' with its final five characters, '.html', removed).

    ``reindex`` is used instead of column selection so that an empty
    `results`, or one holding only empty entries, yields a frame with NaN or
    no rows rather than a ``KeyError``. The main loop calls this from a
    ``finally`` clause, where such an error would hide the original exception.
    """
    _df = (pd.DataFrame.from_dict(results, orient='index')
              .reindex(columns='starts ends'.split())
              .reset_index())
    _df['filename'] = [path.split('/')[-1] for path in _df['index']]
    _df['cik'] = [int(name.split('_')[0]) for name in _df['filename']]
    _df['accession'] = [name.split('_')[-1][:-5] for name in _df['filename']]
    return _df

## Main Loop

In [24]:
# Sentinel list of length 99: the main loop stores len(starts)/len(ends), so a
# filing whose serialisation failed is recorded with counts of 99.
BAD_STARTEND = [None,]*99
# Filings still to process: every row whose `starts` is not positive
# (NaN compares False, so never-searched rows are included).
tmpdf = df[~(df.starts > 0)]
# Maps destination path -> {'starts': n, 'ends': m}; filled by the main loop.
results = {}

In [28]:
# Set True to print find_items diagnostics and to stop early (see loop end).
DEBUG = False

# For each pending filing: locate the source, parse it, mark the Item 1A
# section with find_items, and write the marked slice to dest_fname when
# exactly one start and one end were found. Counts go into `results`, which is
# saved to a '.scrape' CSV even if the loop is interrupted.
try:
    for i,row in tqdm(tmpdf.iterrows(), total=len(tmpdf)):
        src_fname = localstore.get_filing_path(row.cik, row.accession)
        if not os.path.exists(src_fname):
            # Source filing is not available locally.
            continue
        
        dest_fname = OUT_FPATTERN.format(**row)

        if (os.path.exists(dest_fname)
            or dest_fname in results):
            continue
        # Checks done, we have not done this form yet
        # The empty entry is created up front, so filings skipped below
        # (bad format, not HTML) stay in `results` without counts.
        results[dest_fname] = {}
        
        try:
            rtxt = forms.get_form(src_fname)
        except EDGARFilingFormatError:
            continue
        # Only the first 500 characters are checked for HTML tags.
        if not htmlparse.RE_HTML_TAGS.findall(rtxt, 0, 500):
            # Not HTML
            continue

        soup = BeautifulSoup(rtxt, 'lxml')

        # find_items inserts STARTHERE / ENDHERE marker tags into `soup`.
        starts, ends = find_items(soup, debug=DEBUG)
        istart,iend = -1, -1

        # Extract only when the section is unambiguous: one start, one end.
        if len(starts) == 1 and len(ends) == 1:
            try:
                prtxt = str(soup)
            except RuntimeError:
                # Some forms are so bad that beautiful soup craps out
                prtxt = ''
                # below we save len(starts/ends), which will be 99 if bad
                starts, ends = BAD_STARTEND, BAD_STARTEND
            
            # str.find returns -1 when a marker is absent from the serialised text.
            istart,iend = prtxt.find('<STARTHERE>'), prtxt.find('</ENDHERE>')
            
            
            if istart > 0 and iend > 0:
                # Slice from the start marker through the end marker's closing tag.
                onea = prtxt[istart:iend+len('</ENDHERE>')]

                with open(dest_fname, 'w') as fh:
                    fh.write(onea)
            
                # NOTE(review): hard-coded, user-specific log path.
                with open("/home/gaulinmp/Desktop/new_files.txt", 'a') as fh:
                    fh.write(dest_fname)
                    fh.write('\n')
        
        # Update the results with new starts and ends
        results[dest_fname].update({'starts':len(starts), 'ends':len(ends)})

        # NOTE(review): `i` is the DataFrame index label from iterrows(), not
        # the number of iterations performed.
        if DEBUG and i > 20: break
finally:
    # Runs on normal completion, on error and on interrupt.
    res_to_df(results).to_csv(FILENAME_TMP()+'.scrape', index=False)




In [31]:
# --- Add results to DataFrame ---
# Outer-join the new counts onto the work list; for each count keep the larger
# of the stored and the newly scraped value.
df_new = res_to_df(results)
del df_new['index']
df_new = df.merge(df_new, on='cik accession'.split(), how='outer', suffixes=('', '_new'))
for count_col in 'starts ends'.split():
    scraped_col = count_col + '_new'
    df_new[count_col] = df_new[[count_col, scraped_col]].max(axis=1)
    del df_new[scraped_col]

# --- Copy New Results to blobs (and backup old) ---
blob_fnam = os.path.join(DATA_ROOT, 'blobs.{}.csv'.format(FORM.lower()))

if os.path.exists(blob_fnam):
    # The backup is named after the existing file's modification date and is
    # never overwritten once it exists.
    _mtime = dt.datetime.fromtimestamp(os.path.getmtime(blob_fnam))
    backup_fnam = os.path.join(DATA_ROOT,
                               'blobs.{}.{:%Y-%m-%d}.csv'.format(FORM.lower(), _mtime))
    if not os.path.exists(backup_fnam):
        shutil.copy2(blob_fnam, backup_fnam)

# Same columns go to today's dated file and to the canonical blob file.
_saved = df_new['cik name form filedate accession period starts ends'.split()]
_saved.to_csv(FILENAME_TMP(), index=False)
_saved.to_csv(blob_fnam, index=False)

# --- Summarize ---
# NOTE(review): `_sel` comes from the cell that builds `df`.
tmp = len(tens[tens.cik.isin(_sel)]), len(df_new), sum(df_new.starts>0)
_n_tens, _n_forms, _n_found = tmp
_n_missing = _n_forms - _n_found
print("10s: {}\nKs:  {} ({:2.0f}%)\nFound: {} ({:2.0f}%)\nMissing: {} ({:2.0f}%)"
      .format(_n_tens, _n_forms, _n_forms/_n_tens*100,
              _n_found, _n_found/_n_forms*100, _n_missing, _n_missing/_n_forms*100))

10s: 212659
Ks:  54447 (26%)
Found: 43432 (80%)
Missing: 11015 (20%)


cik,name,form,filedate,accession,period,sic,ff12_name,ff12,ff48,ff48_name,starts,ends,path,filename,links
20,K TRON INTERNATIONAL INC,10-K,2005-03-31,0000893220-05-000728,2005-01-01,3823,BusEq,6,37,LabEq,,,,,FTPHTML
20,K TRON INTERNATIONAL INC,10-K,2006-03-23,0000893220-06-000650,2005-12-31,3823,BusEq,6,37,LabEq,1.0,1.0,0000000020_2006-03-23_0000893220-06-000650.html,,FTPHTML
20,K TRON INTERNATIONAL INC,10-K,2007-03-09,0000893220-07-000678,2006-12-30,3823,BusEq,6,37,LabEq,1.0,1.0,0000000020_2007-03-09_0000893220-07-000678.html,,FTPHTML


# One off tester

Bad CIKS: 2186, 

In [227]:
# Filings recorded with exactly one start marker but no extract on disk,
# restricted to CIKs above 4447.
# NOTE(review): sorts on `datadate`, a column the `df` built earlier in this
# notebook does not have -- this cell appears to come from an older session.
linkhead(df[(df.starts == 1) & df.path.isnull() & (df.cik > 4447)].sort_values('cik datadate'.split()))

permno,gvkey,cik,datadate,fyearq,fyr,fqtr,form,filedate,period,accession,starts,ends,path,links
11144,1585,5981,2005-12-31,2005,12,4,10-K,2006-03-16,2005-12-31,0001193125-06-056761,1,1,,FTPHTML
38746,1618,6207,2014-04-30,2013,4,4,10-K,2014-07-29,2014-04-30,0001144204-14-045520,1,1,,FTPHTML
14219,1634,6292,2007-12-31,2007,12,4,10-K,2008-03-05,2007-12-29,0000006292-08-000018,1,0,,FTPHTML
63503,2870,18808,2010-12-31,2010,12,4,10-K,2011-03-15,2010-12-31,0000018808-11-000007,1,0,,FTPHTML
63503,2870,18808,2011-12-31,2011,12,4,10-K,2012-03-14,2011-12-31,0001140361-12-015364,1,0,,FTPHTML


In [228]:
# Show up to 15 filings of the single CIK under investigation.
linkhead(df[df.cik == 18808], n=15)

permno,gvkey,cik,datadate,fyearq,fyr,fqtr,form,filedate,period,accession,starts,ends,path,links
63503,2870,18808,2005-12-31,2005,12,4,10-K,2006-03-31,2005-12-31,0000018808-06-000043,,,,FTPHTML
63503,2870,18808,2006-12-31,2006,12,4,10-K,2007-03-15,2006-12-31,0000018808-07-000013,0.0,0.0,,FTPHTML
63503,2870,18808,2007-12-31,2007,12,4,10-K,2008-03-11,2007-12-31,0000018808-08-000036,0.0,0.0,,FTPHTML
63503,2870,18808,2008-12-31,2008,12,4,10-K,2009-03-12,2008-12-31,0000018808-09-000006,1.0,1.0,0000018808_2009-03-12_0000018808-09-000006.html,FTPHTML
63503,2870,18808,2009-12-31,2009,12,4,10-K,2010-03-15,2009-12-31,0000018808-10-000006,1.0,1.0,0000018808_2010-03-15_0000018808-10-000006.html,FTPHTML
63503,2870,18808,2010-12-31,2010,12,4,10-K,2011-03-15,2010-12-31,0000018808-11-000007,1.0,0.0,,FTPHTML
63503,2870,18808,2011-12-31,2011,12,4,10-K,2012-03-14,2011-12-31,0001140361-12-015364,1.0,0.0,,FTPHTML


In [229]:
# Resolve the local path of one filing (CIK 18808) and display it.
src_fname = localstore.get_filing_path(18808, '0000018808-08-000036')
src_fname

'/data/storage/edgar/feeds/00/00/01/88/08/0000018808-08-000036.txt'

In [230]:
# Read the form text from the filing, as the main loop does.
rtxt = forms.get_form(src_fname)

In [231]:
# Convert the form to plain text and compare lengths before and after.
txt = htmlparse.convert_html_to_text(rtxt)
len(rtxt), len(txt)

(647725, 471395)

In [232]:
# Parse with the same parser ('lxml') that the main loop uses.
soup = BeautifulSoup(rtxt, 'lxml')

In [234]:
# Run the heading search with diagnostics on and show how many start and end
# containers were found. find_items inserts marker tags into `soup`, so
# re-parse `rtxt` before running this cell a second time.
starts, ends = find_items(soup, debug=True)
len(starts), len(ends)


Item 1A

Item 1B

Item 2.
Item 1A.   Risk Factors 
Item 1B.   Unresolved Staff Comments
Item 2.   Properties


(0, 0)

In [212]:
starts, ends

([<tr valign="top">
  <td>
  <b><font style="font-family: 'Times New Roman', Times">ITEM 1A. </font></b>
  </td>
  <td>
  <b><font style="font-family: 'Times New Roman', Times">RISK
      FACTORS</font></b>
  </td>
  </tr>], [<tr valign="top">
  <td>
  <b><font style="font-family: 'Times New Roman', Times">ITEM 1B. </font></b>
  </td>
  <td>
  <b><font style="font-family: 'Times New Roman', Times">UNRESOLVED
      SEC STAFF COMMENTS</font></b>
  </td>
  </tr>])

In [213]:
# Serialise the marked-up soup and locate the inserted markers
# (str.find returns -1 when a marker is absent).
# NOTE(review): the main loop serialises with str(soup), not prettify(), so
# the text and offsets here differ from what the loop writes.
prtxt = soup.prettify()
istart,iend = prtxt.find('<STARTHERE>'),prtxt.find('</ENDHERE>')

In [214]:
# Slice out the marked section. Start from an empty extract so that a failed
# match reports length 0 instead of reusing a stale `onea` from an earlier run
# (or raising NameError on a fresh kernel).
onea = ''
if len(starts) == 1 and len(ends) == 1 and istart > 0 and iend > 0:
    onea = prtxt[istart:iend+len('</ENDHERE>')]
len(onea)

30535

In [None]:
# Render the extracted section as HTML for a visual check.
display_html(onea, raw=True)

In [202]:
dest_fname

'/data/D1A/K1A/0000004187_2015-04-15_0001628280-15-002303.html'

In [177]:
# Summary statistics of the numeric columns, including the starts/ends counts.
df.describe()

Unnamed: 0,permno,gvkey,cik,fyearq,fyr,fqtr,starts,ends
count,42509.0,42509.0,42509.0,42509.0,42509.0,42509.0,41213.0,41213.0
mean,69759.845,72353.076,874091.315,2009.347,10.343,3.994,0.868,0.866
std,27230.666,68125.615,426812.811,2.951,3.177,0.127,0.676,0.367
min,10001.0,1004.0,20.0,2004.0,1.0,1.0,0.0,0.0
25%,58640.0,12519.0,751978.0,2007.0,10.0,4.0,1.0,1.0
50%,82508.0,31170.0,931148.0,2009.0,12.0,4.0,1.0,1.0
75%,89399.0,142382.0,1135185.0,2012.0,12.0,4.0,1.0,1.0
max,93436.0,317264.0,1629210.0,2015.0,12.0,4.0,25.0,2.0


# Now run it for full sample.

def _first_item_container(soup):
    """Return the heading container of the first usable "Item" text node.

    Text nodes matching RE_item are visited in document order; for the first
    one that has an ancestor listed in DIVIDER_STYLES, the nearest such
    ancestor is returned. Returns None when no text node qualifies.
    """
    for tag in soup.find_all(text=RE_item):
        for parent_tag in tag.parents:
            if parent_tag.name.lower() in DIVIDER_STYLES:
                return parent_tag
    return None


# Tally, over every saved extract, the cleaned lower-cased text of its first
# "Item" heading container: c maps heading text -> number of files.
c = {}

fs = existing_files.copy()

prog = ipywidgets.FloatProgress(min=0, max=len(fs))
display(prog)

# Iterate directly instead of draining the list with pop(0), which shifts the
# whole list on every call.
i = 0
for i, f in enumerate(fs, start=1):
    if not i%100:
        # Refresh the widget only every 100 files.
        prog.value = i
    p = os.path.join(DATA_ROOT, 'K1A/', f)
    with open(p) as fh:
        html = fh.read()
    container = _first_item_container(BeautifulSoup(html, 'lxml'))
    if container is None:
        continue
    t = re_clean(container.get_text()).strip().lower()
    c[t] = c.get(t, 0) + 1
#     if i > 2000:
#         break

# Show completion even when the file count is not a multiple of 100.
prog.value = len(fs)

In [None]:
c