# Full Workflow Example
This example demonstrates a complete doctable workflow for parsing texts end-to-end.

In [1]:
import sys
sys.path.append('..')
import doctable as dt
import spacy
from pprint import pprint
import urllib.request # used for downloading nss docs

## Create a Parser Class
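The `NSSParser` class will be used to parse your entire corpus. It inherits from `DocParser` to use a number of built-in features that make parsing convenient. For efficiency, you provide only a list of years: each worker process downloads and parses the documents for its own chunk of years and stores the results in an `NSSDocs` table.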

In [70]:
class NSSDocs(dt.DocTable):
    '''DocTable holding one parsed NSS document per row, keyed by a unique year.'''
    tabname = 'nssdocs'
    schema = (
        ('integer', 'id', {'primary_key': True, 'autoincrement': True}),
        ('integer', 'year', {'unique': True}),
        ('integer', 'num_pars'),
        ('integer', 'num_sents'),
        ('integer', 'num_toks'),
        # nested tokens within sentences within paragraphs
        ('pickle', 'par_sents'),
        ('index', 'ind_yr', ['year'], {'unique': True}),
    )

    def __init__(self, **kwargs):
        '''Open the table with this class's schema and table name.

        Args:
            **kwargs: forwarded unchanged to the DocTable constructor.
        '''
        super().__init__(schema=self.schema, tabname=self.tabname, **kwargs)

    def insert_nssdoc(self, year, par_sents, **kwargs):
        '''Insert one document along with its paragraph/sentence/token counts.

        Args:
            year: year of the document (stored in the unique 'year' column).
            par_sents: nested structure of tokens within sentences within paragraphs.
            **kwargs: forwarded unchanged to insert().
        '''
        par_count = len(par_sents)
        # count by iterating so any iterable sentence/paragraph container works
        sent_count = sum(1 for par in par_sents for _ in par)
        tok_count = sum(1 for par in par_sents for sent in par for _ in sent)

        row = {
            'year': year,
            'num_pars': par_count,
            'num_sents': sent_count,
            'num_toks': tok_count,
            'par_sents': par_sents,
        }
        self.insert(row, **kwargs)

class NSSParser(dt.DocParser):
    '''Downloads NSS documents by year, parses them with spacy, and stores them in an NSSDocs table.'''

    def __init__(self, *args, **kwargs):
        '''Load the spacy model used for parsing.

        Positional and keyword arguments are accepted but not used.
        '''
        # NOTE(review): 'en' is a spacy shortcut model name; newer spacy releases
        # may require a full package name such as 'en_core_web_sm' - confirm.
        self.nlp = spacy.load('en')

    def parse_nss_docs(self, years, dbfname, as_parsetree=False, workers=None):
        '''Parse and store nss docs into a doctable.
        Args:
            years (list): years to request from the nss corpus
            dbfname (str): database file each chunk opens as an NSSDocs table
            as_parsetree (bool): store parsetrees instead of token sequences
            workers: forwarded to distribute_chunks
        '''
        self.distribute_chunks(self.parse_nss_chunk, years, self.nlp, dbfname, as_parsetree, workers=workers)

    @classmethod
    def parse_nss_chunk(cls, years, nlp, dbfname, as_parsetree):
        '''Run in separate process for each chunk of nss years.

        Downloads and preprocesses the document for every year, parses each
        paragraph with nlp, and inserts one NSSDocs row per year.

        Args:
            years: years handled by this chunk
            nlp: spacy pipeline used to parse the paragraphs
            dbfname: database file to open as an NSSDocs table
            as_parsetree: if true store parsetrees, otherwise token sequences
        '''

        # create a new database connection
        db = NSSDocs(fname=dbfname)

        # download, preprocess, and break texts into paragraphs
        raw_texts = [cls.download_nss(yr) for yr in years]
        texts = [cls.preprocess(raw, replace_xml='') for raw in raw_texts]

        # keep the index of the source document next to every non-empty paragraph
        doc_inds, par_texts = [], []
        for i, text in enumerate(texts):
            for par in text.split('\n\n'):
                par = par.strip()
                if par:
                    doc_inds.append(i)
                    par_texts.append(par)

        def use_tok(tok):
            return cls.use_tok(tok, filter_whitespace=True)

        def parse_tok(tok):
            return cls.parse_tok(tok, replace_num=True, format_ents=True)

        # choose to create either token sequences or parsetrees
        if not as_parsetree:
            def tokenize(doc):
                return cls.tokenize_doc(doc, merge_ents=True, split_sents=True, parse_tok_func=parse_tok, use_tok_func=use_tok)
        else:
            def tokenize(doc):
                return cls.get_parsetrees(doc, merge_ents=True, parse_tok_func=parse_tok)

        print('starting', years)
        # process documents
        parsed = [tokenize(doc) for doc in nlp.pipe(par_texts)]
        print('about to insert', years)

        # merge paragraphs back into docs in a single pass; one bucket per
        # downloaded text so a document without paragraphs is still inserted
        # (as an empty list) regardless of its position in the chunk
        doc_pars = [[] for _ in texts]
        for i, par in zip(doc_inds, parsed):
            doc_pars[i].append(par)

        for yr, dp in zip(years, doc_pars):
            db.insert_nssdoc(yr, dp, ifnotunique='replace')
        print('inserted', years)

    @staticmethod
    def download_nss(year):
        '''Download the raw text of the nss document for the given year.

        Args:
            year: substituted into the document url
        Returns:
            str: response body decoded as utf-8
        '''
        baseurl = 'https://raw.githubusercontent.com/devincornell/nssdocs/master/docs/{}.txt'
        url = baseurl.format(year)
        # context manager closes the HTTP response even if read/decode fails
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    

# years of nss documents to download and parse
years = (1987, 1988, 1990, 1991, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2002, 2006, 2010, 2015, 2017)

# instantiating the parser loads the spacy model
parser = NSSParser()

# first run stores token sequences, second run stores parsetrees;
# after each run the resulting table and its first two rows are printed
for fname, as_parsetree in (
    ('exdb/ex_workflow_tokens.db', False),
    ('exdb/ex_workflow_parsetrees.db', True),
):
    parser.parse_nss_docs(years, fname, as_parsetree=as_parsetree, workers=4)
    db = NSSDocs(fname=fname)
    print(db)
    print(db.select_df(limit=2))

starting (2015, 2017)
starting (1987, 1988, 1990, 1991, 1993)
starting (1999, 2000, 2002, 2006, 2010)
starting (1994, 1995, 1996, 1997, 1998)
about to insert (2015, 2017)
inserted (2015, 2017)
about to insert (1987, 1988, 1990, 1991, 1993)
inserted (1987, 1988, 1990, 1991, 1993)
about to insert (1999, 2000, 2002, 2006, 2010)
inserted (1999, 2000, 2002, 2006, 2010)
about to insert (1994, 1995, 1996, 1997, 1998)
inserted (1994, 1995, 1996, 1997, 1998)
<DocTable::nssdocs ct: 17>
    id  year  num_pars  num_sents  num_toks  \
0  155  2015       150        659     16108   
1  156  2017       400       1170     23587   

                                           par_sents  
0  [[[Today, ,, The United States, is, stronger, ...  
1  [[[an, America, that, is, safe, ,, prosperous,...  
starting (2015, 2017)
starting (1987, 1988, 1990, 1991, 1993)
starting (1994, 1995, 1996, 1997, 1998)
starting (1999, 2000, 2002, 2006, 2010)
about to insert (2015, 2017)
inserted (2015, 2017)
about to insert (19