In [1]:
import urllib.request
from spacy import displacy
import spacy

# now import doctable
import sys
sys.path.append('..')
import doctable as dt

In [2]:
import urllib.request
def download_nss(
	baseurl='https://raw.githubusercontent.com/devincornell/nssdocs/master/docs/',
	years = (1987, 1988, 1990, 1991, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2002, 2006, 2010, 2015, 2017)
	):
	def read_url(url):
		return urllib.request.urlopen(url).read().decode('utf-8')
	
	ftemp = baseurl+'{}.txt'
	all_texts = [read_url(ftemp.format(year)) for year in years]
	return {yr:text for yr,text in zip(years,all_texts)}
nss_texts = download_nss()
len(nss_texts), nss_texts[2017][:50]

(17, 'An America that is safe, prosperous, and free at h')

## Tokenize Full Document
In this example, we store each nss document as sequence of tokens, parsed by DocParser. First I show a simple tokenization process, then I'll make a DocTable with a schema and store documents there by processing in parallel.

In [8]:
# first just show tokenizing
nlp = spacy.load('en', disable=['tagger','ner', 'parser'])
doc = nlp(nss_texts[2017]) # use spacy to process the Trump nss
toks = dt.DocParser.tokenize_doc(doc)
toks[:10]

['an', 'america', 'that', 'is', 'safe', ',', 'prosperous', ',', 'and', 'free']

In [22]:
# now show full stack tokenization, with custom functions for processing ents 
nlp = spacy.load('en', disable=['tagger','parser'])
doc = nlp(nss_texts[2017]) # use spacy to process the Trump nss

# create custom methods for tokenization (note that we use all the default kwargs anyways)
#def use_tok(tok): return dt.DocParser.use_tok(tok, replace_xml=None) #decides to use token or not
def ent_convert(tok): # for entities that are numbers, just replace with NUM
    return tok.text.upper()

# decides how to parse tokens
def parse_tok(tok): return dt.DocParser.parse_tok(tok, replace_num='NUM', ent_convert=ent_convert) 
#def parse_tok(tok): return (tok.text,tok.like_num)
toks = dt.DocParser.tokenize_doc(doc, parse_tok_func=parse_tok)
print(toks[:100])

['an', 'AMERICA', 'that', 'is', 'safe', ',', 'prosperous', ',', 'and', 'free', 'at', 'home', 'is', 'an', 'AMERICA', 'with', 'the', 'strength', ',', 'confi', 'dence', ',', 'and', 'will', 'to', 'lead', 'abroad', '.', 'it', 'is', 'an', 'AMERICA', 'that', 'can', 'preserve', 'peace', ',', 'uphold', 'liberty', ',', 'and', 'create', 'enduring', 'advantages', 'for', 'the', 'AMERICAN', 'people', '.', 'putting', 'AMERICA', 'FIRST', 'is', 'the', 'duty', 'of', 'our', 'government', 'and', 'the', 'foundation', 'for', 'U.S.', 'leadership', 'in', 'the', 'world', '.', 'a', 'strong', 'AMERICA', 'is', 'in', 'the', 'vital', 'interests', 'of', 'not', 'only', 'the', 'AMERICAN', 'people', ',', 'but', 'also', 'those', 'around', 'the', 'world', 'who', 'want', 'to', 'partner', 'with', 'THE', 'UNITED', 'STATES', 'in', 'pursuit', 'of']


In [23]:
# now create doctable for storing the docs
class NSSDocsTokens(dt.DocTable):
    '''For storing documents as BoW token sequences.'''
    schema = (
        ('integer', 'id', dict(primary_key=True, autoincrement=True)),
        ('integer', 'year'), # year when nss document was produced
        ('integer', 'num_tokens'),
        ('pickle','tokens'), # nested tokens within sentences within paragraphs
        ('index', 'ind_year', ['year'], dict(unique=True)),
    )
    def __init__(self,**kwargs):
        super().__init__(schema=self.schema, tabname='nsstokens', **kwargs)
dbtok = NSSDocsTokens(fname='exdb/nss_docs_tokens.db')
dbtok

<DocTable::nsstokens ct: 0>

In [30]:
# first insert manually
nlp = spacy.load('en', disable=['tagger','parser'])
yrs, texts = list(zip(*nss_texts.items()))

for yr, doc in zip(yrs, nlp.pipe(texts)):
    toks = dt.DocParser.tokenize_doc(doc)
    tbtok.insert({
        'year': yr,
        'num_tokens':len(toks),
        'tokens': toks
    })

[(1987,
  1988,
  1990,
  1991,
  1993,
  1994,
  1995,
  1996,
  1997,
  1998,
  1999,
  2000,
  2002,
  2006,
  2010,
  2015,
  2017),
  "My fellow Americans, \n\nAmerica is at war. This is a wartime national security strategy required by the grave challenge we face  the rise of terrorism fueled by an aggressive ideology of hatred and murder, fully revealed to the American people on September 11, 2001. This strategy reflects our most solemn obligation: to protect the security of the American people. \n\nAmerica also has an unprecedented opportunity to lay the foundations for future peace. The ideals that have inspired our history  freedom, democracy, and human dignity  are increasingly inspiring individuals and nations throughout the world. And because free nations tend toward peace, the advance of liberty will make America more secure. \n\nThese inseparable priorities  fighting and winning the war on terror and promoting freedom as the alternative to tyranny and despair  have now gu

<DocTable::nsstokens ct: 0>

In [12]:



# make a new doctable object
class NSSDocsParseTrees(dt.DocTable):
    schema = (
        ('integer', 'id',,dict(primary_key=True, autoincrement=True)),
        ('source', 'string'), # trump or obama - whichever nss document
        ('parid', 'integer'), # paragraph id
        ('sents','pickle'), # contains token parsetrees
        ('ind_src_par', 'index', ('source','parid'),dict(unique=True)),
    )
    def __init__(self,**kwargs):
        super().__init__(schema=self.schema, tabname='nsspars', **kwargs)
        
class NSSDocsTokens(dt.DocTable):
    schema = (
        ('integer', 'id',,dict(primary_key=True, autoincrement=True)),
        ('integer', 'year', dict(unique=True)), # trump or obama - whichever nss document
        ('integer', 'num_tokens'),
        ('integer','num_paragraphs'),
        ('integer','num_sentences'),
        ('pickle','par_sent_tokens'), # nested tokens within sentences within paragraphs
        ('ind_src_par', 'index', ('source','parid'),dict(unique=True)),
    )
    def __init__(self,**kwargs):
        super().__init__(schema=self.schema, tabname='nsspars', **kwargs)
        
        
        
class NSSParsParser(NSSPars, dt.DocParser):
    def __init__(self, *args, **kwargs):
        
        # create parser and matcher for parsing
        self.nlp = spacy.load('en')
        self.matcher = Matcher(nlp.vocab)
        
        # matches hyphens
        pattern = [{'IS_SPACE':False},{'TEXT':'-'},{'IS_SPACE':False}]
        self.matcher.add('hyphens', None, pattern)
        
        # matches hyphens
        pattern = [{'TEXT':'@'},{'IS_ALPHA':True}]
        self.matcher.add('handles', None, pattern)
        
        # doctable init
        super().__init__(*args, **kwargs)
    
    def insert_document(self, sourcename, text):
        # because distribute_parse takes a list of texts, we wrap in a list but
        # provide a paragraph separator so that it will process paragraphs in
        # parallel
        #doc = self.distribute_parse([text], self.nlp, paragraph_sep='\n\n')[0]
        paragraphs = self.distribute_parse(texts, self.nlp, parsefunc=self.parsetree_tokenize, preprocessfunc=None, 
            paragraph_sep='\n\n', n_cores=4, verbose=False)[0]
        
        valid_sents = list()
        i = 0
        for par in paragraphs:
            if len(par) > 0:
                # filter out sentences with no tokens
                sents = [s for s in par if len(s) > 0]
        
                # insert into table
                self.insert({'source': source, 'sents':sents, 'parid':i})
                i += 1
        
    @classmethod
    def parsetree_tokenize(cls,doc):
        info = {'lemma':cls.lemmainfo, 'prefix':cls.prefixinfo}
        return cls.get_parsetrees(doc, tok_parse_func=None, info_func_map=info, merge_ents=True, 
            spacy_ngram_matcher=cls.matcher, merge_noun_chunks=False)
        
    @staticmethod
    def lemmainfo(tok):
        return tok.lemma_
    @staticmethod
    def prefixinfo(tok):
        return tok.prefix_
    
    @classmethod
    def tokparser(cls,tok):
        return cls.parse_tok(tok, replace_num=True, replace_digit=None, lemmatize=False, normal_convert=None, 
            format_ents=True, ent_convert=None)
    
    @classmethod
    def include_tok(cls,tok):
         return cls.use_tok(tok, filter_whitespace=True, filter_punct=False, filter_stop=False, 
                      filter_digit=False, filter_num=False, filter_all_ents=False, 
                      filter_ent_types=tuple())
    

trump, obama = texts    
db = NSSParsParser()
db.insert_document('trump', trump)
db.insert_document('obama', obama)

Exception during reset or similar
Traceback (most recent call last):
  File "/home/utopia3/dc326/local/anaconda3/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 680, in _finalize_fairy
    fairy._reset(pool)
  File "/home/utopia3/dc326/local/anaconda3/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 867, in _reset
    pool._dialect.do_rollback(self)
  File "/home/utopia3/dc326/local/anaconda3/lib/python3.6/site-packages/sqlalchemy/engine/default.py", line 530, in do_rollback
    dbapi_connection.rollback()
sqlite3.ProgrammingError: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 140366570628928 and this is thread id 140364765624064.
Exception closing connection <sqlite3.Connection object at 0x7fa9272b1490>
Traceback (most recent call last):
  File "/home/utopia3/dc326/local/anaconda3/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 680, in _finalize_fairy
    fairy._reset(pool)
  File "/home/uto

AttributeError: type object 'NSSParsParser' has no attribute 'matcher'