In [8]:
from pprint import pprint
import re

from scipy.stats.mstats import spearmanr
import textacy

from colandr import create_app, db
from colandr.lib.nlp import reviewer_terms
from colandr.models import Citation, Review, ReviewPlan
from colandr.api.schemas import CitationSchema

In [3]:
# Build the colandr application object; later cells enter `app.app_context()`
# before touching the database session.
# NOTE(review): presumably 'default' selects a named configuration — confirm
# against colandr.create_app.
app = create_app('default')

### reviewer terms tests

In [10]:
# Score a sample of citations against the review plan's terms in two ways.
# The two lists are kept index-aligned (one entry per scored citation) so they
# can be compared pairwise afterwards.
keyterms_scores = []
incl_excl_scores = []

# Primary keys of the citations to sample (ids 1 through 99).
citation_ids = range(1, 100)

with app.app_context():

    review = db.session.query(Review).get(1)
    review_plan = review.review_plan

    # Build the regexes once, outside the loop: they depend only on the plan.
    keyterms_regex = reviewer_terms.get_keyterms_regex(review_plan.keyterms)
    incl_regex, excl_regex = reviewer_terms.get_incl_excl_terms_regex(review_plan.suggested_keyterms)

    for citation_id in citation_ids:
        citation = db.session.query(Citation).get(citation_id)
        # Query.get() returns None when no row has this primary key (e.g. a
        # gap left by a deleted citation). Skip it instead of crashing on
        # `.text_content`; skipping before either append keeps the two score
        # lists the same length.
        if citation is None:
            continue
        text_content = citation.text_content

        keyterms_score = reviewer_terms.get_keyterms_regex_score(keyterms_regex, text_content)
        incl_excl_score = reviewer_terms.get_incl_excl_terms_score(incl_regex, excl_regex, text_content)

        keyterms_scores.append(keyterms_score)
        incl_excl_scores.append(incl_excl_score)

In [13]:
# Rank-correlate the two score lists built in the scoring cell above.
# `spearmanr` is imported from scipy.stats.mstats (the masked-array variant);
# note that the p-value in the displayed result is a masked_array.
spearmanr(keyterms_scores, incl_excl_scores)

SpearmanrResult(correlation=-0.068977544592122658, pvalue=masked_array(data = 0.49751165378891526,
             mask = False,
       fill_value = 1e+20)
)

In [20]:
%%time

with app.app_context():
    # Select only the text_content column of the citations in review 1.
    results = db.session.query(Citation.text_content).filter_by(review_id=1)
    # Every row holds exactly one column, so unpack it lazily; the generator is
    # consumed by the Corpus constructor while the app context is still open.
    texts = (text_content for (text_content,) in results)
    corpus = textacy.Corpus('en', texts=texts)
    print(corpus)

Corpus(28708 docs; 8293029 tokens)
CPU times: user 8min 3s, sys: 1.75 s, total: 8min 5s
Wall time: 8min 6s


In [22]:
# Pull a single document out of the corpus by position (index 10) to inspect it;
# the bare `doc` on the last line displays its repr as the cell output.
doc = corpus[10]
doc

Doc(228 tokens; "Place Perceptions and Controversies over Forest...")

In [23]:
# Serialize the doc's `spacy_doc` to raw bytes and display the result.
# NOTE(review): this dumps the entire byte string into the cell output, which is
# very large; consider displaying its length or a short slice instead.
doc.spacy_doc.to_bytes()

b'\xcb\x05\x00\x00m\xfa\xff\xffy\x14\x9e\xce<nMG7\xc6\x7f\xf2\'n\x14\xbfWdo=\xbcgdo\ty\xac{\xedf\xfc\xb9\xba\xf9~C\xeb\xa6;\xd1%\x16\xfeJ6sz\x04\xdfp\xd3\x9dx\xa6\xcb\xed\xeeB\x0c|\xf2j\x9c\xcay;\xd1%\xd6={\x9c+\xca\x19\x89t\xe1\xe9\xa4oMG7\xc6\x7f\xe2w\x8a\xdf+\xb2\xb7\x1e\xde3\xb2\xb7Lb\xddkg\xfe\\\xdd|\xbfa\xf3\xf9/\xa0n\x00Vn\x1c\xce\xc8_\xa6\xe9\x15<\r;L\x16\xcf\'\x7f.\xf5\xc3\'>\xc1;b\x87\xb7\x13]b\x9d<u\xac\xb7-}k:\xba1\xfe3\x12\xe9\xc2\xd3=\xaaI\xac{\xfd\xe4O\xa4\xa3\xe0T\xc3B\x0c|\xe2s\x8ay\x8f\xff\x88)\xf6\x90\xbf\xb7\x13\xcf\xf8\xb91\xbe\x91\xc8S\xc7z\xdbfy\xd5\xf8+\'\xa6y\xc2\xae\xed\x10\x7fR\xc0\xfd\xe9\xa4\xa3\xe0TC\x9d9\x87\xbf\x12<\xfc\x95\xfc\xc5\xdf\xb7\xe8u\x8d\xd9\x93\x8e\x82S-\x7f.\xf5\xc3g\xb2x\xbe4V\x0ev\xd3\x99\xc6\x85\x13\x97RS\xedq\xf2\xe7R?|\xe2\xcf\xdek\xac\xb8\xef\xc9N\xf1{E\xf6\xd6\xab\x8d\xec-?7X\\\\\xf7\x02\xac\xe9\x9c\xf9su\xf3\xfd\x16\xbf;\x14\xb9\xdc\x1c\xb9\x9c3\xf88!\xcd\x05\xbb\xa6\xfac\xf5\xb8\xf5\xcd\x9c\x1e\xb9\x0f\x0b1\xceu\xefxMC\xb28}\x02:\xb