In [1]:
from pprint import pprint
from random import sample
import re

from scipy.stats.mstats import spearmanr
import textacy

from colandr import create_app, db
from colandr.lib.nlp import reviewer_terms
from colandr.models import Citation, Review, ReviewPlan
from colandr.api.schemas import CitationSchema



In [2]:
# Build the colandr application object; later cells enter `app.app_context()`
# before touching `db.session`. 'default' is presumably the name of a
# configuration profile -- confirm against `create_app`.
app = create_app('default')

### reviewer terms

In [20]:
with app.app_context():

    # Fetch only the columns needed for scoring, for every citation that
    # belongs to review 1, then reshape each row into a plain dict.
    citation_rows = (
        db.session
        .query(Citation.id, Citation.status, Citation.text_content)
        .filter_by(review_id=1)
        .all()
    )
    citations = [
        {'id': citation_id, 'status': status, 'text_content': text_content}
        for citation_id, status, text_content in citation_rows
    ]

    # The plan attached to review 1 supplies the terms used to build regexes.
    review = db.session.query(Review).get(1)
    review_plan = review.review_plan

    # One regex for the plan's keyterms, plus an include/exclude pair built
    # from the plan's suggested keyterms.
    keyterms_regex = reviewer_terms.get_keyterms_regex(review_plan.keyterms)
    incl_regex, excl_regex = reviewer_terms.get_incl_excl_terms_regex(
        review_plan.suggested_keyterms)

In [57]:
# Score a random sample of citations with both reviewer-term scorers, then
# check how strongly each score rank-correlates (Spearman) with whether the
# citation ended up with status 'included'.
keyterms_scores = []
incl_excl_scores = []
statuses = []

# Upper bound on the number of citations to score. `random.sample` raises
# ValueError when asked for more items than the population holds, so the
# effective size is clamped to the number of citations actually loaded.
sample_size = 3000
n_sampled = min(sample_size, len(citations))

with app.app_context():

    for citation in sample(citations, n_sampled):
        text_content = citation['text_content']

        keyterms_scores.append(
            reviewer_terms.get_keyterms_score(keyterms_regex, text_content))
        incl_excl_scores.append(
            reviewer_terms.get_incl_excl_terms_score(incl_regex, excl_regex, text_content))

        # Binary target: 1 for included citations, 0 for everything else.
        statuses.append(1 if citation['status'] == 'included' else 0)

# Report the share relative to the number of citations actually scored, and
# avoid dividing by zero when there was nothing to sample.
n_included = sum(statuses)
pct_included = round(100 * n_included / n_sampled, 3) if n_sampled else 0.0

print('# included citations = {} ({}%)'.format(n_included, pct_included))
print('keyterms correlation = {}'.format(round(spearmanr(keyterms_scores, statuses).correlation, 3)))
print('incl/excl correlation = {}'.format(round(spearmanr(incl_excl_scores, statuses).correlation, 3)))

# included citations = 76 (2.533%)
keyterms correlation = 0.186
incl/excl correlation = 0.206


In [20]:
%%time

# Build a textacy corpus from the text of every citation in review 1.
with app.app_context():

    # Each row holds a single column, so unpack it directly; the generator
    # keeps the texts lazy so they are consumed as the corpus is built.
    results = db.session.query(Citation.text_content).filter_by(review_id=1)
    texts = (text for (text,) in results)

    corpus = textacy.Corpus('en', texts=texts)
    print(corpus)

Corpus(28708 docs; 8293029 tokens)
CPU times: user 8min 3s, sys: 1.75 s, total: 8min 5s
Wall time: 8min 6s


In [22]:
# Pull a single document out of the corpus (index 10 -- apparently an
# arbitrary pick) and display its repr as a quick sanity check.
doc = corpus[10]
doc

Doc(228 tokens; "Place Perceptions and Controversies over Forest...")

In [23]:
# Serialize the underlying spaCy document to bytes and display the result.
# NOTE(review): this shows the entire byte string; consider slicing it (as a
# later cell does with `result[:100]`) to keep the saved output small.
doc.spacy_doc.to_bytes()

b'\xcb\x05\x00\x00m\xfa\xff\xffy\x14\x9e\xce<nMG7\xc6\x7f\xf2\'n\x14\xbfWdo=\xbcgdo\ty\xac{\xedf\xfc\xb9\xba\xf9~C\xeb\xa6;\xd1%\x16\xfeJ6sz\x04\xdfp\xd3\x9dx\xa6\xcb\xed\xeeB\x0c|\xf2j\x9c\xcay;\xd1%\xd6={\x9c+\xca\x19\x89t\xe1\xe9\xa4oMG7\xc6\x7f\xe2w\x8a\xdf+\xb2\xb7\x1e\xde3\xb2\xb7Lb\xddkg\xfe\\\xdd|\xbfa\xf3\xf9/\xa0n\x00Vn\x1c\xce\xc8_\xa6\xe9\x15<\r;L\x16\xcf\'\x7f.\xf5\xc3\'>\xc1;b\x87\xb7\x13]b\x9d<u\xac\xb7-}k:\xba1\xfe3\x12\xe9\xc2\xd3=\xaaI\xac{\xfd\xe4O\xa4\xa3\xe0T\xc3B\x0c|\xe2s\x8ay\x8f\xff\x88)\xf6\x90\xbf\xb7\x13\xcf\xf8\xb91\xbe\x91\xc8S\xc7z\xdbfy\xd5\xf8+\'\xa6y\xc2\xae\xed\x10\x7fR\xc0\xfd\xe9\xa4\xa3\xe0TC\x9d9\x87\xbf\x12<\xfc\x95\xfc\xc5\xdf\xb7\xe8u\x8d\xd9\x93\x8e\x82S-\x7f.\xf5\xc3g\xb2x\xbe4V\x0ev\xd3\x99\xc6\x85\x13\x97RS\xedq\xf2\xe7R?|\xe2\xcf\xdek\xac\xb8\xef\xc9N\xf1{E\xf6\xd6\xab\x8d\xec-?7X\\\\\xf7\x02\xac\xe9\x9c\xf9su\xf3\xfd\x16\xbf;\x14\xb9\xdc\x1c\xb9\x9c3\xf88!\xcd\x05\xbb\xa6\xfac\xf5\xb8\xf5\xcd\x9c\x1e\xb9\x0f\x0b1\xceu\xefxMC\xb28}\x02:\xb

In [30]:
import subprocess

# Run the text-extraction shell script on one uploaded PDF and capture its
# stdout as raw bytes. The script path is relative, so it is resolved against
# the kernel's current working directory.
extract_script = '../scripts/extractText.sh'
pdf_path = '/Users/burtondewilde/colandr/fulltexts/uploads/1.pdf'
extract_command = [extract_script, '-f', pdf_path]

result = subprocess.check_output(extract_command)

# Preview only the first 100 bytes of the extracted text.
result[:100]

b'Bottrill et al. Environmental Evidence 2014, 3:16\nhttp://www.environmentalevidencejournal.org/conten'

In [32]:
# Decode the extracted bytes to text (UTF-8 is the default for `bytes.decode`)
# and print it so newlines are rendered instead of shown as escapes.
print(result.decode())

Bottrill et al. Environmental Evidence 2014, 3:16
http://www.environmentalevidencejournal.org/content/3/1/16SYSTEMATIC REVIEW PROTOCOL Open AccessWhat are the impacts of nature conservation
interventions on human well-being: a systematic
map protocol
Madeleine Bottrill1*, Samantha Cheng2, Ruth Garside3, Supin Wongbusarakum4, Dilys Roe5, Margaret B Holland6,
Janet Edmond1 and Will R Turner1Abstract
Background: International policy has sought to emphasize and strengthen the link between the conservation of
natural ecosystems and human development. Furthermore, international conservation organizations have broadened
their objectives beyond nature-based goals to recognize the contribution of conservation interventions in sustaining
ecosystem services upon which human populations are dependent. While many indices have been developed to
measure various human well-being domains, the strength of evidence to support the effects, both positive and
negative, of conservation interventions on human