In [4]:
import json
import logging
import math
import re

import numpy as np
import pandas as pd
from toolz import itertoolz

import cipy
import textacy

In [5]:
logger = logging.getLogger('cipy')
logger.setLevel(logging.DEBUG)

In [6]:
conn_creds = cipy.db.get_conn_creds('DATABASE_URL')
citations_db = cipy.db.PostgresDB(conn_creds, ddl='citations')

In [7]:
project_id = cipy.hack.get_project_id()
project_id

0

In [83]:
query = """
SELECT
    citation_id,
    TRIM('\n' FROM concat_ws('\n\n', COALESCE(title, ''),
                   COALESCE(abstract, ''),
                   COALESCE(array_to_string(keywords, ', '), ''))) AS citation_text
FROM citations
WHERE
    project_id = %(project_id)s
    AND (citation_id NOT IN (SELECT citation_id FROM duplicates)
         OR citation_id IN (SELECT canonical_citation_id FROM duplicates))
ORDER BY random()
LIMIT 1000
"""

df = pd.DataFrame(citations_db.run_query(query, {'project_id': project_id}))

In [84]:
print(df.shape)
df.head()

(1000, 2)


Unnamed: 0,citation_id,citation_text
0,25677,Adaptation opportunities to climate variabilit...
1,16609,An ecological economics framework for assessin...
2,11203,Consuming fuel and fuelling consumption: Model...
3,16249,An assessment of the recreational fishery in t...
4,9489,A model of underlying socioeconomic vulnerabil...


In [86]:
print(df.ix[4, 'citation_text'])

A model of underlying socioeconomic vulnerability in human populations: Evidence from variability in population health and implications for public health

Drawing from insights into the variability of complex biologic systems we propose that the health of human populations reflects the interrelationship between underlying vulnerabilities (determined by population-level social and economic factors; e.g., income distribution) and capacities (determined by population-level salutary resources, e.g., social capital) and how populations, shaped by these vulnerabilities and capacities, respond to intermittent stressors (e.g., economic downturns) and protective events (e.g., introduction of a school). Monitoring this dynamic at the population-level can be accomplished by examining not only rates of illness and mortality, but variability in rates, either between populations or within populations over time. We used mortality data from New York City neighborhoods between 1990 and 2001 to test two

In [9]:
results = citations_db.run_query(
    cipy.db.queries.GET_CITATION_TEXTS_SAMPLE, {'project_id': project_id})
df = pd.DataFrame(results)
print(df.shape)
df.head()

(1000, 2)


Unnamed: 0,citation_id,citation_text
0,9005,"Emerging fungal threats to animal, plant and e..."
1,24206,The relation of lipid peroxidation processes w...
2,12076,Geochemical barriers and element retention in ...
3,3625,Assessing sustainable forest management under ...
4,7694,The role of the education system and on-the-jo...


In [87]:
keyterms = {'intervention': {'conservation', 'conserve', 'conservancy',
                             'protect*', 'management', 'awareness', 'law*',
                             'policy*', 'reserve*', 'govern*', 'capacity\-build*',
                             'train*', 'regulation', 'payment for ecosystem services', 'PES',
                             'ecotourism', 'sustainable use'},
            'outcome': {'wellbeing', 'well\-being', 'well being', 'ecosystem service*',
                        'nutrition', 'skill*', 'empower*', 'clean water', 'livelihood*',
                        'food security', 'resilience*', 'vulnerability', 'social capital',
                        'attitude*', 'perception*', '(human )?health*', 'human capital',
                        '(traditional )?knowledge'},
            'intervention_qualifiers': {'marine', 'freshwater', 'coastal', 'forest*',
                                        'ecosystem*', 'species', 'habitat*', 'biodiversity',
                                        'sustainab*', 'ecolog*', 'integrated', 'landscape',
                                        'seascape', 'coral reef*', 'natural resource*'},
            'outcome_qualifiers': {'human*', 'people', 'person*', 'communit*',
                                   'household*', 'fisher*', 'collaborative'}
            }

keyterms_joined = '|'.join(val.replace('*', '*?')
                           for vals in keyterms.values()
                           for val in vals)
keyterms_re = re.compile(r'(?<=^|\b)(' + keyterms_joined + r')(?=$|\b)', flags=re.IGNORECASE)
keyterms_re

re.compile(r'(?<=^|\b)(perception*?|vulnerability|clean water|well\-being|resilience*?|skill*?|ecosystem service*?|empower*?|well being|(human )?health*?|food security|(traditional)? knowledge|attitude*?|nutrition|human capital|wellbeing|livelihood*?|social capital|management|awareness|conserve|protect*?|conservation|law*?|train*?|capacity\-build*?|reserve*?|regulation|govern*?|policy*?|sustainable use|ecotourism|conservancy|PES|payment for ecosystem services|people|collaborative|household*?|human*?|fisher*?|communit*?|person*?|forest*?|freshwater|ecolog*?|marine|ecosystem*?|natural resource*?|integrated|landscape|seascape|biodiversity|coastal|sustainab*?|species|coral reef*?|habitat*?)(?=$|\b)',
re.IGNORECASE|re.UNICODE)

In [126]:
def get_regex_match_score(citation_text, len_pow_scaling=0.33):
    try:
        return (math.pow(len(citation_text), len_pow_scaling)
                * sum(len(match.group()) for match in keyterms_re.finditer(citation_text))
                / len(citation_text))
    except ZeroDivisionError:
        return 0.0

df['regex_match_score'] = df['citation_text'].map(lambda x: get_regex_match_score(x))
df['regex_match_pctrank'] = df['regex_match_score'].rank(pct=True, ascending=True)
df.sort_values('regex_match_pctrank', inplace=True, ascending=False)

In [127]:
df.head()

Unnamed: 0,citation_id,citation_text,match_fraction,match_fraction_pctrank,regex_match_score,regex_match_pctrank
192,15153,Ecological protection and well-being\n\nBased ...,0.238359,0.999,2.251387,1.0
739,12152,The potential impact of labor choices on the e...,0.193053,0.997,2.155258,0.999
575,7511,Biodiversity loss threatens human well-being,0.613636,1.0,2.139195,0.998
194,23493,Integrating Vulnerability Into Estuarine Conse...,0.180851,0.995,2.102293,0.997
261,25340,Role of social capital in determining conserva...,0.19174,0.996,2.071855,0.996


In [131]:
for citation in itertoolz.take_nth(50, df[['citation_id', 'citation_text']].values):
    print(citation)

[15153
 "Ecological protection and well-being\n\nBased on Sen's function approach, this paper stated the connotation of the well-being, and defined that the freedom choice and capability of the human is the connotation of the human well-being on ecosystem services, while the poverty is the limited for capability and development of the human. Ecosystem degradation and destruction will seriously threat to human well-being, especially the well-being of the poor. Conservation of biodiversity can contribute to the protection of ecosystem services, which as the core of the ecosystem, and then improve the well-being of human. Focused on the well-being of the poor whom strongly dependent on ecosystem services and effective implemented ecological compensation, that can achieve a win-win between ecological protection and human well-being improvement.\n\nBiodiversity, Ecosystem services, Human well-being, Poverty"]
[19973
 "The economic value of forest ecosystems\n\nForest ecosystems are being de

In [118]:
for foo, bar in df.iterrows():
    print(foo)
    print(bar)
    break

192
citation_id                                                           15153
citation_text             Ecological protection and well-being\n\nBased ...
match_fraction                                                     0.238359
match_fraction_pctrank                                                0.999
regex_match_score                                                   2.25139
regex_match_pctrank                                                       1
Name: 192, dtype: object


In [114]:
json.loads(df[['citation_id', 'citation_text']].head(10).to_json(orient='records'))

[{'citation_id': 15153,
  'citation_text': "Ecological protection and well-being\n\nBased on Sen's function approach, this paper stated the connotation of the well-being, and defined that the freedom choice and capability of the human is the connotation of the human well-being on ecosystem services, while the poverty is the limited for capability and development of the human. Ecosystem degradation and destruction will seriously threat to human well-being, especially the well-being of the poor. Conservation of biodiversity can contribute to the protection of ecosystem services, which as the core of the ecosystem, and then improve the well-being of human. Focused on the well-being of the poor whom strongly dependent on ecosystem services and effective implemented ecological compensation, that can achieve a win-win between ecological protection and human well-being improvement.\n\nBiodiversity, Ecosystem services, Human well-being, Poverty"},
 {'citation_id': 12152,
  'citation_text': 'Th

In [74]:
df.tail()

Unnamed: 0,authors,citation_id,citation_text,publication_year,match_fraction
375,"[Materka, E]",16607,Poland's quiet revolution: Of Shale Ga s explo...,2012.0,0.0
469,"[Cairns Jr, J]",4672,Ecosocietal restoration: Reestablishing humani...,1995.0,0.0
902,"[Rabino, I]",20948,The impact of activist pressures on recombinan...,1991.0,0.0
417,"[Kathirgamanathan, P, McKibbin, R, McLachlan, ...",13423,Inverse Modelling for Identifying the Origin a...,2003.0,0.0
433,"[Kuuskraa, Vello A]",14442,A program to accelerate the deployment of CO2 ...,2007.0,0.0


In [70]:
df.ix[318, :].to_json()

'{"authors":["Moorthy, A L","Pant, A"],"citation_id":19718,"citation_text":"Knowledge management and safeguarding Indian traditional knowledge\\n\\nTraditional knowledge (TK) is the knowledge base of a particular society or community which has been generated over a period of time through the process of learning and sharing by its members. TK is time tested and has been playing a vital role in sustainable development of the indigenous communities. Commercialization of biodiversity and traditional knowledge has increased the threat of misappropriation. Indian cultural and scientific heritage is discussed in this paper. Proper KM practices are required to tap that knowledge. This paper highlights the need to develop an Indigenous Knowledge Management System (IKMS) for indigenous communities which should document the knowledge heritage (without jeopardizing local culture, societal practices, IPRs, etc.) and it should also improve the process of adaptation, adoption and experimentation of t

In [10]:
%%time
records = pgdb.run_query(query, {'project_id': project_id})
text_stream, metadata_stream = textacy.fileio.split_content_and_metadata(records, 'text', itemwise=False)

# corpus = textacy.TextCorpus.from_texts('en', text_stream, metadata=metadata_stream)

spacy_pipeline = textacy.data.load_spacy('en')#, parser=False)
corpus = textacy.TextCorpus(spacy_pipeline)
for text, metadata in zip(text_stream, metadata_stream):
    corpus.add_text(text, lang='en', metadata=metadata)
corpus

CPU times: user 8min 18s, sys: 2.96 s, total: 8min 21s
Wall time: 8min 21s


In [7]:
%%time
records = pgdb.run_query(query, {'project_id': project_id})
text_stream, metadata_stream = textacy.fileio.split_content_and_metadata(records, 'text', itemwise=False)

# corpus = textacy.TextCorpus.from_texts('en', text_stream, metadata=metadata_stream)

spacy_pipeline = textacy.data.load_spacy('en', parser=False)
corpus = textacy.TextCorpus(spacy_pipeline)
for text, metadata in zip(text_stream, metadata_stream):
    corpus.add_text(text, lang='en', metadata=metadata)
corpus

CPU times: user 1min 58s, sys: 1.83 s, total: 2min
Wall time: 2min


In [8]:
corpus

TextCorpus(28343 docs; 7806511 tokens)

In [37]:
match_fractions = []

for doc in corpus:
    if not doc.text:
        match_fractions.append(0.0)
        continue
    match_fractions.append(
        sum(len(match.group()) for match in keyterms_re.finditer(doc.text)) / len(doc.text))

In [46]:
match_fraction_idxs = np.argsort(match_fractions)[::-1]
match_fraction_idxs

array([ 3931, 23503, 14842, ..., 10493, 10513,     0])

In [49]:
for idx in match_fraction_idxs[:5]:
    print('\n')
    print(corpus[idx].text)



Cultural erosion and biodiversity: Canoe-making knowledge in Pohnpei, Micronesia

Erosion of traditional knowledge and practice is a serious and accelerating problem, but quantitative work on traditional knowledge loss and its importance to biodiversity conservation is lacking. We investigated traditional knowledge of canoe making, a skill heavily dependent on plant biodiversity, on Pohnpei, Federated States of Micronesia, through a survey of 180 island residents. Our results showed that there has been an intergenerational erosion of canoe-making skills. Given current trends, the present generation of Pohnpeians may be the last to retain any knowledge of this traditional craft. We also identified several correlates of knowledge loss - including Western educational level and occupation - that highlight potential avenues for skill conservation via governments, traditional leadership, and schools. These institutions could intervene to emphasize traditional knowledge, which would reinfor

In [51]:
for idx in match_fraction_idxs[-8:]:
    print('\n')
    print(corpus[idx].text)






Using land-time-budgets to analyse farming systems and poverty alleviation policies in the Lao PDR

This paper applies the method of 'Land-time-budget analysis' to a rural subsistence community and to the national economy of the Lao PDR. The analysis is conducted to meet two ends: • To identify the community's/the nation's resource use profile in terms of land and time use. The analysis identifies biophysical constraints of socio-economic development and trade-offs in resource use patterns. • To contrast the results of the analysis with national poverty alleviation policies and visualise their effects on local communities. Results show that shifting cultivation, a traditional socio-economic strategy in Laos, is doomed for extinction as a practice for securing subsistence. Little, if any, provisions are made by the planners to persuade shifting cultivators to leave their trade and moving to the lowlands and urban areas. Policies are shown to actually decrease the rate of subsistenc

In [13]:
EMAIL_REGEX = re.compile(r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))", flags=re.IGNORECASE | re.UNICODE)

EMAIL_REGEX.search('burtondewilde') is not None

False

In [52]:
keyterms = {'intervention': {'conservation', 'conserve', 'conservancy',
                             'protect*', 'management', 'awareness', 'law*',
                             'policy*', 'reserve*', 'govern*', 'capacity-build*',
                             'train*', 'regulation', 'payment for ecosystem services', 'PES',
                             'ecotourism', 'sustainable use'},
            'outcome': {'wellbeing', 'well-being', 'well being', 'ecosystem service*',
                        'nutrition', 'skill*', 'empower*', 'clean water', 'livelihood*',
                        'food security', 'resilience*', 'vulnerability', 'social capital',
                        'attitude*', 'perception*', '(human)? health*', 'human capital',
                        '(traditional)? knowledge'},
            'intervention_qualifiers': {'marine', 'freshwater', 'coastal', 'forest*',
                                        'ecosystem*', 'species', 'habitat*', 'biodiversity',
                                        'sustainab*', 'ecolog*', 'integrated', 'landscape',
                                        'seascape', 'coral reef*', 'natural resource*'},
            'outcome_qualifiers': {'human*', 'people', 'person*', 'communit*',
                                   'household*', 'fisher*', 'collaborative'}
            }

keyterms_joined = '|'.join(val.replace('*', '*?')
                           for vals in keyterms.values()
                           for val in vals)
keyterms_re = re.compile(r'(?<=^|\b)(' + keyterms_joined + r')(?=$|\b)', flags=re.IGNORECASE)

records = pgdb.run_query(query, {'project_id': project_id})

match_fractions = []
for record in records:
    if not record['text']:
        match_fractions.append(0.0)
        continue
    match_fractions.append(
        sum(len(match.group()) for match in keyterms_re.finditer(record['text'])) / len(record['text']))

match_fraction_idxs = np.argsort(match_fractions)[::-1]

In [53]:
match_fraction_idxs

array([ 3931, 23503, 14842, ..., 10493, 10513,     0])