In [1]:
import json
import logging
import math
import re

import numpy as np
import pandas as pd
from toolz import itertoolz

import cipy
import textacy

In [2]:
logger = logging.getLogger('cipy')
logger.setLevel(logging.DEBUG)

In [3]:
conn_creds = cipy.db.get_conn_creds('DATABASE_URL')
citations_db = cipy.db.PostgresDB(conn_creds, ddl='citations')

In [4]:
project_id = 1

In [5]:
query = """
SELECT
    citation_id,
    TRIM('\n' FROM concat_ws('\n\n', COALESCE(title, ''),
                   COALESCE(abstract, ''),
                   COALESCE(array_to_string(keywords, ', '), ''))) AS citation_text
FROM citations
WHERE
    project_id = %(project_id)s
    AND (citation_id NOT IN (SELECT citation_id FROM duplicates)
         OR citation_id IN (SELECT canonical_citation_id FROM duplicates))
ORDER BY random()
LIMIT 1000
"""

df = pd.DataFrame(citations_db.run_query(query, {'project_id': project_id}))

In [6]:
print(df.shape)
df.head()

(1000, 2)


Unnamed: 0,citation_id,citation_text
0,19677,Assessing and improving health in the workplac...
1,2785,Understanding uncertainty and reducing vulnera...
2,17228,Comparative study of forest under participator...
3,27673,Biomonitoring of trace element air pollution: ...
4,4240,Thinking down or acting up? governmental and g...


In [7]:
print(df.ix[4, 'citation_text'])

Thinking down or acting up? governmental and grassroots environmental thought and action in the transboundary northern plains

It is widely recognized that the 49th parallel is an arbitrary construction. A political ecology of the Frenchman River-Bitter Creek Area, straddling the Saskatchewan-Montana border, shows that despite parallel socioeconomic and ecological conditions and threats on either side of this border, very different political and managerial regimes apply. Differences are especially pronounced with respect to grasslands conservation and management, and how these impact agricultural livelihoods. More troubling is the dearth of knowledge within each country with respect to the management activities undertaken in the other.

Community-based approaches, Conservation, Northern Great Plains, Political ecology, Transboundary environmental management


In [8]:
results = citations_db.run_query(
    cipy.db.queries.GET_CITATION_TEXTS_SAMPLE, {'project_id': project_id})
df = pd.DataFrame(results)
print(df.shape)
df.head()

(1000, 2)


Unnamed: 0,citation_id,citation_text
0,6379,Seed dispersal distances and plant migration p...
1,10392,Genotoxicity and metabolism of the source-wate...
2,9146,Risk management: A proposal for communication ...
3,23062,"Significance of toponyms, with emphasis on fie..."
4,8604,Effects of element separation on perceptual gr...


In [9]:
keyterms = {'intervention': {'conservation', 'conserve', 'conservancy',
                             'protect*', 'management', 'awareness', 'law*',
                             'policy*', 'reserve*', 'govern*', 'capacity\-build*',
                             'train*', 'regulation', 'payment for ecosystem services', 'PES',
                             'ecotourism', 'sustainable use'},
            'outcome': {'wellbeing', 'well\-being', 'well being', 'ecosystem service*',
                        'nutrition', 'skill*', 'empower*', 'clean water', 'livelihood*',
                        'food security', 'resilience*', 'vulnerability', 'social capital',
                        'attitude*', 'perception*', '(human )?health*', 'human capital',
                        '(traditional )?knowledge'},
            'intervention_qualifiers': {'marine', 'freshwater', 'coastal', 'forest*',
                                        'ecosystem*', 'species', 'habitat*', 'biodiversity',
                                        'sustainab*', 'ecolog*', 'integrated', 'landscape',
                                        'seascape', 'coral reef*', 'natural resource*'},
            'outcome_qualifiers': {'human*', 'people', 'person*', 'communit*',
                                   'household*', 'fisher*', 'collaborative'}
            }

keyterms_joined = '|'.join(val.replace('*', '*?')
                           for vals in keyterms.values()
                           for val in vals)
keyterms_re = re.compile(r'(?<=^|\b)(' + keyterms_joined + r')(?=$|\b)', flags=re.IGNORECASE)
keyterms_re

re.compile(r'(?<=^|\b)(person*?|household*?|fisher*?|communit*?|human*?|collaborative|people|human capital|(human )?health*?|(traditional )?knowledge|ecosystem service*?|resilience*?|well\-being|wellbeing|livelihood*?|well being|nutrition|social capital|perception*?|clean water|empower*?|vulnerability|food security|attitude*?|skill*?|PES|reserve*?|protect*?|conserve|conservation|train*?|govern*?|sustainable use|policy*?|awareness|capacity\-build*?|management|regulation|ecotourism|payment for ecosystem services|law*?|conservancy|marine|biodiversity|ecosystem*?|natural resource*?|coral reef*?|sustainab*?|habitat*?|species|coastal|freshwater|seascape|integrated|landscape|forest*?|ecolog*?)(?=$|\b)',
re.IGNORECASE|re.UNICODE)

In [20]:
# def get_regex_match_score(citation_text, len_pow_scaling=0.33):
#     try:
#         return (math.pow(len(citation_text), len_pow_scaling)
#                 * sum(len(match.group()) for match in keyterms_re.finditer(citation_text))
#                 / len(citation_text))
#     except ZeroDivisionError:
#         return 0.0

def get_regex_match_score(citation_text):
    full_len = len(citation_text)
    if full_len == 0:
        return 0.0
    match_len = sum(len(match.group()) for match in keyterms_re.finditer(citation_text))
    nonmatch_len = full_len - match_len
    try:
        return match_len/nonmatch_len
    except ValueError:
        print(match_len, nonmatch_len, full_len)

df['regex_match_score'] = df['citation_text'].map(lambda x: get_regex_match_score(x))
df['regex_match_pctrank'] = df['regex_match_score'].rank(pct=True, ascending=True)
df.sort_values('regex_match_pctrank', inplace=True, ascending=False)

In [23]:
df.head(10)

Unnamed: 0,citation_id,citation_text,regex_match_score,regex_match_pctrank
184,14653,The Spaces of Social Capital: Livelihood Geogr...,0.239224,1.0
367,28688,Forests and trees outside forests are essentia...,0.215592,0.999
69,23756,Values and people's participation in community...,0.200828,0.998
327,17189,On the hope for biodiversity-friendly tropical...,0.198492,0.997
473,12997,The influence of social capital on environment...,0.194245,0.996
717,952,CROP BIODIVERSITY CONSERVATION: HOPES FOR FUTU...,0.186825,0.995
969,26700,Effects of community-based collaborative group...,0.18416,0.994
428,12100,Agroforestry for biodiversity conservation of ...,0.166078,0.993
554,23715,People's participation in management of protec...,0.164223,0.992
604,7235,Motivations for conserving urban biodiversity\...,0.162476,0.991


In [22]:
for citation in itertoolz.take_nth(50, df[['citation_id', 'citation_text']].values):
    print(citation)

[14653
 "The Spaces of Social Capital: Livelihood Geographies and Marine Conservation in the Cayos Cochinos Marine Protected Area, Honduras\\n\\nThis article explores the relation between, a household's social capital, and its use of marine resources in the Cayos Cochinos Marine Reserve. Recent writings on social capital's role in facilitating community conservation efforts have highlighted the ways in which strong levels of this asset can produce positive conservation outcomes. In contrast, this paper argues that social capital formation and use at the household level can produce a geography of resource use that runs counter to the zoning-based resource restrictions that often typify co-managed conservation, areas. Drawing on ethnographic and survey work from the Cayos Cochinos Marine Protected. Area, this paper shows how marine resources help Garifuna fishing families build networks of trust and reciprocity, which in turn allows them to access marine resources and mobilize them acros

In [118]:
for foo, bar in df.iterrows():
    print(foo)
    print(bar)
    break

192
citation_id                                                           15153
citation_text             Ecological protection and well-being\n\nBased ...
match_fraction                                                     0.238359
match_fraction_pctrank                                                0.999
regex_match_score                                                   2.25139
regex_match_pctrank                                                       1
Name: 192, dtype: object


In [114]:
json.loads(df[['citation_id', 'citation_text']].head(10).to_json(orient='records'))

[{'citation_id': 15153,
  'citation_text': "Ecological protection and well-being\n\nBased on Sen's function approach, this paper stated the connotation of the well-being, and defined that the freedom choice and capability of the human is the connotation of the human well-being on ecosystem services, while the poverty is the limited for capability and development of the human. Ecosystem degradation and destruction will seriously threat to human well-being, especially the well-being of the poor. Conservation of biodiversity can contribute to the protection of ecosystem services, which as the core of the ecosystem, and then improve the well-being of human. Focused on the well-being of the poor whom strongly dependent on ecosystem services and effective implemented ecological compensation, that can achieve a win-win between ecological protection and human well-being improvement.\n\nBiodiversity, Ecosystem services, Human well-being, Poverty"},
 {'citation_id': 12152,
  'citation_text': 'Th

In [74]:
df.tail()

Unnamed: 0,authors,citation_id,citation_text,publication_year,match_fraction
375,"[Materka, E]",16607,Poland's quiet revolution: Of Shale Ga s explo...,2012.0,0.0
469,"[Cairns Jr, J]",4672,Ecosocietal restoration: Reestablishing humani...,1995.0,0.0
902,"[Rabino, I]",20948,The impact of activist pressures on recombinan...,1991.0,0.0
417,"[Kathirgamanathan, P, McKibbin, R, McLachlan, ...",13423,Inverse Modelling for Identifying the Origin a...,2003.0,0.0
433,"[Kuuskraa, Vello A]",14442,A program to accelerate the deployment of CO2 ...,2007.0,0.0


In [70]:
df.ix[318, :].to_json()

'{"authors":["Moorthy, A L","Pant, A"],"citation_id":19718,"citation_text":"Knowledge management and safeguarding Indian traditional knowledge\\n\\nTraditional knowledge (TK) is the knowledge base of a particular society or community which has been generated over a period of time through the process of learning and sharing by its members. TK is time tested and has been playing a vital role in sustainable development of the indigenous communities. Commercialization of biodiversity and traditional knowledge has increased the threat of misappropriation. Indian cultural and scientific heritage is discussed in this paper. Proper KM practices are required to tap that knowledge. This paper highlights the need to develop an Indigenous Knowledge Management System (IKMS) for indigenous communities which should document the knowledge heritage (without jeopardizing local culture, societal practices, IPRs, etc.) and it should also improve the process of adaptation, adoption and experimentation of t

In [10]:
%%time
records = pgdb.run_query(query, {'project_id': project_id})
text_stream, metadata_stream = textacy.fileio.split_content_and_metadata(records, 'text', itemwise=False)

# corpus = textacy.TextCorpus.from_texts('en', text_stream, metadata=metadata_stream)

spacy_pipeline = textacy.data.load_spacy('en')#, parser=False)
corpus = textacy.TextCorpus(spacy_pipeline)
for text, metadata in zip(text_stream, metadata_stream):
    corpus.add_text(text, lang='en', metadata=metadata)
corpus

CPU times: user 8min 18s, sys: 2.96 s, total: 8min 21s
Wall time: 8min 21s


In [7]:
%%time
records = pgdb.run_query(query, {'project_id': project_id})
text_stream, metadata_stream = textacy.fileio.split_content_and_metadata(records, 'text', itemwise=False)

# corpus = textacy.TextCorpus.from_texts('en', text_stream, metadata=metadata_stream)

spacy_pipeline = textacy.data.load_spacy('en', parser=False)
corpus = textacy.TextCorpus(spacy_pipeline)
for text, metadata in zip(text_stream, metadata_stream):
    corpus.add_text(text, lang='en', metadata=metadata)
corpus

CPU times: user 1min 58s, sys: 1.83 s, total: 2min
Wall time: 2min


In [8]:
corpus

TextCorpus(28343 docs; 7806511 tokens)

In [37]:
match_fractions = []

for doc in corpus:
    if not doc.text:
        match_fractions.append(0.0)
        continue
    match_fractions.append(
        sum(len(match.group()) for match in keyterms_re.finditer(doc.text)) / len(doc.text))

In [46]:
match_fraction_idxs = np.argsort(match_fractions)[::-1]
match_fraction_idxs

array([ 3931, 23503, 14842, ..., 10493, 10513,     0])

In [49]:
for idx in match_fraction_idxs[:5]:
    print('\n')
    print(corpus[idx].text)



Cultural erosion and biodiversity: Canoe-making knowledge in Pohnpei, Micronesia

Erosion of traditional knowledge and practice is a serious and accelerating problem, but quantitative work on traditional knowledge loss and its importance to biodiversity conservation is lacking. We investigated traditional knowledge of canoe making, a skill heavily dependent on plant biodiversity, on Pohnpei, Federated States of Micronesia, through a survey of 180 island residents. Our results showed that there has been an intergenerational erosion of canoe-making skills. Given current trends, the present generation of Pohnpeians may be the last to retain any knowledge of this traditional craft. We also identified several correlates of knowledge loss - including Western educational level and occupation - that highlight potential avenues for skill conservation via governments, traditional leadership, and schools. These institutions could intervene to emphasize traditional knowledge, which would reinfor

In [51]:
for idx in match_fraction_idxs[-8:]:
    print('\n')
    print(corpus[idx].text)






Using land-time-budgets to analyse farming systems and poverty alleviation policies in the Lao PDR

This paper applies the method of 'Land-time-budget analysis' to a rural subsistence community and to the national economy of the Lao PDR. The analysis is conducted to meet two ends: • To identify the community's/the nation's resource use profile in terms of land and time use. The analysis identifies biophysical constraints of socio-economic development and trade-offs in resource use patterns. • To contrast the results of the analysis with national poverty alleviation policies and visualise their effects on local communities. Results show that shifting cultivation, a traditional socio-economic strategy in Laos, is doomed for extinction as a practice for securing subsistence. Little, if any, provisions are made by the planners to persuade shifting cultivators to leave their trade and moving to the lowlands and urban areas. Policies are shown to actually decrease the rate of subsistenc

In [52]:
keyterms = {'intervention': {'conservation', 'conserve', 'conservancy',
                             'protect*', 'management', 'awareness', 'law*',
                             'policy*', 'reserve*', 'govern*', 'capacity-build*',
                             'train*', 'regulation', 'payment for ecosystem services', 'PES',
                             'ecotourism', 'sustainable use'},
            'outcome': {'wellbeing', 'well-being', 'well being', 'ecosystem service*',
                        'nutrition', 'skill*', 'empower*', 'clean water', 'livelihood*',
                        'food security', 'resilience*', 'vulnerability', 'social capital',
                        'attitude*', 'perception*', '(human)? health*', 'human capital',
                        '(traditional)? knowledge'},
            'intervention_qualifiers': {'marine', 'freshwater', 'coastal', 'forest*',
                                        'ecosystem*', 'species', 'habitat*', 'biodiversity',
                                        'sustainab*', 'ecolog*', 'integrated', 'landscape',
                                        'seascape', 'coral reef*', 'natural resource*'},
            'outcome_qualifiers': {'human*', 'people', 'person*', 'communit*',
                                   'household*', 'fisher*', 'collaborative'}
            }

keyterms_joined = '|'.join(val.replace('*', '*?')
                           for vals in keyterms.values()
                           for val in vals)
keyterms_re = re.compile(r'(?<=^|\b)(' + keyterms_joined + r')(?=$|\b)', flags=re.IGNORECASE)

records = pgdb.run_query(query, {'project_id': project_id})

match_fractions = []
for record in records:
    if not record['text']:
        match_fractions.append(0.0)
        continue
    match_fractions.append(
        sum(len(match.group()) for match in keyterms_re.finditer(record['text'])) / len(record['text']))

match_fraction_idxs = np.argsort(match_fractions)[::-1]

In [53]:
match_fraction_idxs

array([ 3931, 23503, 14842, ..., 10493, 10513,     0])