In [1]:
import io
import json
import logging
import math
import re

import numpy as np
import pandas as pd
from toolz import itertoolz

import cipy
import textacy

In [2]:
logger = logging.getLogger('cipy')
logger.setLevel(logging.DEBUG)

In [3]:
conn_creds = cipy.db.get_conn_creds('DATABASE_URL')
pgdb = cipy.db.PostgresDB(conn_creds)

In [4]:
review_id = 1
user_id = 1

In [5]:
# load (known!) include/exclude decisions
selection_data = cipy.hacks.load_citation_selection_data()
list(selection_data.items())[:10]

[(1, False),
 (2, False),
 (3, False),
 (4, False),
 (5, False),
 (6, False),
 (7, False),
 (8, False),
 (9, False),
 (10, False)]

In [6]:
query = cipy.db.queries.SELECT_CITATIONS_TO_SCREEN
bindings = {'review_id': 1, 'sample_size': 1000}
df = pd.DataFrame(pgdb.run_query(query, bindings))
df['included'] = df['citation_id'].map(lambda cid: selection_data[cid])

print(df.shape)
df.head()

(1000, 10)


Unnamed: 0,abstract,authors,citation_id,citation_screening,citation_text,doi,keywords,publication_year,title,included
0,"Based on a schematical model, an attempt is ma...","[Appasamy, P, Lundqvist, J]",1189,,WATER-SUPPLY AND WASTE-DISPOSAL STRATEGIES FOR...,,,1993.0,WATER-SUPPLY AND WASTE-DISPOSAL STRATEGIES FOR...,False
1,In light of the increasing mandate for greater...,"[Papageorgiou, K]",19732,,A combined park management framework based on ...,,"[Conservation, Education, Greece, Management s...",2001.0,A combined park management framework based on ...,False
2,Ecosystems provide important services that can...,"[Djoudi, H, Locatelli, B, Pramova, E, Somorin,...",20692,,Forests and trees for social adaptation to cli...,,,2012.0,Forests and trees for social adaptation to cli...,False
3,The directorate-general of the European Commis...,"[Kranz, O, Lang, S, Stephen Clandillon, S]",14214,,Earth observation in conflict\n\nThe directora...,,,2009.0,Earth observation in conflict,False
4,The assumption of this paper is therefore that...,"[Malherbe, B]",16149,,Indigenous knowledge and practice in HIV/AIDS ...,,,2002.0,Indigenous knowledge and practice in HIV/AIDS ...,False


In [63]:
df[df['keywords'].isnull()].apply(lambda x: 1, axis=1)

6      1
10     1
41     1
46     1
51     1
56     1
57     1
68     1
72     1
75     1
80     1
96     1
114    1
120    1
123    1
127    1
131    1
149    1
163    1
166    1
168    1
171    1
173    1
179    1
184    1
192    1
198    1
206    1
212    1
213    1
      ..
914    1
915    1
916    1
917    1
918    1
919    1
925    1
931    1
939    1
940    1
946    1
948    1
949    1
950    1
955    1
958    1
960    1
972    1
978    1
979    1
989    1
990    1
991    1
993    1
994    1
995    1
996    1
997    1
998    1
999    1
dtype: int64

In [71]:
def build_citation_text(row):
    title = row['title'] or ''
    abstract = row['abstract'] or ''
    keywords = '; '.join(row['keywords']) if row['keywords'] else ''
    text = '\n\n'.join((title, abstract, keywords)).strip()
    return text

print(df[df['abstract'].isnull()].apply(build_citation_text, axis=1).iloc[0])

Using psychology to save biodiversity and human well-being


In [9]:
print('# included =', df['included'].sum())

# included = 23


In [10]:
print(df.ix[0, 'citation_text'])

WATER-SUPPLY AND WASTE-DISPOSAL STRATEGIES FOR MADRAS

Based on a schematical model, an attempt is made to relate the magnitude of material flow into and out of Madras, a coastal city in Southeast India. The focus is on the role of the household sector. Water scarcity and seasonal variation in precipitation together with a flat topography are significant problems for proper material-flow management. The comparatively large number of people living in slums, with virtually no sanitary or other facilities, means that the activities and livelihood of about 1.5 million people are outside the control of most planning efforts. In addition, the large cattle population within the city contributes to a wide range of environmental and health hazards. The prevailing socioeconomic conditions mean that most material is reused or recycled. The average amount of garbage is less than one kilo per person and day. However, due to the size of the city and the overall amount of waste it produces, more effo

In [11]:
query = "SELECT keyterms FROM review_plans WHERE review_id = %(review_id)s"
keyterms = list(pgdb.run_query(query, {'review_id': review_id}))[0]['keyterms']
keyterms[:5]

[{'group': 'outcome',
  'synonyms': ['well-being', 'well being'],
  'term': 'wellbeing'},
 {'group': 'outcome',
  'synonyms': ['ecosystem services'],
  'term': 'ecosystem service'},
 {'group': 'outcome', 'synonyms': [], 'term': 'nutrition'},
 {'group': 'outcome', 'synonyms': ['skills'], 'term': 'skill'},
 {'group': 'outcome', 'synonyms': ['empowering'], 'term': 'empower'}]

In [12]:
keyterms_regex = cipy.utils.get_keyterms_regex(keyterms)
keyterms_regex

re.compile(r'(?<=^|\b)(wellbeing|well\-being|well\ being|ecosystem\ service|ecosystem\ services|nutrition|skill|skills|empower|empowering|clean\ water|livelihood|livelihoods|food\ security|resilience|vulnerability|capital|social\ capital|attitude|attitudes|perception|perceptions|health|human\ health|human\ capital|knowledge|traditional\ knowledge|marine|freshwater|coastal|forest|forests|forestry|ecosystem|ecosystems|species|habitat|habitats|biodiversity|sustainable|sustainability|ecology|ecological|integrated|landscape|seascape|coral\ reef|coral\ reefs|natural\ resources|natural\ resource|human|humans|humanity|people|person|persons|community|communities|household|households|fishermen|fisherman|collaborative|conservation|conserve|conservancy|protect|protects|protection|management|awareness|law|laws|policy|policy\-making|reserve|govern|governs|government|capacity\-build|capacity\-building|capacity\ building|train|tarins|training|PES|ecotourism|eco\-tourism|sustainable\ use)(?=$|\b)',
re.

In [36]:
def get_keyterms_regex_match_score(citation_text):
    full_len = len(citation_text)
    if full_len == 0:
        return 0.0
    match_len = sum(len(match.group()) for match in keyterms_regex.finditer(citation_text))
    nonmatch_len = full_len - match_len
    try:
        return math.sqrt(full_len) * match_len / nonmatch_len
    except ValueError:
        print(match_len, nonmatch_len, full_len)
        
        
# def get_keyterms_regex_match_score(citation_text, len_pow_scaling=0.5):
#     try:
#         return (math.pow(len(citation_text), len_pow_scaling)
#                 * sum(len(match.group()) for match in keyterms_regex.finditer(citation_text))
#                 / len(citation_text))
#     except ZeroDivisionError:
#         return 0.0
    
    
# def get_keyterms_regex_match_score(citation_text, len_pow_scaling=0.5):
#     text_len = len(citation_text)
#     if text_len == 0:
#         return 0.0
#     return sum(1 for _ in keyterms_regex.finditer(citation_text)) / math.pow(text_len, len_pow_scaling)


df['regex_match_score'] = df['citation_text'].map(lambda x: get_keyterms_regex_match_score(x))
df['regex_match_pctrank'] = df['regex_match_score'].rank(pct=True, ascending=True)
df.sort_values('regex_match_pctrank', inplace=True, ascending=False)

In [37]:
df.reset_index(drop=True, inplace=True)

In [38]:
df.head(10)

Unnamed: 0,abstract,authors,citation_id,citation_screening,citation_text,doi,keywords,publication_year,title,included,regex_match_score,regex_match_pctrank
0,The patterns of forest resource use in South K...,"[Youn, Y C]",28123,,"Use of forest resources, traditional forest-re...",,"[Accessibility, Forest resource use, Livelihoo...",2009.0,"Use of forest resources, traditional forest-re...",False,12.029494,1.0
1,In the Cardamom Ranges (Cambodia) community-ba...,"[Beilin, R, Lo Cascio, A]",15507,,Of biodiversity and boundaries: A case study o...,,"[biodiversity conservation, community-based na...",2010.0,Of biodiversity and boundaries: A case study o...,False,11.407173,0.999
2,Indigenous people have a vital role in environ...,"[Chun, J]",5842,,A legal approach to induce the traditional kno...,,"[Common law, Indigenous people, Natural resour...",2014.0,A legal approach to induce the traditional kno...,False,10.346002,0.998
3,This chapter describes how ecosystem stewardsh...,"[Chapin, F S, Seastedt, T R, Suding, K N]",23012,,Ecosystem Stewardship as a Framework for Conse...,,"[Ecosystem management, Ecosystem services, Eco...",2013.0,Ecosystem Stewardship as a Framework for Conse...,False,9.894718,0.997
4,This paper presents a synthesis of grassroots ...,"[Pilgrim, S, Pretty, J, Singh, R K]",23730,,Traditional knowledge and biocultural diversit...,,"[Biocultural diversity, Conservation, Liveliho...",2010.0,Traditional knowledge and biocultural diversit...,False,9.823599,0.996
5,Incentives used to encourage local residents t...,"[Jackson, M M, Naughton-Treves, L]",12559,,Eco-bursaries as incentives for conservation a...,,"[Kenya, attitudes, conservation, park-people r...",2012.0,Eco-bursaries as incentives for conservation a...,True,9.425974,0.995
6,There is no denying the close linkage between ...,"[Blanco, E, Razzaque, J]",3241,,Ecosystem services and human well-being in a g...,,,2009.0,Ecosystem services and human well-being in a g...,False,9.4116,0.994
7,Landscape ecology is in a position to become t...,"[Opdam, P, Termorshuizen, J W]",25150,,Landscape services as a bridge between landsca...,,"[Collaborative spatial planning, Ecosystem ser...",2009.0,Landscape services as a bridge between landsca...,False,9.187603,0.993
8,The ecological risk assessment is currently th...,"[Chen, W, Gao, B, He, X, Li, X, Li, Z, Qi, S]",9561,,Assessment of ecological risk of coastal econo...,,"[Ecological risk, Jinzhou Bay, Landscape patte...",2011.0,Assessment of ecological risk of coastal econo...,False,8.79825,0.992
9,Environmental perception has been drowning mor...,"[Zhao, X]",28419,,Environmental perception of farmers of differe...,,"[Environmental perception, Gannan plateau, Liv...",2012.0,Environmental perception of farmers of differe...,False,8.655191,0.991


In [39]:
# df[df['included'] == True]['citation_id']

In [40]:
df[['included', 'regex_match_score']].corr(method='spearman')

Unnamed: 0,included,regex_match_score
included,1.0,0.155604
regex_match_score,0.155604,1.0


In [43]:
for idx, row in df.iterrows():
    if row['citation_screening'] and any(cs['screened_by'] == user_id for cs in row['citation_screening']):
        print(row['citation_id'], 'already screened by you!')
        continue
    dict_row = row.to_dict()
    cipy.utils.present_citation(dict_row)
    print('\nINCLUDED?', selection_data[row['citation_id']])
#     print(row['citation_id'])
#     print(row['citation_text'])
    if idx > 10:
        break


TITLE:    Use of forest resources, traditional forest-related knowledge and livelihood of forest dependent communities: Cases in South Korea
YEAR:     2009.0
AUTHORS:  Youn, Y C
ABSTRACT: The patterns of forest resource use in South Korea have been overviewed along with the forest resource availability to the forest users and in relation to the socio-economic conditions of local people. In South Korea, forest income arises more from non-timber forest products (NTFPs) and forest ecosystems services than from timber. The relationship between availability of forest resources and income of residents in mountainous villages was addressed with statistical analysis of results of household surveys conducted in Gangwon-do Province. The result indicates that the mere existence of forest resources and related cultural heritages is not enough for local communities to obtain income from forest land. Proper arrangements for local communities in accessing the forest resources and knowledge of making

In [38]:
from scipy.stats import spearmanr

spearmanr(df['included'].values, df['regex_match_score'].values)

SpearmanrResult(correlation=0.19789240249525078, pvalue=2.7436948282527215e-10)

In [22]:
for citation in itertoolz.take_nth(50, df[['citation_id', 'citation_text']].values):
    print(citation)

[14653
 "The Spaces of Social Capital: Livelihood Geographies and Marine Conservation in the Cayos Cochinos Marine Protected Area, Honduras\\n\\nThis article explores the relation between, a household's social capital, and its use of marine resources in the Cayos Cochinos Marine Reserve. Recent writings on social capital's role in facilitating community conservation efforts have highlighted the ways in which strong levels of this asset can produce positive conservation outcomes. In contrast, this paper argues that social capital formation and use at the household level can produce a geography of resource use that runs counter to the zoning-based resource restrictions that often typify co-managed conservation, areas. Drawing on ethnographic and survey work from the Cayos Cochinos Marine Protected. Area, this paper shows how marine resources help Garifuna fishing families build networks of trust and reciprocity, which in turn allows them to access marine resources and mobilize them acros

In [118]:
for foo, bar in df.iterrows():
    print(foo)
    print(bar)
    break

192
citation_id                                                           15153
citation_text             Ecological protection and well-being\n\nBased ...
match_fraction                                                     0.238359
match_fraction_pctrank                                                0.999
regex_match_score                                                   2.25139
regex_match_pctrank                                                       1
Name: 192, dtype: object


In [114]:
json.loads(df[['citation_id', 'citation_text']].head(10).to_json(orient='records'))

[{'citation_id': 15153,
  'citation_text': "Ecological protection and well-being\n\nBased on Sen's function approach, this paper stated the connotation of the well-being, and defined that the freedom choice and capability of the human is the connotation of the human well-being on ecosystem services, while the poverty is the limited for capability and development of the human. Ecosystem degradation and destruction will seriously threat to human well-being, especially the well-being of the poor. Conservation of biodiversity can contribute to the protection of ecosystem services, which as the core of the ecosystem, and then improve the well-being of human. Focused on the well-being of the poor whom strongly dependent on ecosystem services and effective implemented ecological compensation, that can achieve a win-win between ecological protection and human well-being improvement.\n\nBiodiversity, Ecosystem services, Human well-being, Poverty"},
 {'citation_id': 12152,
  'citation_text': 'Th

In [74]:
df.tail()

Unnamed: 0,authors,citation_id,citation_text,publication_year,match_fraction
375,"[Materka, E]",16607,Poland's quiet revolution: Of Shale Ga s explo...,2012.0,0.0
469,"[Cairns Jr, J]",4672,Ecosocietal restoration: Reestablishing humani...,1995.0,0.0
902,"[Rabino, I]",20948,The impact of activist pressures on recombinan...,1991.0,0.0
417,"[Kathirgamanathan, P, McKibbin, R, McLachlan, ...",13423,Inverse Modelling for Identifying the Origin a...,2003.0,0.0
433,"[Kuuskraa, Vello A]",14442,A program to accelerate the deployment of CO2 ...,2007.0,0.0


In [70]:
df.ix[318, :].to_json()

'{"authors":["Moorthy, A L","Pant, A"],"citation_id":19718,"citation_text":"Knowledge management and safeguarding Indian traditional knowledge\\n\\nTraditional knowledge (TK) is the knowledge base of a particular society or community which has been generated over a period of time through the process of learning and sharing by its members. TK is time tested and has been playing a vital role in sustainable development of the indigenous communities. Commercialization of biodiversity and traditional knowledge has increased the threat of misappropriation. Indian cultural and scientific heritage is discussed in this paper. Proper KM practices are required to tap that knowledge. This paper highlights the need to develop an Indigenous Knowledge Management System (IKMS) for indigenous communities which should document the knowledge heritage (without jeopardizing local culture, societal practices, IPRs, etc.) and it should also improve the process of adaptation, adoption and experimentation of t

In [10]:
%%time
records = pgdb.run_query(query, {'review_id': review_id})
text_stream, metadata_stream = textacy.fileio.split_content_and_metadata(records, 'text', itemwise=False)

# corpus = textacy.TextCorpus.from_texts('en', text_stream, metadata=metadata_stream)

spacy_pipeline = textacy.data.load_spacy('en')#, parser=False)
corpus = textacy.TextCorpus(spacy_pipeline)
for text, metadata in zip(text_stream, metadata_stream):
    corpus.add_text(text, lang='en', metadata=metadata)
corpus

CPU times: user 8min 18s, sys: 2.96 s, total: 8min 21s
Wall time: 8min 21s


In [7]:
%%time
records = pgdb.run_query(query, {'review_id': review_id})
text_stream, metadata_stream = textacy.fileio.split_content_and_metadata(records, 'text', itemwise=False)

# corpus = textacy.TextCorpus.from_texts('en', text_stream, metadata=metadata_stream)

spacy_pipeline = textacy.data.load_spacy('en', parser=False)
corpus = textacy.TextCorpus(spacy_pipeline)
for text, metadata in zip(text_stream, metadata_stream):
    corpus.add_text(text, lang='en', metadata=metadata)
corpus

CPU times: user 1min 58s, sys: 1.83 s, total: 2min
Wall time: 2min


In [8]:
corpus

TextCorpus(28343 docs; 7806511 tokens)

In [37]:
match_fractions = []

for doc in corpus:
    if not doc.text:
        match_fractions.append(0.0)
        continue
    match_fractions.append(
        sum(len(match.group()) for match in keyterms_re.finditer(doc.text)) / len(doc.text))

In [46]:
match_fraction_idxs = np.argsort(match_fractions)[::-1]
match_fraction_idxs

array([ 3931, 23503, 14842, ..., 10493, 10513,     0])

In [49]:
for idx in match_fraction_idxs[:5]:
    print('\n')
    print(corpus[idx].text)



Cultural erosion and biodiversity: Canoe-making knowledge in Pohnpei, Micronesia

Erosion of traditional knowledge and practice is a serious and accelerating problem, but quantitative work on traditional knowledge loss and its importance to biodiversity conservation is lacking. We investigated traditional knowledge of canoe making, a skill heavily dependent on plant biodiversity, on Pohnpei, Federated States of Micronesia, through a survey of 180 island residents. Our results showed that there has been an intergenerational erosion of canoe-making skills. Given current trends, the present generation of Pohnpeians may be the last to retain any knowledge of this traditional craft. We also identified several correlates of knowledge loss - including Western educational level and occupation - that highlight potential avenues for skill conservation via governments, traditional leadership, and schools. These institutions could intervene to emphasize traditional knowledge, which would reinfor

In [51]:
for idx in match_fraction_idxs[-8:]:
    print('\n')
    print(corpus[idx].text)






Using land-time-budgets to analyse farming systems and poverty alleviation policies in the Lao PDR

This paper applies the method of 'Land-time-budget analysis' to a rural subsistence community and to the national economy of the Lao PDR. The analysis is conducted to meet two ends: • To identify the community's/the nation's resource use profile in terms of land and time use. The analysis identifies biophysical constraints of socio-economic development and trade-offs in resource use patterns. • To contrast the results of the analysis with national poverty alleviation policies and visualise their effects on local communities. Results show that shifting cultivation, a traditional socio-economic strategy in Laos, is doomed for extinction as a practice for securing subsistence. Little, if any, provisions are made by the planners to persuade shifting cultivators to leave their trade and moving to the lowlands and urban areas. Policies are shown to actually decrease the rate of subsistenc

In [52]:
keyterms = {'intervention': {'conservation', 'conserve', 'conservancy',
                             'protect*', 'management', 'awareness', 'law*',
                             'policy*', 'reserve*', 'govern*', 'capacity-build*',
                             'train*', 'regulation', 'payment for ecosystem services', 'PES',
                             'ecotourism', 'sustainable use'},
            'outcome': {'wellbeing', 'well-being', 'well being', 'ecosystem service*',
                        'nutrition', 'skill*', 'empower*', 'clean water', 'livelihood*',
                        'food security', 'resilience*', 'vulnerability', 'social capital',
                        'attitude*', 'perception*', '(human)? health*', 'human capital',
                        '(traditional)? knowledge'},
            'intervention_qualifiers': {'marine', 'freshwater', 'coastal', 'forest*',
                                        'ecosystem*', 'species', 'habitat*', 'biodiversity',
                                        'sustainab*', 'ecolog*', 'integrated', 'landscape',
                                        'seascape', 'coral reef*', 'natural resource*'},
            'outcome_qualifiers': {'human*', 'people', 'person*', 'communit*',
                                   'household*', 'fisher*', 'collaborative'}
            }

keyterms_joined = '|'.join(val.replace('*', '*?')
                           for vals in keyterms.values()
                           for val in vals)
keyterms_re = re.compile(r'(?<=^|\b)(' + keyterms_joined + r')(?=$|\b)', flags=re.IGNORECASE)

records = pgdb.run_query(query, {'review_id': review_id})

match_fractions = []
for record in records:
    if not record['text']:
        match_fractions.append(0.0)
        continue
    match_fractions.append(
        sum(len(match.group()) for match in keyterms_re.finditer(record['text'])) / len(record['text']))

match_fraction_idxs = np.argsort(match_fractions)[::-1]

In [53]:
match_fraction_idxs

array([ 3931, 23503, 14842, ..., 10493, 10513,     0])