In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import logging

import cipy

# Raise the threshold of the root logger's first handler to CRITICAL so that
# handler only emits critical records.
logger = logging.getLogger()
# A root logger with no handlers attached has an empty `handlers` list, and
# indexing it would raise IndexError; fall back to None and skip the call.
handler = logger.handlers[0] if logger.handlers else None
if handler is not None:
    handler.setLevel(logging.CRITICAL)

# Fetch connection credentials keyed by 'DATABASE_URL'
# (presumably the name of an environment variable — TODO confirm in cipy.db).
conn_creds = cipy.db.get_conn_creds('DATABASE_URL')
# Database handle used by the query cells below via `pgdb.run_query(...)`.
pgdb = cipy.db.PostgresDB(conn_creds)

---

## User Management

- create new user accounts and store passwords in a secure format
- delete existing user accounts (along with any owned reviews)
- user login

In [2]:
# Run the user-creation script with --test. It prompts for name, email and
# password, and the log line in the output tags the created user as (TEST).
# NOTE(review): --test is presumably a dry run that persists nothing — confirm.
%run ../scripts/create_user.py --test

Enter user name: Bob Minnich
Enter user email: bob@gmail.com
Confirm user email: test




Enter user email: bob@gmail.com
Confirm user email: bob@gmail.com
Enter password: ········
Confirm password: ········


2016-07-27 18:13:54,239 - create_user - INFO - created user (TEST): {'owned_review_ids': None, 'created_ts': '2016-07-27T22:12:07.355207Z', 'name': 'Bob Minnich', 'email': 'bob@gmail.com', 'review_ids': None}


In [3]:
# Display every row of the users table as a list of dicts.
# NOTE(review): the rendered rows include password hashes and e-mail
# addresses; consider clearing this output before sharing the notebook.
user_rows = pgdb.run_query('SELECT * from users')
list(user_rows)

[{'created_ts': datetime.datetime(2016, 7, 7, 17, 56, 7),
  'email': 'burtdewilde@gmail.com',
  'name': 'Burton DeWilde',
  'owned_review_ids': [1],
  'password': '$2a$06$5qfLF4y/sfXkc8XhZ360i.48V5GaQfxF5Uy8zVJcO6dLmUqX9JGie',
  'review_ids': [1],
  'user_id': 1},
 {'created_ts': datetime.datetime(2016, 7, 13, 1, 48, 12),
  'email': 'samc@gmail.com',
  'name': 'Sam C',
  'owned_review_ids': None,
  'password': '$2a$06$MHhP.XjRGfSO7eiDiNLFieJEGoOXxQruc6.Sd3nb.vImvoAuO09x.',
  'review_ids': [],
  'user_id': 3}]

In [4]:
# Run the login script. It prompts for email and password; in the recorded run
# it logged a welcome message with the user's name and id.
%run ../scripts/login_user.py

Enter email: burtdewilde@gmail.com
Enter password: ········


2016-07-27 18:14:31,237 - login_user - INFO - Welcome, Burton DeWilde id=1


In [5]:
# Run the user-deletion script for user id=3 with --test. The log reports the
# user removed from reviews and then deleted, both tagged (TEST).
%run ../scripts/delete_user.py --user_id=3 --test

2016-07-27 18:14:42,558 - delete_user - INFO - deleted user id=3 from reviews (TEST)
2016-07-27 18:14:42,559 - delete_user - INFO - deleted user id=3 (TEST)


---

## Review Management

- create new reviews (with user as owner)
- delete existing owned reviews
- invite/uninvite other users to collaborate on existing reviews
- assign another user as the new owner of an owned review

In [6]:
# Run the review-creation script as user id=1 with --test. It prompts for a
# review name and optional description, and logs the created review as (TEST).
%run ../scripts/create_review.py --user_id=1 --test

Review name: My Great Systematic Review
Review description (optional):
Find out everything about <stuff>.


2016-07-27 18:15:14,474 - create_review - INFO - created review (TEST): {'created_ts': '2016-07-27T22:12:07.351750Z', 'settings': {'num_fulltext_screening_reviewers': 2, 'num_citation_screening_reviewers': 2, 'required_fulltext_screener_id': None, 'required_citation_screener_id': None}, 'name': 'My Great Systematic Review', 'description': 'Find out everything about <stuff>.', 'user_ids': [1], 'owner_user_id': 1}


In [7]:
# Fetch the rows for review 1 and display the first one returned.
review_rows = list(pgdb.run_query("SELECT * FROM reviews where review_id=1"))
review_rows[0]

{'created_ts': datetime.datetime(2016, 7, 13, 1, 40, 27),
 'description': 'International policy has sought to emphasize and strengthen the link between the conservation of natural ecosystems and human development. Furthermore, international conservation organizations have broadened their objectives beyond nature-based goals to recognize the contribution of conservation interventions in sustaining ecosystem services upon which human populations are dependent. While many indices have been developed to measure various human well-being domains, the strength of evidence to support the effects, both positive and negative, of conservation interventions on human well-being, is still unclear.\\n\\nThis protocol describes the methodology for examining the research question: What are the impacts of nature conservation interventions on different domains of human well-being in developing countries? Using systematic mapping, this study will scope and identify studies that measure the impacts of natu

In [8]:
# Run the review-deletion script for review id=1 as user id=1 with --test.
# The log reports the deletion tagged (TEST).
%run ../scripts/delete_review.py --user_id=1 --review_id=1 --test

2016-07-27 18:15:47,010 - delete_review - INFO - deleted review id=1 (TEST)


In [9]:
# Add samc@gmail.com as a collaborator on review id=1, acting as owner id=1.
# No --test flag is passed here; the log reports user id=3 added.
%run ../scripts/manage_collaborators.py --owner_user_id=1 --review_id=1 --add_user_emails "samc@gmail.com"

2016-07-27 18:15:54,321 - manage_collaborators - INFO - user id=3 added as collaborator to review id=1 


In [10]:
# Print both sides of the review/user link: the review's user_ids and the
# user's review_ids. Each query runs immediately before its own print.
for label, sql in (
        ('review:', 'SELECT review_id, user_ids FROM reviews WHERE review_id=1'),
        ('user:', 'SELECT user_id, review_ids FROM users WHERE user_id=3'),
):
    print(label, list(pgdb.run_query(sql)))

review: [{'user_ids': [1, 3], 'review_id': 1}]
user: [{'user_id': 3, 'review_ids': [1]}]


In [11]:
# Remove samc@gmail.com as a collaborator on review id=1, acting as owner id=1.
# No --test flag is passed here; the log reports user id=3 removed.
%run ../scripts/manage_collaborators.py --owner_user_id=1 --review_id=1 --remove_user_emails "samc@gmail.com"

2016-07-27 18:16:10,725 - manage_collaborators - INFO - user id=3 removed as collaborator to review id=1 


In [12]:
# Re-read both sides of the review/user link and print them, review first.
review_rows = list(pgdb.run_query('SELECT review_id, user_ids FROM reviews WHERE review_id=1'))
print('review:', review_rows)
user_rows = list(pgdb.run_query('SELECT user_id, review_ids FROM users WHERE user_id=3'))
print('user:', user_rows)

review: [{'user_ids': [1], 'review_id': 1}]
user: [{'user_id': 3, 'review_ids': []}]


---

## Review Planning

- facilitate systematic review planning while also gathering structured data that informs and is informed by the citation pre-screening process; user entry of the following fields:
    - objective
    - research questions, ranked
    - PICO statements
    - grouped keyterms (with automatic boolean search query generation)
    - selection criteria, with shorthand labels
- automatically generate boolean search queries from given keyterms
- after enough citations have been screened, suggest good/bad keyterms for search query

In [13]:
# Run the review-planning script for review id=1 as user id=1 with --test.
# It prints the project plan and logs the record as valid, tagged (TEST).
%run ../scripts/plan_review.py --user_id=1 --review_id=1 --test


PROJECT PLAN

Objective:
To assess and characterize the current state and distribution of the existing evidence base around the causal linkages between both positive and negative effects of nature conservation and human well-being.

Research Questions:
 0 What are the impacts of nature conservation interventions on different domains of human well-being in developing countries?
 1 What is the current state and distribution of evidence?
 2 What types of impacts from conservation interventions on human well-being are measured?
 3 What types of ecosystem services are explicitly associated with the impacts of conservation interventions on human well-being?
 4 What populations are affected by conservation and/ or focus of studies?
 5 How does the evidence base align with major priorities and investments of implementing agencies?

PICO:
- Population    : Human populations, including individuals, households, communities or nation states in non-OECD countries
- Intervention  : Adoption or impl

2016-07-27 18:18:17,292 - plan_review - INFO - valid record: review_id=1 with {'pico', 'research_questions', 'objective', 'keyterms', 'selection_criteria'}, (TEST)



PROJECT PLAN

Objective:
To assess and characterize the current state and distribution of the existing evidence base around the causal linkages between both positive and negative effects of nature conservation and human well-being.

Research Questions:
 0 What are the impacts of nature conservation interventions on different domains of human well-being in developing countries?
 1 What is the current state and distribution of evidence?
 2 What types of impacts from conservation interventions on human well-being are measured?
 3 What types of ecosystem services are explicitly associated with the impacts of conservation interventions on human well-being?
 4 What populations are affected by conservation and/ or focus of studies?
 5 How does the evidence base align with major priorities and investments of implementing agencies?

PICO:
- Population    : Human populations, including individuals, households, communities or nation states in non-OECD countries
- Intervention  : Adoption or impl

In [14]:
# Read the keyterms stored in review 1's plan and print the boolean search
# query that cipy builds from them.
query = "SELECT keyterms FROM review_plans WHERE review_id = 1"
plan_rows = list(pgdb.run_query(query))
keyterms = plan_rows[0]['keyterms']
boolean_query = cipy.utils.get_boolean_search_query(keyterms)
print(boolean_query)

(("wellbeing" OR "well-being" OR "well being") OR ("ecosystem service" OR "ecosystem services") OR "nutrition" OR ("skill" OR "skills") OR ("empower" OR "empowering") OR ("clean water" OR "livelihood") OR ("livelihoods" OR "food security") OR ("resilience" OR "vulnerability") OR ("capital" OR "social capital") OR ("attitude" OR "attitudes") OR ("perception" OR "perceptions") OR ("health" OR "human health") OR ("human capital" OR "knowledge") OR "traditional knowledge")
AND
(("marine" OR "freshwater") OR "coastal" OR ("forest" OR "forests" OR "forestry") OR ("ecosystem" OR "ecosystems") OR "species" OR ("habitat" OR "habitats") OR "biodiversity" OR ("sustainable" OR "sustainability") OR ("ecology" OR "ecological") OR "integrated" OR "landscape" OR "seascape" OR ("coral reef" OR "coral reefs") OR ("natural resources" OR "natural resource"))
AND
(("human" OR "humans" OR "humanity") OR "people" OR ("person" OR "persons") OR ("community" OR "communities") OR ("household" OR "households") OR

---

## Citation Ingestion and De-duplication

- load citations from RIS or BibTeX files, then parse, standardize, sanitize, validate, and store the data
- identify duplicate citations using a sophisticated model and assign the most complete record in a set of duplicates as the "canonical" record

In [15]:
# Run the citation-ingestion script on the demo RIS file for review id=1 as
# user id=1 with --test. The log lists each valid record and reports the
# insert count tagged (TEST).
%run ../scripts/ingest_citations.py --citations ../data/raw/citation_files/phase_2_demo_citations.ris --user_id=1 --review_id=1 --test

2016-07-27 18:23:44,353 - ingest_citations - INFO - parsing records in ../data/raw/citation_files/phase_2_demo_citations.ris
2016-07-27 18:23:44,355 - ingest_citations - INFO - valid record: Ecological protection and well-being, 2013
2016-07-27 18:23:44,356 - ingest_citations - INFO - valid record: The economic value of forest ecosystems, 2001
2016-07-27 18:23:44,357 - ingest_citations - INFO - valid record: Contribution of tourism development to protected area management: Local stakeholder perspectives, 2009
2016-07-27 18:23:44,357 - ingest_citations - INFO - 3 valid records inserted into appname db (TEST)


In [16]:
# Count the citations stored for review 1 and print the total.
count_rows = list(pgdb.run_query('SELECT COUNT(1) FROM citations WHERE review_id = 1'))
num_citations = count_rows[0]['count']
print('total # citations =', num_citations)

total # citations = 28709


In [17]:
# Run the de-duplication script for review id=1 with an automatically chosen
# threshold and the saved dedupe settings file, with --test. The log reports
# the threshold used and the number of duplicate clusters found.
%run ../scripts/dedupe_records.py --review_id=1 --threshold=auto --settings=../models/dedupe_citations_settings --test

2016-07-27 18:24:07,802 - dedupe_records - INFO - reading dedupe settings from ../models/dedupe_citations_settings
2016-07-27 18:24:07,804 - dedupe_records - INFO - creating dedupe_blocking_map table...
2016-07-27 18:24:45,659 - dedupe_records - INFO - query: "COPY dedupe_blocking_map FROM STDIN CSV /var/folders/w7/dfn6swb12cv8sfbshhqfwwb00000gn/T/blocks_brnw8s09"
2016-07-27 18:24:45,660 - dedupe_records - INFO - creating dedupe_plural_key and dedupe_plural_block tables...
2016-07-27 18:24:45,661 - dedupe_records - INFO - creating dedupe_covered_blocks table...
2016-07-27 18:24:45,662 - dedupe_records - INFO - creating dedupe_smaller_coverage table...
2016-07-27 18:24:45,663 - dedupe_records - INFO - reading dedupe settings from ../models/dedupe_citations_settings
2016-07-27 18:25:08,797 - dedupe_records - INFO - duplicate threshold = 0.827182
2016-07-27 18:25:09,549 - dedupe_records - INFO - found 361 duplicate clusters
2016-07-27 18:25:09,834 - dedupe_records - INFO - upserted status

In [19]:
# Pick the first duplicate cluster (lowest canonical_citation_id) whose average
# duplicate score is below 0.9, i.e. one of the less certain matches.
query = """
SELECT canonical_citation_id, array_agg(citation_id) AS citation_ids, AVG(duplicate_score) AS avg_score
FROM duplicates
GROUP BY 1 HAVING AVG(duplicate_score) < 0.9 ORDER BY 1 ASC
LIMIT 1
"""
dupes = list(pgdb.run_query(query))[0]
duplicate_ids = dupes['citation_ids']
rounded_score = round(dupes['avg_score'], 6)
summary = 'citations {} are duplicates with avg. duplicate score = {}'
print(summary.format(duplicate_ids, rounded_score))

# Present every citation in that cluster so the match can be judged by eye.
query = """
SELECT citation_id, authors, title, abstract, publication_year, keywords
FROM citations
WHERE citation_id = ANY(%(citation_ids)s)
"""
cluster_params = {'citation_ids': duplicate_ids}
for record in pgdb.run_query(query, cluster_params):
    cipy.utils.present_citation(record)

citations [497, 496] are duplicates with avg. duplicate score = 0.827182

TITLE:    INTEGRATIVE SOCIAL WORK APPROACH AS A CONTEXT FOR UNDERSTANDING THE INDIVIDUAL SOCIAL CARE PLAN
YEAR:     2009
AUTHORS:  Ajdukovic, M; Urbanc, K
ABSTRACT: The article deals with the issue of introducing the individual social core plan as one of the initiatives pertaining to the long-awaited social core system reform. The ideas of the individual social care plan are placed within a theoretical framework of the integrative social work approach and in the context of numerous changes which have occurred in the lost twenty years at the conceptual and practical level of offering integrated and coordinated services to service users, ie case management, care management, person-centred planning, etc. Based on the experience that, in order to change the organisation of the Centres for Social Care and develop efficient social care services, the changes to the legal framework are not sufficient but it is necessary 

---

## Initial Ranking of Citations

- sample citations ranked by overlap with keyterms; user pre-screens citations until 10 have been included and 10 have been excluded
- based on included/excluded citations, rank citations by ratio of relevant to irrelevant keyterms and present those most likely to be relevant to the user for pre-screening

In [6]:
# Run the citation-screening script for review id=1 as user id=1 with --auto.
# No --test flag is passed here; the log reports the included/excluded counts.
%run ../scripts/screen_citations.py --user_id=1 --review_id=1 --auto

2016-07-27 19:44:04,357 - screen_citations - INFO - 267 citations included, 9733 citations excluded


---

## Refinement of Search Keyterms

- based on included/excluded citations, create lists of strongly relevant and irrelevant keyterms that can be used to refine initial set of keyterms

In [7]:
import textacy

def combine_citation_text(record):
    """Merge a citation's title, abstract and keywords into one string.

    Args:
        record: mapping with 'title', 'abstract' and 'keywords' entries.
            Falsy values (None, empty string, empty list) are treated as
            empty text; a non-empty 'keywords' value is joined with '; '.

    Returns:
        The three parts joined by blank lines, in the order title, abstract,
        keywords, with leading and trailing whitespace stripped.
    """
    parts = []
    for field in ('title', 'abstract'):
        value = record[field]
        parts.append(value if value else '')
    keyword_list = record['keywords']
    parts.append('; '.join(keyword_list) if keyword_list else '')
    return '\n\n'.join(parts).strip()

# Select the text fields and status of every citation in the review that has
# a non-NULL citation_screening value.
query = """
SELECT t1.title, t1.abstract, t1.keywords, t2.status
FROM
    citations AS t1,
    citation_status AS t2
WHERE
    t1.review_id = %(review_id)s
    AND t1.citation_id = t2.citation_id
    AND t2.citation_screening IS NOT NULL
"""

query_params = {'review_id': 1}
records = list(pgdb.run_query(query, query_params))
print('# records =', len(records))
# One boolean per record, in record order: True where status is 'included'.
included = [record['status'] == 'included' for record in records]
# Lazy pipeline: each record's combined text becomes a TextDoc, then its terms
# list. Nothing is processed until the generators are consumed below.
docs = (
    textacy.TextDoc(combine_citation_text(record), lang='en')
    for record in records
)
terms_lists = (doc.as_terms_list() for doc in docs)
included_keyterms, excluded_keyterms = textacy.keyterms.most_discriminating_terms(
    terms_lists, included, top_n_terms=25)

# records = 10000


In [8]:
# Print the "included" and "excluded" keyterm rankings side by side, one rank
# per row, in two left-aligned 30-character columns.
row_format = '{0:<30} | {1:<30}'
print(row_format.format('top "included" terms', 'top "excluded" terms'))
print('-' * 62)
# Pair the lists with zip() instead of indexing range(25): the output is the
# same when both lists hold 25 terms, and a list with fewer terms no longer
# raises IndexError (rows stop at the shorter list).
for included_term, excluded_term in zip(included_keyterms, excluded_keyterms):
    print(row_format.format(included_term, excluded_term))

top "included" terms           | top "excluded" terms          
--------------------------------------------------------------
conservation                   | health                        
forest                         | human health                  
protect area                   | human                         
livelihood                     | risk                          
local                          | water                         
protect                        | range                         
park                           | specie                        
biodiversity                   | urban                         
local people                   | exposure                      
national park                  | time                          
community                      | pollution                     
participation                  | student                       
forest management              | content                       
benefit                        | predict 

In [9]:
# Reset screening state: for every citation_status row whose citation_screening
# is set, clear citation_screening to NULL and recompute status from the
# deduplication JSON. Status becomes 'included' only when is_duplicate is
# explicitly false; otherwise (true, or missing/NULL) it becomes 'excluded'.
# NOTE(review): the WHERE clause has no review filter, so this appears to
# affect every review's rows, not only review 1 — confirm that is intended.
query = """
UPDATE citation_status
SET
    status = CASE WHEN CAST(deduplication->>'is_duplicate' AS boolean) IS FALSE THEN 'included' ELSE 'excluded' END,
    citation_screening = NULL
WHERE
    citation_screening IS NOT NULL
"""
pgdb.execute(query)