In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import logging

import cipy

# Quiet the logging output of the scripts run below by raising every handler
# already attached to the root logger to CRITICAL. Iterating (rather than
# indexing handlers[0]) avoids an IndexError when no handler is attached yet.
logger = logging.getLogger()
for handler in logger.handlers:
    handler.setLevel(logging.CRITICAL)

# get_conn_creds is given the name 'DATABASE_URL' (presumably an environment
# variable holding the connection string — confirm in cipy.db); the resulting
# credentials are used to open the shared `pgdb` handle used by later cells.
conn_creds = cipy.db.get_conn_creds('DATABASE_URL')
pgdb = cipy.db.PostgresDB(conn_creds)

---

## User Management

- create new user accounts and store passwords in a secure format
- delete existing user accounts (along with any owned reviews)
- user login

In [None]:
# Run the user-creation script with --test (presumably a test/dry-run mode — confirm in the script).
%run ../scripts/create_user.py --test

In [None]:
# Fetch every column of every row in `users`; the resulting list is the
# cell's displayed value.
users_sql = 'SELECT * from users'
list(pgdb.run_query(users_sql))

In [None]:
# Run the user-login script; no command-line arguments are passed.
%run ../scripts/login_user.py

In [None]:
# Run the user-deletion script for user_id=3 with --test.
# NOTE(review): later cells still query user_id=3, so this appears to rely on
# --test not persisting the deletion — confirm in the script.
%run ../scripts/delete_user.py --user_id=3 --test

---

## Review Management

- create new reviews (with user as owner)
- delete existing owned reviews
- invite/uninvite other users to collaborate on existing reviews
- assign another user as an owned review's new owner

In [None]:
# Run the review-creation script as user_id=1 with --test.
%run ../scripts/create_review.py --user_id=1 --test

In [None]:
# Look up review 1 and display its first (expected only) row.
review_rows = list(pgdb.run_query("SELECT * FROM reviews where review_id=1"))
review_rows[0]

In [None]:
# Run the review-deletion script for review_id=1 as user_id=1 with --test.
# NOTE(review): later cells keep using review_id=1, so this appears to rely on
# --test not persisting the deletion — confirm in the script.
%run ../scripts/delete_review.py --user_id=1 --review_id=1 --test

In [None]:
# Run manage_collaborators.py as owner user_id=1 on review_id=1, passing one
# email via --add_user_emails. Unlike most script cells, no --test flag is given.
%run ../scripts/manage_collaborators.py --owner_user_id=1 --review_id=1 --add_user_emails "samc@gmail.com"

In [None]:
# Print the user ids stored on review 1, then the review ids stored on user 3,
# each prefixed with a label.
for label, sql in (
        ('review:', 'SELECT review_id, user_ids FROM reviews WHERE review_id=1'),
        ('user:', 'SELECT user_id, review_ids FROM users WHERE user_id=3')):
    print(label, list(pgdb.run_query(sql)))

In [None]:
# Run manage_collaborators.py as owner user_id=1 on review_id=1, passing the
# same email via --remove_user_emails. No --test flag is given.
%run ../scripts/manage_collaborators.py --owner_user_id=1 --review_id=1 --remove_user_emails "samc@gmail.com"

In [None]:
# Re-run the same two lookups (review 1's user ids, user 3's review ids) and
# print each result with its label.
for label, sql in (
        ('review:', 'SELECT review_id, user_ids FROM reviews WHERE review_id=1'),
        ('user:', 'SELECT user_id, review_ids FROM users WHERE user_id=3')):
    print(label, list(pgdb.run_query(sql)))

---

## Review Planning

- facilitate systematic review planning while also gathering structured data that informs and is informed by the citation pre-screening process; the user enters the following fields:
    - objective
    - research questions, ranked
    - PICO statements
    - grouped keyterms (with automatic boolean search query generation)
    - selection criteria, with shorthand labels
- automatically generate boolean search queries from given keyterms
- after enough citations have been screened, suggest good/bad keyterms for the search query

In [None]:
# Run the review-planning script for review_id=1 as user_id=1 with --test.
%run ../scripts/plan_review.py --user_id=1 --review_id=1 --test

In [None]:
# Read the keyterms stored in review 1's plan and print the boolean search
# query that cipy builds from them.
query = "SELECT keyterms FROM review_plans WHERE review_id = 1"
plan_row = list(pgdb.run_query(query))[0]
keyterms = plan_row['keyterms']
boolean_query = cipy.utils.get_boolean_search_query(keyterms)
print(boolean_query)

---

## Citation Ingestion and De-duplication

- load citations from RIS or BibTeX files, then parse, standardize, sanitize, validate, and store the data
- identify duplicate citations using a sophisticated model and assign the most complete record in a set of duplicates as the "canonical" record

In [None]:
# Run the citation-ingestion script on the demo RIS file for review_id=1 as
# user_id=1 with --test.
%run ../scripts/ingest_citations.py --citations ../data/raw/citation_files/phase_2_demo_citations.ris --user_id=1 --review_id=1 --test

In [None]:
# Count the citations stored for review 1 and report the total.
count_rows = list(pgdb.run_query('SELECT COUNT(1) FROM citations WHERE review_id = 1'))
num_citations = count_rows[0]['count']
print('total # citations =', num_citations)

In [None]:
# Run the de-duplication script for review_id=1 with --threshold=auto, the
# given settings file, and --test.
%run ../scripts/dedupe_records.py --review_id=1 --threshold=auto --settings=../models/dedupe_citations_settings --test

In [None]:
# Pick one cluster of duplicates whose average duplicate score exceeds 0.9
# (the one with the lowest canonical_citation_id) and present its citations.
# NOTE(review): this query has no review filter, unlike the cells scoped to
# review_id = 1 — confirm that is intended.
query = """
SELECT canonical_citation_id, array_agg(citation_id) AS citation_ids, AVG(duplicate_score) AS avg_score
FROM duplicates
GROUP BY 1 HAVING AVG(duplicate_score) > 0.9 ORDER BY 1 ASC
LIMIT 1
"""
dupe_rows = list(pgdb.run_query(query))
if not dupe_rows:
    # No cluster cleared the threshold: report it instead of failing with an
    # IndexError on the empty result.
    print('no duplicates found with avg. duplicate score > 0.9')
else:
    dupes = dupe_rows[0]
    print('citations {} are duplicates with avg. duplicate score = {}'.format(
            dupes['citation_ids'], round(dupes['avg_score'], 6)))

    # Fetch the display fields for every citation id in the chosen cluster.
    query = """
SELECT citation_id, authors, title, abstract, publication_year, keywords
FROM citations
WHERE citation_id = ANY(%(citation_ids)s)
"""
    for record in pgdb.run_query(query, {'citation_ids': dupes['citation_ids']}):
        cipy.utils.present_citation(record)

---

## Initial Ranking of Citations

- sample citations ranked by overlap with keyterms; user pre-screens citations until 10 have been included and 10 have been excluded
- based on included/excluded citations, rank citations by ratio of relevant to irrelevant keyterms and present those most likely to be relevant to the user for pre-screening

In [None]:
# Run the citation-screening script for review_id=1 as user_id=1 with --auto
# and --test (--auto presumably screens without prompting — confirm in the script).
%run ../scripts/screen_citations.py --user_id=1 --review_id=1 --auto --test

---

## Refinement of Search Keyterms

- based on included/excluded citations, create lists of strongly relevant and irrelevant keyterms that can be used to refine the initial set of keyterms

In [None]:
import textacy

def combine_citation_text(record):
    """
    Build a single text blob from a citation's title, abstract, and keywords.

    Args:
        record: mapping with 'title', 'abstract', and 'keywords' entries; a
            falsy value for any of them is treated as empty text, and
            non-empty keywords are joined with '; '

    Returns:
        the three parts joined by blank lines, with leading and trailing
        whitespace stripped
    """
    parts = [
        record['title'] or '',
        record['abstract'] or '',
    ]
    if record['keywords']:
        parts.append('; '.join(record['keywords']))
    else:
        parts.append('')
    return '\n\n'.join(parts).strip()

# Select title/abstract/keywords plus status for every citation of the given
# review whose citation_screening value is not NULL.
query = """
SELECT t1.title, t1.abstract, t1.keywords, t2.status
FROM
    citations AS t1,
    citation_status AS t2
WHERE
    t1.review_id = %(review_id)s
    AND t1.citation_id = t2.citation_id
    AND t2.citation_screening IS NOT NULL
"""

records = list(pgdb.run_query(query, {'review_id': 1}))
print('# records =', len(records))

# One boolean per record, in record order: True where status is 'included'.
included = [record['status'] == 'included' for record in records]

# Lazily convert each record's combined text into a textacy terms list.
terms_lists = (
    textacy.TextDoc(combine_citation_text(record), lang='en').as_terms_list()
    for record in records
)
included_keyterms, excluded_keyterms = textacy.keyterms.most_discriminating_terms(
    terms_lists, included, top_n_terms=25)

In [None]:
# Print the "included" and "excluded" keyterm lists side by side in two
# fixed-width columns. The row count comes from the lists themselves, so a
# list with fewer than 25 terms no longer raises IndexError; a missing entry
# is shown as an empty cell.
row_format = '{0:<30} | {1:<30}'
print(row_format.format('top "included" terms', 'top "excluded" terms'))
print('-' * 62)
n_rows = max(len(included_keyterms), len(excluded_keyterms))
for i in range(n_rows):
    included_term = included_keyterms[i] if i < len(included_keyterms) else ''
    excluded_term = excluded_keyterms[i] if i < len(excluded_keyterms) else ''
    print(row_format.format(included_term, excluded_term))

In [None]:
# Undo the screening state written during the demo (presumably so the notebook
# can be re-run — confirm): clear citation_screening and set status back to
# 'included' for non-duplicates, 'excluded' otherwise.
# The reset is limited to review 1's citations (citation_status is matched to
# citations on citation_id, as in the keyterm query earlier in this notebook)
# so that other reviews' screening decisions are left untouched.
query = """
UPDATE citation_status
SET
    status = CASE WHEN CAST(deduplication->>'is_duplicate' AS boolean) IS FALSE THEN 'included' ELSE 'excluded' END,
    citation_screening = NULL
WHERE
    citation_screening IS NOT NULL
    AND citation_id IN (SELECT citation_id FROM citations WHERE review_id = 1)
"""
pgdb.execute(query)