In [1]:
import io
import json
import logging
from pprint import pprint
import re

import pandas as pd
import numpy as np
import textacy

import cipy

In [2]:
# Lower the 'cipy' logger's threshold to DEBUG for this session.
logger = logging.getLogger('cipy')
logger.setLevel(logging.DEBUG)

In [3]:
# Look up connection credentials under the 'DATABASE_URL' key (presumably an
# environment variable -- confirm in cipy.db), then build the PostgresDB
# handle used for all queries below, configured with the 'citations' DDL.
conn_creds = cipy.db.get_conn_creds('DATABASE_URL')
pgdb = cipy.db.PostgresDB(conn_creds, ddl='citations')

In [4]:
# Fetch every citation that either has no entry in `duplicates` or appears
# there as a canonical_citation_id, ordered by citation id.
query = """
SELECT citation_id, authors, title, abstract, publication_year, doi
FROM citations
WHERE
    (citation_id NOT IN (SELECT citation_id FROM duplicates)
    OR citation_id IN (SELECT canonical_citation_id FROM duplicates))
ORDER BY citation_id
"""
rows = pgdb.run_query(query)
# make each row immutable before it is handed to the deduper
records = list(map(cipy.db.make_immutable, rows))
print(len(records))
records[:1]

28343


[{'abstract': None,
  'authors': ('113 Congress, Rep. McDermott',),
  'citation_id': 1,
  'doi': None,
  'publication_year': None,
  'title': 'Managed Carbon Price Act of 2014, H.R. 4754'}]

In [10]:
# Spreadsheet of manually reviewed search results.
filepath = '/Users/burtondewilde/Desktop/datakind/ci/conservation-intl/data/raw/Combined Search_Results_Top_3.xls'

# Map the spreadsheet's headers onto the field names used by the citation records.
column_renames = {
    'Author': 'authors',
    'Title': 'title',
    'Abstract': 'abstract',
    'Year': 'publication_year',
}
df = pd.read_excel(filepath).rename(columns=column_renames)

df.head(3)

Unnamed: 0,authors,publication_year,title,abstract,Included,Abstract review needed,Unresolved,Excluded non-conservation,Excluded non nature-based intervention,Excluded - no conservation intervention,Excluded - other,Excluded - no outcome,Excluded - biophysical outcome,Excluded - other.1,Excluded - study design,Excluded Subject,Comments,Journal
0,,1977.0,The dietitian in primary health care,This statement was prepared by The American Di...,,,,,,,,,,,,,,Journal of the American Dietetic Association
1,,1986.0,Life-styles and health,A new perspective is needed on lifestyles and ...,,,,,,,,,,,,,,Social Science and Medicine
2,,1991.0,Supportive environments for health: The Sundsv...,The Sundsvall Conference on supportive environ...,,,,,,,,,,,,,,Health Promotion International


In [11]:
# Imported citations are numbered strictly above every id already loaded
# from the database, so the two sources can be told apart by id alone.
highest_existing_id = max(rec['citation_id'] for rec in records)
min_new_citation_id = highest_existing_id + 1
min_new_citation_id

28710

In [12]:
# Inspect the spreadsheet's column labels.
df.columns

Index(['authors', 'publication_year', 'title', 'abstract', 'Included',
       'Abstract review needed', 'Unresolved', 'Excluded non-conservation',
       'Excluded non nature-based intervention',
       'Excluded - no conservation intervention', 'Excluded - other',
       'Excluded - no outcome', 'Excluded - biophysical outcome',
       'Excluded - other.1', 'Excluded - study design', 'Excluded Subject',
       'Comments', 'Journal'],
      dtype='object')

In [13]:
def _clean_text(raw_text):
    """Return `raw_text` run through textacy's unicode fix, or None if empty/missing."""
    if not raw_text:
        return None
    return textacy.preprocess_text(raw_text, fix_unicode=True)


def _clean_authors(raw_authors):
    """Split a ';'-delimited author string into a sorted list of cleaned names.

    Fragments that are empty after cleaning (caused by stray, doubled or
    trailing ';') are dropped so they never reach the deduper as '' authors.
    Returns None when the input is empty/missing or nothing usable remains.
    """
    if not raw_authors:
        return None
    cleaned = []
    for author in re.split(r'\s*;\s*', raw_authors):
        author = textacy.preprocess_text(author, fix_unicode=True)
        # strip a period that directly follows a word character and precedes
        # a space or end of string, e.g. "Smith, J." -> "Smith, J"
        author = re.sub(r'(?<=\w)\.(?=( |$))', '', author)
        if author.strip():
            cleaned.append(author)
    return sorted(cleaned) or None


def _clean_included_record(raw_record, citation_id):
    """Build one normalized citation dict from a raw spreadsheet row.

    Args:
        raw_record (dict): row with 'authors', 'title', 'abstract' and
            'publication_year' keys; 'doi' is optional.
        citation_id (int): id to assign to the new record.

    Returns:
        dict: citation_id, publication_year, authors, abstract, title, doi;
        missing or empty values are None.
    """
    publication_year = raw_record.get('publication_year')
    return {
        'citation_id': citation_id,
        # years arrive as floats (e.g. 1977.0) from the spreadsheet
        'publication_year': int(publication_year) if publication_year else None,
        'authors': _clean_authors(raw_record.get('authors')),
        'abstract': _clean_text(raw_record.get('abstract')),
        'title': _clean_text(raw_record.get('title')),
        # the spreadsheet selection has no 'doi' column, so this is normally None
        'doi': raw_record.get('doi') or None,
    }


# Keep only rows flagged in the 'Included' column, restricted to the fields
# that the citation records carry.
included_rows = df.loc[df['Included'].notnull(),
                       ['authors', 'title', 'abstract', 'publication_year']]
# Round-trip through JSON to get plain dicts (missing cells are expected to
# come back as None rather than NaN).
raw_included_records = json.loads(included_rows.to_json(orient='records'))
print(len(raw_included_records))

# clean em up, numbering the new records from min_new_citation_id upward
included_records = [
    cipy.db.make_immutable(_clean_included_record(raw_record, citation_id))
    for citation_id, raw_record in enumerate(raw_included_records,
                                             start=min_new_citation_id)
]

included_records[:1]

797


[{'abstract': "This article examines the recent convergence of community-based and transboundary natural resource management in Africa. We suggest that both approaches have potential application to common-pool resources such as floodplain fisheries. However, a merging of transboundary and community-based management may reinforce oversimplifications about heterogeneity in resources, users, and institutions. A scalar mismatch between the ecosystem of concern in transboundary management and local resources of concern in community-based management, as well as different colonial and post-colonial histories contribute to this heterogeneity. We describe a fishery shared by Namibia and Zambia in terms of hybrid fisheries management. We examine settlement patterns, fishermen characteristics, sources of conflict, and perceptions regarding present and potential forms of fisheries management in the area. We also consider the implications that initiatives to manage resources on the local and ecosys

In [35]:
# Index every record -- existing ones first, then the imported ones -- by
# its citation id.
all_records = {
    rec['citation_id']: rec
    for batch in (records, included_records)
    for rec in batch
}

In [36]:
# Number of distinct citation ids across the existing and imported records.
len(all_records)

29140

In [37]:
# Load the deduper from its saved settings file.
deduper_path = '/Users/burtondewilde/Desktop/datakind/ci/conservation-intl/models/dedupe_citations_settings'
# Cutoff passed to deduper.match() -- presumably the minimum score for
# records to be clustered together; TODO confirm against the deduper's docs.
threshold = 0.8

# num_cores=1: run on a single core
deduper = cipy.db.get_deduper(deduper_path, num_cores=1)

In [38]:
# Cluster likely duplicates across the combined record set. Each result is
# unpacked downstream as a (cluster_ids, cluster_scores) pair.
clustered_dupes = deduper.match(
    all_records,
    threshold)

In [39]:
# Ids of all imported ("included") records.
all_included_cids = set(rec['citation_id'] for rec in included_records)
matched_included_cids, matched_cids = set(), set()

for cluster_ids, cluster_scores in clustered_dupes:
    # ids below min_new_citation_id came from the database; the rest were imported
    existing_ids = [cid for cid in cluster_ids if cid < min_new_citation_id]
    imported_ids = [cid for cid in cluster_ids if cid >= min_new_citation_id]
    # only clusters that link an existing citation to an imported one count
    if not (existing_ids and imported_ids):
        continue
    matched_included_cids.update(imported_ids)
    matched_cids.update(existing_ids)

print('# included records =', len(all_included_cids))
print('# matched included records =', len(matched_included_cids))

# included records = 797
# matched included records = 718


In [41]:
# Label every existing citation: True when it was clustered with at least
# one imported "included" record, False otherwise.
included_or_not = [
    dict(citation_id=cid, included=(cid in matched_cids))
    for cid in (rec['citation_id'] for rec in records)
]
included_or_not[:10]

[{'citation_id': 1, 'included': False},
 {'citation_id': 2, 'included': False},
 {'citation_id': 3, 'included': False},
 {'citation_id': 4, 'included': False},
 {'citation_id': 5, 'included': False},
 {'citation_id': 6, 'included': False},
 {'citation_id': 7, 'included': False},
 {'citation_id': 8, 'included': False},
 {'citation_id': 9, 'included': False},
 {'citation_id': 10, 'included': False}]

In [44]:
# Persist the per-citation inclusion labels as pretty-printed JSON.
filepath = '/Users/burtondewilde/Desktop/datakind/ci/conservation-intl/data/processed/citation_selection.json'
# ensure_ascii=False makes json write non-ASCII characters verbatim, so pin
# the file encoding instead of relying on the platform's locale default.
with io.open(filepath, mode='wt', encoding='utf-8') as f:
    json.dump(included_or_not, f, ensure_ascii=False, indent=4)

In [42]:
# Eyeball two-record clusters that pair an existing citation with an
# imported one; only the first five clusters (i = 0..4) are examined.
for i, (cluster_ids, cluster_scores) in enumerate(clustered_dupes):
    is_pair = len(cluster_ids) == 2
    has_existing = any(cid < min_new_citation_id for cid in cluster_ids)
    has_imported = any(cid >= min_new_citation_id for cid in cluster_ids)

    if is_pair and has_existing and has_imported:
        first_id, second_id = cluster_ids
        print('\n')
        print('-' * 52)
        print('ids:', cluster_ids, 'scores:', cluster_scores)
        pprint(all_records[first_id])
        print()
        pprint(all_records[second_id])

    if i > 3:
        break



----------------------------------------------------
ids: (12288, 29299) scores: (0.99998337, 0.99998337)
{'abstract': 'A progressive part of protected area management program in Lao '
             'PDR is a land and forest allocation program which contains '
             'critical elements that delegate right of land and forest use to '
             'local people. This study analyzes the gap between the original '
             'intent of the program and its actual implementation by local '
             'officials in Phou Xang He protected area, and discusses policy '
             'issues that need to be addressed. It appears that several types '
             'of non-compliance with regulations by local people have '
             'occurred, with local officials tacitly ignoring infractions. By '
             'switching viewpoints, however, it appears that the local '
             'officials permit these infractions as a way of allowing local '
             'people to secure their liv