In [4]:
import io
import json
import logging
from pprint import pprint
import re

import pandas as pd
import numpy as np
import textacy

import colandr



In [6]:
# Fetch every citation that has not been flagged as a duplicate, lowest id first.
query = """
SELECT id, title, abstract, authors, pub_year AS publication_year, doi
FROM citations
WHERE (deduplication->>'is_duplicate')::bool is false
ORDER BY id ASC
"""
# Each result row is turned into a dict and then passed through
# make_record_immutable before being stored.
records = list(map(
    lambda row: colandr.lib.utils.make_record_immutable(dict(row)),
    colandr.lib.utils.execute_raw_sql_query(query)))
print(len(records))
records[:1]

28343


[{'abstract': None,
  'authors': ('113 Congress, Rep. McDermott',),
  'doi': None,
  'id': 1,
  'publication_year': None,
  'title': 'Managed Carbon Price Act of 2014, H.R. 4754'}]

In [3]:
# Read the combined search-results spreadsheet and rename the four columns
# used downstream to the same lowercase field names as the database records.
filepath = '../data/raw/Combined Search_Results_Top_3.xls'
df = pd.read_excel(filepath).rename(
    columns={
        'Author': 'authors',
        'Year': 'publication_year',
        'Abstract': 'abstract',
        'Title': 'title',
    })

df.head(2)

Unnamed: 0,authors,publication_year,title,abstract,Included,Abstract review needed,Unresolved,Excluded non-conservation,Excluded non nature-based intervention,Excluded - no conservation intervention,Excluded - other,Excluded - no outcome,Excluded - biophysical outcome,Excluded - other.1,Excluded - study design,Excluded Subject,Comments,Journal
0,,1977.0,The dietitian in primary health care,This statement was prepared by The American Di...,,,,,,,,,,,,,,Journal of the American Dietetic Association
1,,1986.0,Life-styles and health,A new perspective is needed on lifestyles and ...,,,,,,,,,,,,,,Social Science and Medicine


In [13]:
# Spreadsheet records are assigned ids strictly above every existing citation
# id, so later cells can tell the two populations apart by id alone.
min_new_citation_id = max(record['id'] for record in records) + 1

# Keep only the rows whose 'Included' column is set, restricted to the fields
# used below; the JSON round trip turns the frame into a list of plain dicts.
included_records = df[df['Included'].notnull()][['authors', 'title', 'abstract', 'publication_year']]
included_records = json.loads(included_records.to_json(orient='records'))

# Clean each raw row into the same field layout as the database records.
cleaned_records = []
for offset, raw_record in enumerate(included_records):

    publication_year = raw_record['publication_year']
    # Missing / empty text fields are normalised to '' here and to None below.
    authors = raw_record.get('authors') or ''
    abstract = raw_record.get('abstract') or ''
    title = raw_record.get('title') or ''
    # NOTE: 'doi' is not among the columns selected above, so this is None
    # for every row coming from that selection.
    doi = raw_record.get('doi') or None

    cleaned = {'id': offset + min_new_citation_id}

    cleaned['publication_year'] = int(publication_year) if publication_year else None
    if authors:
        # Authors arrive as one ';'-separated string: split it, preprocess each
        # name, drop a '.' that follows a word character and precedes a space
        # or the end of the string, then sort the names.
        cleaned['authors'] = sorted(
            re.sub(r'(?<=\w)\.(?=( |$))', '', textacy.preprocess_text(author, fix_unicode=True))
            for author in re.split(r'\s*;\s*', authors))
    else:
        cleaned['authors'] = None
    cleaned['abstract'] = (
        textacy.preprocess_text(abstract, fix_unicode=True) if abstract else None)
    cleaned['title'] = (
        textacy.preprocess_text(title, fix_unicode=True) if title else None)
    cleaned['doi'] = doi

    cleaned_records.append(cleaned)

included_records = [colandr.lib.utils.make_record_immutable(record)
                    for record in cleaned_records]

# Report how many records were kept, so the cell reproduces its recorded output.
print('{} "included" records'.format(len(included_records)))
included_records[:1]

797 "included" records


[{'abstract': "This article examines the recent convergence of community-based and transboundary natural resource management in Africa. We suggest that both approaches have potential application to common-pool resources such as floodplain fisheries. However, a merging of transboundary and community-based management may reinforce oversimplifications about heterogeneity in resources, users, and institutions. A scalar mismatch between the ecosystem of concern in transboundary management and local resources of concern in community-based management, as well as different colonial and post-colonial histories contribute to this heterogeneity. We describe a fishery shared by Namibia and Zambia in terms of hybrid fisheries management. We examine settlement patterns, fishermen characteristics, sources of conflict, and perceptions regarding present and potential forms of fisheries management in the area. We also consider the implications that initiatives to manage resources on the local and ecosys

In [18]:
# Index the database records and the cleaned spreadsheet records together,
# keyed by citation id.
all_records = dict(
    (rec['id'], rec)
    for rec in records + included_records)
print('# all records = {}'.format(len(all_records)))

# all records = 29140


In [15]:
# Deduplication setup: the threshold handed to the matching call in a later
# cell, and the path of the saved dedupe settings file.
threshold = 0.8
settings_path = '../colandr_data/dedupe/dedupe_citations_settings'
# Build the deduper from the saved settings, using a single core.
deduper = colandr.lib.utils.load_dedupe_model(settings_path, num_cores=1)

INFO:dedupe.api:((SimplePredicate: (commonIntegerPredicate, title), TfidfNGramCanopyPredicate: (0.8, title)), (SimplePredicate: (metaphoneToken, doi), SimplePredicate: (twoGramFingerprint, doi)), (SimplePredicate: (firstTokenPredicate, title), TfidfNGramCanopyPredicate: (0.6, title)), (SimplePredicate: (firstTokenPredicate, abstract), SimplePredicate: (lastSetElementPredicate, authors)))


In [16]:
# Run the loaded deduper over the combined records at the configured threshold.
# Later cells iterate the result as (cluster_ids, cluster_scores) pairs.
clustered_dupes = deduper.match(all_records, threshold)

INFO:dedupe.index:Removing stop word In
INFO:dedupe.index:Removing stop word fl
INFO:dedupe.index:Removing stop word ue
INFO:dedupe.index:Removing stop word nc
INFO:dedupe.index:Removing stop word eo
INFO:dedupe.index:Removing stop word ff
INFO:dedupe.index:Removing stop word ar
INFO:dedupe.index:Removing stop word mm
INFO:dedupe.index:Removing stop word an
INFO:dedupe.index:Removing stop word ag
INFO:dedupe.index:Removing stop word em
INFO:dedupe.index:Removing stop word nt
INFO:dedupe.index:Removing stop word st
INFO:dedupe.index:Removing stop word le
INFO:dedupe.index:Removing stop word ad
INFO:dedupe.index:Removing stop word op
INFO:dedupe.index:Removing stop word ti
INFO:dedupe.index:Removing stop word no
INFO:dedupe.index:Removing stop word bi
INFO:dedupe.index:Removing stop word lo
INFO:dedupe.index:Removing stop word gi
INFO:dedupe.index:Removing stop word ca
INFO:dedupe.index:Removing stop word ll
INFO:dedupe.index:Removing stop word yi
INFO:dedupe.index:Removing stop word te


In [17]:
# Ids of every cleaned spreadsheet ("included") record.
all_included_cids = set(rec['id'] for rec in included_records)

# Collect ids from clusters that span both populations: at least one existing
# citation (id below min_new_citation_id) and at least one spreadsheet record.
matched_cids = set()
matched_included_cids = set()
for member_ids, _member_scores in clustered_dupes:
    existing_ids = [cid for cid in member_ids if cid < min_new_citation_id]
    spreadsheet_ids = [cid for cid in member_ids if cid >= min_new_citation_id]
    if not (existing_ids and spreadsheet_ids):
        # Cluster lies entirely within one population; nothing to record.
        continue
    matched_cids.update(existing_ids)
    matched_included_cids.update(spreadsheet_ids)

print('# included records =', len(all_included_cids))
print('# matched included records =', len(matched_included_cids))

# included records = 797
# matched included records = 717


In [26]:
# Print example clusters that pair exactly one existing citation with one
# spreadsheet record, so the quality of the matches can be checked by eye.
# The limit counts pairs actually displayed (not clusters inspected), so an
# example is still shown when the first cluster is not such a pair.
max_examples = 1  # raise to inspect more pairs
num_shown = 0
for cluster_ids, cluster_scores in clustered_dupes:
    if num_shown >= max_examples:
        break

    is_cross_pair = (
        len(cluster_ids) == 2 and
        any(cid < min_new_citation_id for cid in cluster_ids) and
        any(cid >= min_new_citation_id for cid in cluster_ids))
    if not is_cross_pair:
        continue

    print('\n')
    print('-' * 52)
    print('ids:', cluster_ids, 'scores:', cluster_scores)
    pprint(all_records[cluster_ids[0]])
    print()
    pprint(all_records[cluster_ids[1]])
    num_shown += 1



----------------------------------------------------
ids: (12288, 29299) scores: (0.99998337, 0.99998337)
{'abstract': 'Forest policies that devolve forest-use rights to local people '
             'have undergone development over the past few years in Laos. As '
             'collaboration between local people and forestry officials is '
             'seen as indispensable to effective and sustainable local forest '
             'management, the objective of this study is to clarify the issues '
             'pertinent to the resolution of latent conflict between these two '
             'stakeholders. The issues are examined by presenting two case '
             'studies in terms of forest management as perceived by local '
             'people and forestry officials; the first in a rich forest area '
             'and the second in a degraded forest. Issues relating to land and '
             'borders and social capital are identified as the most important '
             'in the d

In [31]:
# Label every database citation: 'included' when it was clustered with a
# spreadsheet record (its id is in matched_cids), 'excluded' otherwise.
known_statuses = []
for citation in records:
    if citation['id'] in matched_cids:
        status = 'included'
    else:
        status = 'excluded'
    known_statuses.append({'id': citation['id'], 'status': status})
known_statuses[:10]

[{'id': 1, 'status': 'excluded'},
 {'id': 2, 'status': 'excluded'},
 {'id': 3, 'status': 'excluded'},
 {'id': 4, 'status': 'excluded'},
 {'id': 5, 'status': 'excluded'},
 {'id': 6, 'status': 'excluded'},
 {'id': 7, 'status': 'excluded'},
 {'id': 8, 'status': 'excluded'},
 {'id': 9, 'status': 'excluded'},
 {'id': 10, 'status': 'excluded'}]

In [32]:
# Persist the id/status labels as pretty-printed JSON.
filepath = '../colandr_data/citations/known_statuses.json'
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of being left to the platform's locale default.
with io.open(filepath, mode='wt', encoding='utf-8') as f:
    json.dump(known_statuses, f, ensure_ascii=False, indent=2)