In [22]:
import json
import logging
import os
import re

import numpy as np
import pandas as pd
import textacy

import cipy

In [2]:
# Turn on DEBUG-level output for the 'cipy' logger so that messages emitted
# under that logger name are not filtered out by level.
logger = logging.getLogger(name='cipy')
logger.setLevel(level=logging.DEBUG)

In [3]:
# Look up connection credentials under the 'DATABASE_URL' key
# (presumably an environment variable -- confirm in cipy.db), then open a
# handle on the database using the 'citations' DDL.
conn_creds = cipy.db.get_conn_creds('DATABASE_URL')
pgdb = cipy.db.PostgresDB(
    conn_creds,
    ddl='citations',
)

In [113]:
# Fetch every citation that is either absent from the `duplicates` table or is
# listed there as a canonical citation, i.e. skip rows that are only known as
# non-canonical duplicates.
query = """
SELECT citation_id, authors, title, abstract, publication_year, doi
FROM citations
WHERE
    (citation_id NOT IN (SELECT citation_id FROM duplicates)
    OR citation_id IN (SELECT canonical_citation_id FROM duplicates))
"""
# Each returned row is passed through cipy.db.make_immutable before being stored.
records = list(map(cipy.db.make_immutable, pgdb.run_query(query)))
print(len(records))
records[0]

28343


{'abstract': "Broomrapes (Orobanche spp.) are aggressive and damaging parasitic weeds which have a tremendous impact on agriculture in East Africa, the Mediterranean region and the Middle East. Despite the availability of technologies to control broomrapes in economically important crops, Orobanche infestation continues to increase, threatening the livelihoods of millions of farmers. Many of the technologies developed have not been effectively disseminated and there has been little or zero adoption by farmers-who continue to use ineffective management practices that exacerbate the problem. The adaptation and dissemination of appropriate management practices are major priorities in broomrape control. However, such work must take into consideration the specific socio-economic characteristics of individual farming systems. Orobanche is a community threat and effective management requires a community-based integrated management approach. Recognizing the central role of farmers in parasitic

In [64]:
# Spreadsheet of combined search results with reviewer screening columns
# ('Included', 'Excluded - ...', etc.).
# The path is anchored at the current user's home directory instead of one
# specific account, so the notebook also runs for anyone who keeps the project
# under ~/Desktop/datakind/ci/conservation-intl.
filepath = os.path.join(
    os.path.expanduser('~'),
    'Desktop', 'datakind', 'ci', 'conservation-intl', 'data', 'raw',
    'Combined Search_Results_Top_3.xls')
df = pd.read_excel(filepath)
df.head(3)

Unnamed: 0,Author,Year,Title,Abstract,Included,Abstract review needed,Unresolved,Excluded non-conservation,Excluded non nature-based intervention,Excluded - no conservation intervention,Excluded - other,Excluded - no outcome,Excluded - biophysical outcome,Excluded - other.1,Excluded - study design,Excluded Subject,Comments,Journal
0,,1977.0,The dietitian in primary health care,This statement was prepared by The American Di...,,,,,,,,,,,,,,Journal of the American Dietetic Association
1,,1986.0,Life-styles and health,A new perspective is needed on lifestyles and ...,,,,,,,,,,,,,,Social Science and Medicine
2,,1991.0,Supportive environments for health: The Sundsv...,The Sundsvall Conference on supportive environ...,,,,,,,,,,,,,,Health Promotion International


In [65]:
# Rename the spreadsheet's bibliographic headers to the column names used by
# the `citations` query (authors, title, abstract, publication_year).
column_renames = {
    'Author': 'authors',
    'Title': 'title',
    'Abstract': 'abstract',
    'Year': 'publication_year',
}
df = df.rename(columns=column_renames)
df.head(3)

Unnamed: 0,authors,publication_year,title,abstract,Included,Abstract review needed,Unresolved,Excluded non-conservation,Excluded non nature-based intervention,Excluded - no conservation intervention,Excluded - other,Excluded - no outcome,Excluded - biophysical outcome,Excluded - other.1,Excluded - study design,Excluded Subject,Comments,Journal
0,,1977.0,The dietitian in primary health care,This statement was prepared by The American Di...,,,,,,,,,,,,,,Journal of the American Dietetic Association
1,,1986.0,Life-styles and health,A new perspective is needed on lifestyles and ...,,,,,,,,,,,,,,Social Science and Medicine
2,,1991.0,Supportive environments for health: The Sundsv...,The Sundsvall Conference on supportive environ...,,,,,,,,,,,,,,Health Promotion International


In [67]:
# First citation_id that is safe to hand out to newly added records.
#
# This is taken from the `citations` table itself rather than from `records`:
# `records` was loaded with a filter that leaves out non-canonical duplicates,
# so its largest id can be smaller than the largest id actually stored, and
# ids derived from it could collide with an existing (excluded) row.
#
# NOTE(review): assumes pgdb.run_query yields rows that can be indexed by
# column name, as the rows behind `records` are after make_immutable -- confirm.
max_id_query = "SELECT MAX(citation_id) AS max_citation_id FROM citations"
max_existing_citation_id = max(
    row['max_citation_id'] for row in pgdb.run_query(max_id_query))
min_new_citation_id = max_existing_citation_id + 1
min_new_citation_id

28710

In [103]:
def _clean_text(text):
    """Return ``text`` run through textacy's preprocessing, or None if it is missing/empty."""
    return textacy.preprocess_text(text, fix_unicode=True) if text else None


def _clean_authors(authors):
    """Turn a ';'-delimited author string into a sorted list of cleaned names.

    Each name is preprocessed with textacy, and a period that directly follows
    a word character and precedes a space or the end of the string is removed
    (e.g. the dot after an initial). Entries that end up empty -- which happens
    when the raw string has a trailing or doubled ';' -- are dropped instead of
    being kept as '' authors.

    Returns None when ``authors`` is missing/empty or contains no usable names.
    """
    if not authors:
        return None
    names = []
    for author in re.split(r'\s*;\s*', authors):
        name = re.sub(r'(?<=\w)\.(?=( |$))', '',
                      textacy.preprocess_text(author, fix_unicode=True))
        if name.strip():
            names.append(name)
    return sorted(names) or None


# Keep only the rows a reviewer marked as included, restricted to the
# bibliographic columns.
included_rows = df.loc[df['Included'].notnull(),
                       ['authors', 'title', 'abstract', 'publication_year']]
# Round-trip through JSON to get plain dicts; pandas serializes missing values
# (NaN) as null, so they arrive here as None.
raw_included_records = json.loads(included_rows.to_json(orient='records'))

# clean em up
cleaned_records = [
    {
        # new ids continue on from the ids already in use
        'citation_id': min_new_citation_id + offset,
        # years come out of the spreadsheet as floats (e.g. 1977.0)
        'publication_year': (int(raw['publication_year'])
                             if raw['publication_year'] else None),
        'authors': _clean_authors(raw.get('authors')),
        'abstract': _clean_text(raw.get('abstract')),
        'title': _clean_text(raw.get('title')),
        # no 'doi' column is selected above, so this is None for every record
        'doi': raw.get('doi') or None,
    }
    for offset, raw in enumerate(raw_included_records)
]

included_records = [cipy.db.make_immutable(record) for record in cleaned_records]

included_records[:1]

[{'abstract': "This article examines the recent convergence of community-based and transboundary natural resource management in Africa. We suggest that both approaches have potential application to common-pool resources such as floodplain fisheries. However, a merging of transboundary and community-based management may reinforce oversimplifications about heterogeneity in resources, users, and institutions. A scalar mismatch between the ecosystem of concern in transboundary management and local resources of concern in community-based management, as well as different colonial and post-colonial histories contribute to this heterogeneity. We describe a fishery shared by Namibia and Zambia in terms of hybrid fisheries management. We examine settlement patterns, fishermen characteristics, sources of conflict, and perceptions regarding present and potential forms of fisheries management in the area. We also consider the implications that initiatives to manage resources on the local and ecosys

In [114]:
# Pool the citations loaded from the database with the newly cleaned,
# reviewer-included ones (database records first).
all_records = list(records)
all_records.extend(included_records)

In [126]:
# load deduper
# The settings file is located relative to the current user's home directory
# instead of one specific account's absolute path, so the notebook also runs
# for anyone who keeps the project under ~/Desktop/datakind/ci/conservation-intl.
deduper_path = os.path.join(
    os.path.expanduser('~'),
    'Desktop', 'datakind', 'ci', 'conservation-intl', 'models',
    'dedupe_citations_settings')
# Passed to deduper.match() below; presumably the minimum match score for two
# records to be clustered together -- confirm against the deduper's docs.
threshold = 0.5

deduper = cipy.db.get_deduper(deduper_path, num_cores=1)

In [None]:
# The deduper is given the records as a mapping keyed by citation_id,
# together with the match threshold.
records_by_citation_id = {r['citation_id']: r for r in all_records}
clustered_dupes = deduper.match(records_by_citation_id, threshold)

In [None]:
# Number of newly added (reviewer-included) records that went into the dedupe run.
len(included_records)

In [None]:
# Number of clusters returned by deduper.match().
len(clustered_dupes)

In [None]:
# Peek at the first cluster only, to see what a match result looks like.
# Nothing is printed when there are no clusters.
_no_cluster = object()
first_cluster = next(iter(clustered_dupes), _no_cluster)
if first_cluster is not _no_cluster:
    print(first_cluster)