1. Ingest citation data via uploaded RIS or BibTeX files, or via manually entered individual records
1. Parse citation data according to input format and standardize (across formats) field names and values as much as possible
1. Munge parsed data into a convenient format for importing into a database, e.g. CSV -> Postgres (see [here](https://www.postgresql.org/docs/9.5/static/sql-copy.html))
1. Import citations into database with additional columns for, e.g. citation_id, project_id, user_id, is_duplicate (NULL to start), confirmed_duplicate, ...
1. Apply the trained dedupe model to new citations vs. existing citations for the given project to find possible matches; interactively prompt the user to confirm duplicates when in doubt, and mark duplicate records in the db accordingly

In [1]:
import io
import logging
import os

import dedupe

import cipy

In [2]:
# Install logging's default handler on the root logger, then keep a handle to
# that root logger for use in the rest of the notebook.
logging.basicConfig()
logger = logging.getLogger()

In [3]:
# Fetch the DDL registered under 'citations' and the connection credentials
# looked up via 'DATABASE_URL' (presumably an environment variable -- confirm
# in cipy.db), then build the PostgresDB handle that the citation queries in
# this notebook run against.
citations_ddl = cipy.db.get_ddl('citations')
conn_creds = cipy.db.get_conn_creds('DATABASE_URL')
citations_db = cipy.db.PostgresDB(conn_creds, citations_ddl)

In [4]:
# Project whose citations are processed in this notebook; bound to the
# %(project_id)s parameter of the SQL queries.
project_id = 0

In [5]:
# Build the deduper from the settings file under ../models (presumably the
# previously trained dedupe model -- confirm in cipy.db.get_deduper).
deduper = cipy.db.get_deduper('../models/dedupe_citations_settings')

In [6]:
# Estimate the duplicate-score threshold from a random sample of at most
# 10,000 of this project's citations.
query = """
SELECT citation_id, authors, title, abstract, publication_year, doi
FROM citations
WHERE project_id = %(project_id)s
ORDER BY random()
LIMIT 10000
"""

results = citations_db.run_query(query, {'project_id': project_id})

# The sample is handed to the deduper as a {citation_id: record} mapping.
sample_records = {
    row['citation_id']: cipy.db.db_utils.make_immutable(row)
    for row in results
}

# Per the dedupe docs, recall_weight sets the precision/recall trade-off;
# 0.5 means recall matters half as much as precision.
dupe_threshold = deduper.threshold(sample_records, recall_weight=0.5)
dupe_threshold

0.82794285

In [7]:
# dupe_confirm_threshold = 0.75 * dupe_threshold
# dupe_confirm_threshold

In [34]:
list(citations_db.run_query('SELECT citation_id FROM duplicates'))

[{'citation_id': 1028},
 {'citation_id': 1029},
 {'citation_id': 3078},
 {'citation_id': 3079},
 {'citation_id': 1037},
 {'citation_id': 1038},
 {'citation_id': 14},
 {'citation_id': 15},
 {'citation_id': 3096},
 {'citation_id': 3095},
 {'citation_id': 688},
 {'citation_id': 687},
 {'citation_id': 15392},
 {'citation_id': 15393},
 {'citation_id': 4130},
 {'citation_id': 4131},
 {'citation_id': 42},
 {'citation_id': 43},
 {'citation_id': 3120},
 {'citation_id': 8088},
 {'citation_id': 3123},
 {'citation_id': 3124},
 {'citation_id': 59},
 {'citation_id': 53},
 {'citation_id': 6706},
 {'citation_id': 5175},
 {'citation_id': 10296},
 {'citation_id': 10297},
 {'citation_id': 10298},
 {'citation_id': 1081},
 {'citation_id': 1082},
 {'citation_id': 4154},
 {'citation_id': 4155},
 {'citation_id': 2107},
 {'citation_id': 12731},
 {'citation_id': 3137},
 {'citation_id': 3138},
 {'citation_id': 5187},
 {'citation_id': 5188},
 {'citation_id': 1092},
 {'citation_id': 1093},
 {'citation_id': 1097},


In [31]:
def get_candidate_dupes(citations_db, project_id):
    """
    Stream a project's citations grouped into blocks of candidate duplicates.

    Citations are joined to ``dedupe_smaller_coverage`` and read in ``block_id``
    order; citations already listed in ``duplicates`` for the project are left
    out. Consecutive rows that share a ``block_id`` are collected into one list.

    Args:
        citations_db (cipy.db.PostgresDB): only its ``run_query(query, params)``
            method is used here
        project_id (int)

    Yields:
        list[tuple]: one non-empty list per block; each item is
            ``(citation_id, record, smaller_ids)`` where ``record`` is the row
            passed through ``cipy.db.make_immutable`` and ``smaller_ids`` is
            the set of strings obtained by splitting the row's comma-separated
            ``smaller_ids`` value (an empty set when that value is empty)
    """
    query = """
    SELECT
        t1.citation_id, t1.authors, t1.title, t1.abstract, t1.publication_year, t1.doi,
        t2.block_id, t2.smaller_ids
    FROM
        citations AS t1,
        dedupe_smaller_coverage AS t2
    WHERE
        project_id = %(project_id)s
        AND t1.citation_id = t2.citation_id
        AND t1.citation_id NOT IN (SELECT citation_id
                                   FROM duplicates
                                   WHERE project_id = %(project_id)s)
    ORDER BY t2.block_id
    """
    rows = citations_db.run_query(query, {'project_id': project_id})

    current_block_id = None
    block = []
    for row in rows:
        # Rows arrive sorted by block_id, so a changed id means the previous
        # block is complete and can be emitted.
        if row['block_id'] != current_block_id:
            if block:
                yield block
            current_block_id = row['block_id']
            block = []

        raw_smaller_ids = row['smaller_ids']
        smaller_ids = set(raw_smaller_ids.split(',')) if raw_smaller_ids else set()

        block.append(
            (row['citation_id'], cipy.db.make_immutable(row), smaller_ids))

    # Emit the final block, which no later row could have flushed.
    if block:
        yield block

In [32]:
# Peek at the first block of candidate duplicates for the current project.
# Uses `project_id` rather than a hard-coded 0 so this cell stays in step with
# the rest of the notebook. `records` stays bound after the loop so it can be
# inspected in later cells.
for records in get_candidate_dupes(citations_db, project_id):
    print(records)
    break

[(17429, {'block_id': 1, 'smaller_ids': '', 'title': 'Building financially sustainable incentives for environmental conservation into small enterprise development', 'publication_year': 2002, 'doi': None, 'authors': ('Millard, E',), 'citation_id': 17429, 'abstract': 'The countries facing the largest challenges of unemployment, population growth and poverty are also those that contain the most important biological diversity. Many economic growth policies threaten the environment, through exploitation of natural resources and large industrial development that earns foreign exchange. These in turn increase poverty, as deforestation reduces soil quality, over-harvesting the land and sea reduces food security and people lose access to traditional resources. The challenge is to identify approaches to development that conserve the environment. Recent initiatives demonstrate that incentives can be created for small enterprises to invest in conservation in a financially sustainable way.'}, set()

In [10]:
records[0]

(17429,
 {'abstract': 'The countries facing the largest challenges of unemployment, population growth and poverty are also those that contain the most important biological diversity. Many economic growth policies threaten the environment, through exploitation of natural resources and large industrial development that earns foreign exchange. These in turn increase poverty, as deforestation reduces soil quality, over-harvesting the land and sea reduces food security and people lose access to traditional resources. The challenge is to identify approaches to development that conserve the environment. Recent initiatives demonstrate that incentives can be created for small enterprises to invest in conservation in a financially sustainable way.',
  'authors': ('Millard, E',),
  'block_id': 1,
  'citation_id': 17429,
  'doi': None,
  'publication_year': 2002,
  'smaller_ids': '',
  'title': 'Building financially sustainable incentives for environmental conservation into small enterprise develo

In [35]:
# Cluster the blocked candidates, keeping matches that score at or above the
# estimated threshold. Uses `project_id` rather than a hard-coded 0 so the
# clustering covers the same project as the other queries in this notebook.
clustered_dupes = deduper.matchBlocks(get_candidate_dupes(citations_db, project_id),
                                      threshold=dupe_threshold)

In [12]:
# Print the clusters whose first two member scores differ.
# NOTE(review): only scores[0] and scores[1] are compared, so a cluster whose
# scores look like [a, a, b] is not printed -- confirm that is intended.
for cluster, scores in clustered_dupes:
    if scores[0] != scores[1]:
        print(cluster, scores)
# Earlier exploration, kept for reference: print every (citation_id, score) pair.
#     cluster_id = cluster[0]
#     for citation_id, score in zip(cluster, scores):
#         print(citation_id, score)

(10296, 10297, 10298) [ 0.92679411  0.92115211  0.92679411]
(5815, 5816, 5817) [ 0.99997786  0.99997643  0.99997753]
(2596, 2597, 2599) [ 0.9320623   0.96693608  0.96510524]


In [36]:
len(clustered_dupes)

0

In [14]:
from operator import itemgetter
import numpy as np

In [15]:
# For a given set of citation ids (formatted into the IN (...) placeholder),
# return the id of the citation with the fewest NULL columns among the fifteen
# listed; that citation is used as the cluster's canonical record.
query = """
SELECT
    citation_id,
    ((CASE WHEN authors IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN title IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN abstract IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN publication_year IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN doi IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN type_of_work IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN publication_month IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN keywords IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN journal_name IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN type_of_reference IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN volume IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN issue_number IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN issn IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN publisher IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN language IS NULL THEN 1 ELSE 0 END)) AS n_null_cols
FROM citations
WHERE
    project_id = %(project_id)s
    AND citation_id IN ({})
ORDER BY n_null_cols ASC
LIMIT 1
"""

# One tuple per clustered citation:
# (citation_id, project_id, canonical_citation_id, duplicate_score, False, None)
insert_values = []

for cids, scores in clustered_dupes:
    score_by_citation_id = dict(zip(cids, scores))

    # The ids come from the clustering step, not from user input, so they are
    # formatted straight into the SQL text.
    cluster_id_list = ','.join(str(cid) for cid in cids)
    canonical_rows = tuple(
        citations_db.run_query(query.format(cluster_id_list),
                               {'project_id': project_id}))
    canonical_citation_id = canonical_rows[0]['citation_id']

    insert_values.extend(
        (citation_id, project_id, canonical_citation_id, duplicate_score, False, None)
        for citation_id, duplicate_score in score_by_citation_id.items())

In [28]:
import csv
import tempfile

# Dump one CSV row per clustered citation so the rows can be bulk-loaded with
# COPY. Row layout: citation_id, project_id, canonical_citation_id,
# duplicate_score, False, None (presumably matching the column order of the
# duplicates table -- confirm against its DDL).
#
# delete=False keeps the file on disk after it is closed so it can be reopened
# by name. newline='' is what the csv module asks for on files given to
# csv.writer, so its line terminators are not translated a second time. The
# `with` block closes the file even if a query raises part-way through.
with tempfile.NamedTemporaryFile(
        prefix='duplicates_', delete=False, mode='wt', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    for cids, scores in clustered_dupes:
        citation_duplicate_scores = dict(zip(cids, scores))

        # `query` is expected to be the canonical-citation query defined
        # earlier in the notebook; its first row names the cluster's
        # canonical citation.
        canonical_citation = citations_db.run_query(
            query.format(','.join(str(cid) for cid in cids)),
            {'project_id': project_id})
        canonical_citation_id = tuple(canonical_citation)[0]['citation_id']

        for citation_id, duplicate_score in citation_duplicate_scores.items():
            csv_writer.writerow(
                (citation_id, project_id, canonical_citation_id, duplicate_score, False, None))

In [29]:
import psycopg2

In [30]:
# Bulk-load the temporary CSV into the duplicates table with COPY ... FROM STDIN.
# NOTE(review): no explicit commit is issued here -- presumably the connection
# autocommits; confirm in cipy.db.PostgresDB.
try:
    with io.open(csv_file.name, mode='rt') as f, \
            duplicates_db.conn.cursor() as cur:
        cur.copy_expert('COPY duplicates FROM STDIN CSV', f)
except psycopg2.DataError as e:
    # TODO: report through the logger (logger.exception) instead of printing
    print(e)

In [27]:
# The temporary CSV was created with delete=False, so it has to be removed by hand.
os.remove(csv_file.name)

In [16]:
len(insert_values)

726

In [17]:
insert_values[0]

(1028, 0, 1028, 0.99991339, False)

In [18]:
# Fetch the DDL registered under 'duplicates' and build a second PostgresDB
# handle for that table, reusing the same connection credentials as citations_db.
duplicates_ddl = cipy.db.get_ddl('duplicates')
duplicates_db = cipy.db.PostgresDB(conn_creds, duplicates_ddl)

In [19]:
# Presumably creates the duplicates table from the DDL the handle was built
# with -- confirm in cipy.db.PostgresDB.create_table.
duplicates_db.create_table()

In [None]:
# NOTE(review): unexecuted scratch cell. As written it uses `cur` and `f`
# before the `with` block below binds a cursor, rebinds `query` (which other
# cells use to hold SQL text), targets a table named 'test', and calls
# copy_from() without the file/table arguments psycopg2 documents for it.
# Looks like the start of a copy_from experiment -- finish it or delete it.
query = cur.copy_from(f, 'test', columns=('num', 'data'))

with duplicates_db.conn.cursor() as cur:
    cur.copy_from()

In [38]:
import json

# Load the dedupe training data.
# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# original author's machine.
training_path = '/Users/burtondewilde/Desktop/datakind/ci/conservation-intl/models/dedupe_citations_training.json'
with open(training_path) as f:
    training = json.load(f)

In [49]:
# Collect the citation_id of every record in the 'distinct' training pairs,
# then print how many ids there are in total and how many are unique.
distinct_records = (record
                    for pair in training['distinct']
                    for record in pair['__value__'])
cids = [record['citation_id'] for record in distinct_records]
n_total, n_unique = len(cids), len(set(cids))
print(n_total)
print(n_unique)

42
38


In [40]:
# Load the "bad" training file for comparison with the current one.
# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# original author's machine.
bad_training_path = '/Users/burtondewilde/Desktop/bad_dedupe_citations_training.json'
with open(bad_training_path) as f:
    bad_training = json.load(f)

In [56]:
training['distinct'][0]['__value__'][0].keys()

dict_keys(['citation_id', 'title', 'publication_year', 'doi', 'abstract', 'authors'])

In [55]:
bad_training['distinct'][0]['__value__'][0].keys()

dict_keys(['publication_year', 'doi', 'abstract', 'title', 'authors'])

In [57]:
# Same count as for the current training data, but using .get() so that
# records without a 'citation_id' key contribute None instead of raising.
bad_distinct_records = (record
                        for pair in bad_training['distinct']
                        for record in pair['__value__'])
cids = [record.get('citation_id') for record in bad_distinct_records]
n_total, n_unique = len(cids), len(set(cids))
print(n_total)
print(n_unique)

52
11


In [58]:
cids

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 11943,
 11944,
 10597,
 25946,
 449,
 2854,
 13127,
 20424,
 13464,
 13465]