1. Ingest citation data via uploaded RIS or BibTeX files, or via manually entered individual records
1. Parse citation data according to input format and standardize (across formats) field names and values as much as possible
1. Munge parsed data into a convenient format for importing into a database, e.g. CSV -> Postgres (see [here](https://www.postgresql.org/docs/9.5/static/sql-copy.html))
1. Import citations into database with additional columns for, e.g. citation_id, project_id, user_id, is_duplicate (NULL to start), confirmed_duplicate, ...
1. Apply the trained dedupe model to new citations vs. existing citations for the given project to find possible matches; interactively prompt the user to confirm duplicates when in doubt, and mark duplicate records in the db accordingly

In [11]:
import io
import logging
import os

import dedupe

import cipy

In [12]:
# getLogger() with no name returns the root logger; basicConfig() with no
# arguments attaches a default stream handler to it if none is configured yet.
logger = logging.getLogger()
logging.basicConfig()

In [13]:
# Build the citations database handle: fetch the DDL registered under
# 'citations', resolve connection credentials from 'DATABASE_URL'
# (presumably an environment variable -- confirm in cipy.db), and pass both
# to the Postgres wrapper.
citations_ddl = cipy.db.get_ddl('citations')
conn_creds = cipy.db.get_conn_creds('DATABASE_URL')
citations_db = cipy.db.PostgresDB(conn_creds, citations_ddl)

In [14]:
# Load the deduper from the settings file at this relative path
# (presumably a previously trained dedupe model -- confirm in cipy.db.get_deduper).
deduper = cipy.db.get_deduper('../models/dedupe_citations_settings')

In [15]:
# Fetch the fields used for duplicate matching for every citation in project 0.
query = """
SELECT citation_id, authors, title, abstract, publication_year, doi
FROM citations
WHERE project_id = %(project_id)s
"""

results = citations_db.run_query(query, {'project_id': 0})

# Ask the deduper for a score threshold over these records, keyed by
# citation_id. Each row is passed through make_immutable first (presumably so
# the record is hashable for dedupe -- confirm in cipy.db.db_utils).
# NOTE(review): per the dedupe docs, recall_weight sets the recall/precision
# trade-off, so 0.5 would favour precision -- confirm this is intended.
dupe_threshold = deduper.threshold({row['citation_id']: cipy.db.db_utils.make_immutable(row)
                                    for row in results},
                                   recall_weight=0.5)
dupe_threshold

0.80366647

In [16]:
import time

def generate_candidates(results):
    """Yield one list of candidate records per run of rows sharing a block_id.

    ``results`` is an iterable of mapping-like rows (they must support both
    ``row[key]`` and ``row.get(key)``). Rows are grouped only while their
    ``block_id`` stays the same from one row to the next, so the input has
    to arrive ordered by ``block_id`` for a block to come out in one piece.

    Each yielded list holds ``(citation_id, fields, smaller_ids)`` tuples:

    * ``fields`` is a dict with ``authors`` (always a tuple, empty when the
      row has no authors), ``title``, ``abstract``, ``publication_year`` and
      ``doi``;
    * ``smaller_ids`` is the set of strings obtained by splitting the row's
      comma-separated ``smaller_ids`` value, or an empty set when that value
      is falsy.

    A progress line (block count and elapsed seconds) is printed every
    10000 blocks.
    """
    started_at = time.time()

    current_block = None
    block_records = []
    n_blocks = 0

    for row in results:
        row_block = row['block_id']
        if row_block != current_block:
            # A new block begins: hand over the finished one before resetting.
            if block_records:
                yield block_records

            current_block = row_block
            block_records = []
            n_blocks += 1

            if n_blocks % 10000 == 0:
                print(n_blocks, "blocks")
                print(time.time() - started_at, "seconds")

        raw_smaller = row['smaller_ids']
        smaller = set(raw_smaller.split(',')) if raw_smaller else set()

        citation_id = row['citation_id']
        authors = row['authors']
        fields = {
            'authors': tuple(authors) if authors else (),
            'title': row.get('title'),
            'abstract': row.get('abstract'),
            'publication_year': row.get('publication_year'),
            'doi': row.get('doi'),
        }
        block_records.append((citation_id, fields, smaller))

    # Flush the final block, which no later row will trigger.
    if block_records:
        yield block_records

In [17]:
# Pull every blocked citation for project 0 together with its block_id and
# smaller_ids. The ORDER BY matters: generate_candidates() starts a new block
# whenever block_id changes, so the rows of one block must arrive consecutively.
query = """
SELECT
    citation_id, authors, title, abstract,
    publication_year, doi, block_id, smaller_ids
FROM smaller_coverage
INNER JOIN citations
USING (citation_id)
WHERE project_id = %(project_id)s
ORDER BY block_id
"""

results = citations_db.run_query(query, {'project_id': 0})

In [18]:
# for records in generate_candidates(results):
#     print(records)
#     break

In [19]:
# Feed the per-block candidate lists to the deduper, using the threshold
# computed earlier. The result is later iterated as (cluster, scores) pairs.
clustered_dupes = deduper.matchBlocks(generate_candidates(results), threshold=dupe_threshold)

10000 blocks
0.9351649284362793 seconds
20000 blocks
1.4290659427642822 seconds


In [20]:
# Peek at the first cluster only: print each member's citation_id and score.
for cluster, scores in clustered_dupes:
    cluster_id = cluster[0]  # first id in the cluster; not used again in this cell
    for citation_id, score in zip(cluster, scores):
        print(citation_id, score)
    break

1 1.0
1001 1.0
4011 1.0
5011 1.0


In [21]:
len(clustered_dupes)

11151

In [24]:
from operator import itemgetter
import numpy as np

In [35]:
# For a cluster of duplicate citations, choose the "canonical" record: the one
# with the fewest NULLs across the 15 metadata columns counted in the query.
# Every other member of the cluster is treated as its duplicate.
query = """
SELECT
    citation_id,
    ((CASE WHEN authors IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN title IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN abstract IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN publication_year IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN doi IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN type_of_work IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN publication_month IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN keywords IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN journal_name IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN type_of_reference IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN volume IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN issue_number IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN issn IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN publisher IS NULL THEN 1 ELSE 0 END)
    + (CASE WHEN language IS NULL THEN 1 ELSE 0 END)) AS n_null_vols
FROM citations
WHERE
    project_id = %(project_id)s
    AND citation_id IN ({})
"""

for cluster, scores in clustered_dupes:
    # NOTE(review): the ids are interpolated into the SQL text with
    # str.format(). They come from the dedupe clusters rather than user
    # input, but binding them as a query parameter would be safer if
    # run_query supports sequence parameters -- confirm.
    citation_ids = ','.join(str(c) for c in cluster)
    citations = citations_db.run_query(query.format(citation_ids), {'project_id': 0})
    # min() returns the first row with the lowest NULL count, which is the
    # same row sorted(...)[0] would pick, without sorting the whole list.
    # It raises ValueError if the query returns no rows.
    canonical_id = min(citations, key=itemgetter('n_null_vols'))['citation_id']
    dupe_citation_ids = tuple(cid for cid in cluster if cid != canonical_id)

    # Exploratory: stop after the first cluster.
    break

In [36]:
canonical_id

4011

In [37]:
dupe_citation_ids

(1, 1001, 5011)

In [27]:
cluster

(1, 1001, 4011, 5011)

In [30]:
citations

[{'citation_id': 1, 'n_null_vols': 5},
 {'citation_id': 1001, 'n_null_vols': 5},
 {'citation_id': 4011, 'n_null_vols': 3},
 {'citation_id': 5011, 'n_null_vols': 6}]