In [26]:
from __future__ import absolute_import, division, print_function, unicode_literals

from collections import Counter, defaultdict
import logging
import os

import yaml

import cipy

In [2]:
logger = logging.getLogger('cipy')
logger.setLevel(logging.DEBUG)

---

## Read and Parse Citation Files

In [3]:
data_path = '/Users/burtondewilde/Desktop/datakind/ci/conservation-intl/data/raw/citation_files/dedupe_tests/'

In [4]:
# RIS FORMAT

ris_key_counts = defaultdict(lambda: defaultdict(int))
unique_ris_keys = Counter()

for fname in os.listdir(data_path):
    if fname.endswith('.ris') or fname.endswith('.txt'):
        print(fname)
        ris = cipy.parsers.RisFile(os.path.join(data_path, fname))
        unique_ris_keys.update(key
                               for record in ris.parse()
                               for key in record.keys())
        for record in ris.parse():
            for key, value in record.items():
                try:
                    ris_key_counts[key][value] += 1
                except TypeError:
                    print(key, value)
                    break

scopus0.ris
scopus1.ris

ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Perth, Aust
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Washington, DC, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Nagoya, Jpn
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Pittsburgh, PA, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Madison, WI, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Boston, MA, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Perth, Aust
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Washington, DC, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Nagoya, Jpn
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Pittsburgh, PA, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Madison, WI, USA
ERROR:cipy.parsers.ris:duplicate key erro


scopus2.ris
scopus3.ris
WoS0.txt
Wos1.txt
WoS2.txt
WoS3.txt
WoS4.txt
WoS5.txt
WoS6.txt
WoS7.txt
WoS8.txt

DEBUG:cipy.parsers.ris:unknown tag: tag=D2, line=9604 "D2 10.1007/978-3-540-73349-2"
DEBUG:cipy.parsers.ris:unknown tag: tag=D2, line=9604 "D2 10.1007/978-3-540-73349-2"





In [5]:
# BIBTEX FORMAT

bib_key_counts = defaultdict(lambda: defaultdict(int))
unique_bib_keys = Counter()
for fname in os.listdir(data_path):
    if fname.endswith('.bib'):
        print(fname)
        
        bib = cipy.parsers.BibTexFile(os.path.join(data_path, fname))
        unique_bib_keys.update(key
                               for record in bib.parse()
                               for key in record.keys())
        for record in bib.parse():
            for key, value in record.items():
                try:
                    bib_key_counts[key][value] += 1
                except TypeError:
                    print(key, value)
                    break

scopus0.bib
scopus1.bib

DEBUG:cipy.parsers.bibtex:unusual "pages" field value: i-ii+S1-S266
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: b-141-64
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: 066133-1-066133-5
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: 181-190,193-199,203-241
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: i-ii+S1-S266
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: b-141-64
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: 066133-1-066133-5
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: 181-190,193-199,203-241



scopus2.bib
scopus3.bib
WoS0.bib
WoS1.bib
WoS2.bib
WoS3.bib
WoS4.bib
WoS5.bib
WoS6.bib
WoS7.bib
WoS8.bib


In [6]:
print(sorted(ris_key_counts.keys()))

['D2', 'abstract', 'access_date', 'article_number', 'author_addresses', 'author_keywords', 'authors', 'caption', 'conference_host', 'conference_location', 'custom_1', 'custom_2', 'custom_3', 'custom_7', 'document_delivery_number', 'document_type', 'doi', 'electronic_intl_issn', 'email_address', 'end_page', 'funding_agency_and_grants', 'funding_text', 'isbn', 'issn', 'issue_number', 'journal_name', 'journal_name_user_abbr_2', 'keywords', 'language', 'name_of_database', 'notes', 'num_cited_references', 'num_times_cited', 'open_researcher_contributor_id', 'page_count', 'pages', 'part_number', 'place_published', 'publication_month', 'publication_type', 'publication_year', 'publisher', 'publisher_address', 'publisher_city', 'pubmed_id', 'reference_id', 'reprint_status', 'reviewed_item', 'secondary_authors', 'secondary_title', 'section', 'short_title', 'source_abbr_29char', 'source_abbr_iso', 'source_name', 'special_issue', 'start_page', 'subject_categories', 'subject_categories_alt', 'subsi

In [7]:
print(sorted(bib_key_counts.keys()))

['ENTRYTYPE', 'abbrev_source_title', 'abstract', 'affiliation', 'art_number', 'article-number', 'author_keywords', 'authors', 'book-group-author', 'booktitle', 'chemicals_cas', 'coden', 'correspondence_address1', 'doi', 'editor', 'eissn', 'funding_details', 'isbn', 'issn', 'issue_number', 'journal_name', 'keywords', 'language', 'link', 'manufacturers', 'molecular_seqnumbers', 'notes', 'orcid-numbers', 'organization', 'page_count', 'pages', 'publication_month', 'publication_year', 'publisher', 'publisher_address', 'pubmed_id', 'reference_id', 'references', 'researcherid-numbers', 'series', 'source', 'sponsors', 'title', 'tradenames', 'type_of_work', 'unique-id', 'volume']


In [8]:
for key, value in sorted(ris_key_counts.items()):
    try:
        print('{0:.<30} {1}'.format(key, max(len(str(val)) for val in value.keys())))
    except TypeError:
        print('{0:.<30} {1}'.format(key, max(len(str(v)) for val in value.keys() for v in val)))

D2............................ 25
abstract...................... 11689
access_date................... 19
article_number................ 29
author_addresses.............. 31568
author_keywords............... 534
authors....................... 88763
caption....................... 180
conference_host............... 45
conference_location........... 62
custom_1...................... 66902
custom_2...................... 10
custom_3...................... 166
custom_7...................... 15
document_delivery_number...... 5
document_type................. 26
doi........................... 68
electronic_intl_issn.......... 9
email_address................. 113
end_page...................... 19
funding_agency_and_grants..... 1673
funding_text.................. 7031
isbn.......................... 17
issn.......................... 9
issue_number.................. 15
journal_name.................. 105
journal_name_user_abbr_2...... 105
keywords...................... 4330
language...................

In [9]:
for key, value in sorted(bib_key_counts.items()):
    try:
        print('{0:.<30} {1}'.format(key, max(len(str(val)) for val in value.keys())))
    except TypeError:
        print('{0:.<30} {1}'.format(key, max(len(str(v)) for val in value.keys() for v in val)))
    except ValueError:
        print('{0:.<30} ValueError'.format(key))

ENTRYTYPE..................... 13
abbrev_source_title........... 105
abstract...................... 11691
affiliation................... 30922
art_number.................... 15
article-number................ 25
author_keywords............... 526
authors....................... 46917
book-group-author............. 13
booktitle..................... 107
chemicals_cas................. 1551
coden......................... 5
correspondence_address1....... 345
doi........................... 82
editor........................ 68
eissn......................... 9
funding_details............... 1557
isbn.......................... 40
issn.......................... 9
issue_number.................. 15
journal_name.................. 166
keywords...................... 4286
language...................... 19
link.......................... 113
manufacturers................. 350
molecular_seqnumbers.......... 5497
notes......................... 320
orcid-numbers................. 4356
organization............

In [10]:
for key, count in unique_ris_keys.most_common():
    print('{0:.<25} {1:>9}'.format(key, count))

title....................     12010
publication_year.........     12008
issn.....................     12007
authors..................     11986
volume...................     11744
abstract.................     11504
end_page.................     11119
doi......................     10602
issue_number.............      8976
language.................      8491
journal_name.............      8391
notes....................      8000
type_of_reference........      8000
name_of_database.........      8000
secondary_title..........      7999
type_of_work.............      7965
journal_name_user_abbr_2.      7891
author_addresses.........      7796
url......................      7713
keywords.................      7441
pages....................      7304
publisher................      4025
publication_type.........      4010
unique_identifier........      4010
source_name..............      4010
start_page...............      3962
publication_month........      3719
custom_2.................   

In [11]:
for key, count in unique_bib_keys.most_common():
    print('{0:.<25} {1:>9}'.format(key, count))

title....................     12010
reference_id.............     12010
ENTRYTYPE................     12010
publication_year.........     12008
journal_name.............     12000
authors..................     11985
volume...................     11742
issn.....................     11723
abstract.................     11505
pages....................     11141
doi......................     10588
issue_number.............      9040
notes....................      8205
source...................      8000
link.....................      8000
language.................      7991
type_of_work.............      7965
abbrev_source_title......      7891
affiliation..............      7796
correspondence_address1..      7569
references...............      7501
keywords.................      5894
coden....................      5286
author_keywords..........      4654
unique-id................      4010
publication_month........      3733
publisher................      3525
pubmed_id................   

In [12]:
sorted(ris_key_counts['pages'].items(), key=lambda x: x[1], reverse=True)

[('1', 265),
 ('28', 25),
 ('19', 25),
 ('54', 24),
 ('3', 24),
 ('9', 23),
 ('21', 23),
 ('11', 22),
 ('30', 21),
 ('10', 21),
 ('29', 21),
 ('33', 21),
 ('47', 21),
 ('143', 20),
 ('35', 20),
 ('37', 20),
 ('57', 19),
 ('45', 19),
 ('199', 19),
 ('53', 18),
 ('36', 18),
 ('7', 18),
 ('61', 18),
 ('24', 18),
 ('69', 18),
 ('185', 18),
 ('12', 18),
 ('71', 18),
 ('101', 18),
 ('31', 18),
 ('81', 17),
 ('44', 17),
 ('26', 17),
 ('141', 17),
 ('116', 17),
 ('23', 17),
 ('77', 17),
 ('65', 17),
 ('129', 17),
 ('17', 17),
 ('137', 17),
 ('64', 17),
 ('58', 17),
 ('51', 17),
 ('93', 17),
 ('32', 16),
 ('169', 16),
 ('43', 16),
 ('38', 16),
 ('62', 16),
 ('55', 16),
 ('59', 16),
 ('63', 16),
 ('225', 16),
 ('87', 15),
 ('15', 15),
 ('16', 15),
 ('102', 15),
 ('213', 15),
 ('73', 15),
 ('221', 15),
 ('223', 15),
 ('13', 15),
 ('41', 15),
 ('117', 15),
 ('231', 15),
 ('79', 15),
 ('155', 14),
 ('67', 14),
 ('83', 14),
 ('103', 14),
 ('40', 14),
 ('215', 14),
 ('42', 14),
 ('115', 14),
 ('110',

In [13]:
sorted(bib_key_counts['pages'].items(), key=lambda x: x[1], reverse=True)

[('1--9', 30),
 ('1--12', 21),
 ('1--10', 20),
 ('1--8', 16),
 ('1--11', 16),
 ('1--17', 15),
 ('1--13', 14),
 ('1--6', 14),
 ('1--15', 11),
 ('1--21', 11),
 ('1--7', 10),
 ('1--25', 9),
 ('1--14', 8),
 ('1--19', 8),
 ('11--22', 8),
 ('1--20', 8),
 ('37--44', 7),
 ('9--16', 6),
 ('1--29', 6),
 ('54--61', 6),
 ('35--43', 6),
 ('1--16', 6),
 ('1069--1075', 6),
 ('11--20', 6),
 ('7--18', 6),
 ('29--39', 6),
 ('377--384', 6),
 ('1--4', 6),
 ('3--14', 5),
 ('30--39', 5),
 ('31--38', 5),
 ('13--22', 5),
 ('36--43', 5),
 ('73--83', 5),
 ('92--101', 5),
 ('43--54', 5),
 ('62--66', 5),
 ('76--80', 5),
 ('1--18', 5),
 ('242--255', 5),
 ('27--36', 5),
 ('457--460', 5),
 ('26--34', 5),
 ('51--57', 5),
 ('116--128', 4),
 ('1--23', 4),
 ('92--100', 4),
 ('117--127', 4),
 ('197--200', 4),
 ('132--138', 4),
 ('257--266', 4),
 ('3--20', 4),
 ('35--41', 4),
 ('837--853', 4),
 ('377--381', 4),
 ('24--30', 4),
 ('53--60', 4),
 ('69--80', 4),
 ('26--33', 4),
 ('91--106', 4),
 ('39--48', 4),
 ('455--461', 4

In [6]:
record

{'ENTRYTYPE': 'article',
 'abstract': 'The amplitude Delta rho(T,H)/rho and temperature T-M, where the colossal\nmagnetoresistance (CMR) response of L(1-x)Ca(x)MnO(3) manganites are\nmaximum, are found to be controlled by the radius of the lanthanide\n(L(3+)) which modifies the bending of the Mn-O-Mn bond. Increasing the\nbond distortion lowers T-M and enhances Delta rho(T,H)/rho. Enhanced CMR\narises from (1) a shift to lon er temperatures of T-M, (2) a reduced\nmobility of the doping holes, and (3) an increase of the coupling\nbetween itinerant and localized electrons. The resistivity rho(H)\nfollows an approximate to BM(2)(H) law and the parameter B is also tuned\nby the Mn-O-Mn bond angle. The narrowing of the electronic bandwidth is\nthe fundamental parameter controlling the observed CMR.',
 'authors': ('Fontcuberta, J',
  'GarciaMunoz, JL',
  'Martinez, B',
  'Obradors, X',
  'Pinol, S',
  'Seffar, A'),
 'doi': '10.1103/PhysRevLett.76.1122',
 'issn': '0031-9007',
 'issue_number':

---

## Sanitize and Validate Records

In [11]:
def type_sanitizer(value, type_):
    try:
        casted_value = type_(value)
    except (ValueError, TypeError):
        return None
    return casted_value


def sanitize_integer(value, min_value=None, max_value=None):
    value = type_sanitizer(value, float)
    value = type_sanitizer(value, int)
    if value is not None and min_value is not None:
        value = value if value >= min_value else None
    if value is not None and max_value is not None:
        value = value if value <= max_value else None
    return value


def sanitize_string(value, max_length=None, truncate=True):
    value = type_sanitizer(value, str)
    if value and max_length and len(value) > max_length:
        if truncate:
            value = value[:max_length]
        else:
            value = None
    return value

In [15]:
field_sanitizers = {
    'type_of_work': lambda x: sanitize_string(x, max_length=25),
    'title': lambda x: sanitize_string(x, max_length=250),
    'secondary_title': lambda x: sanitize_string(x, max_length=250),
    'publication_year': lambda x: sanitize_integer(x, max_value=32767),
    'publication_month': lambda x: sanitize_integer(x, max_value=32767),
    'authors': lambda x: [sanitize_string(item, max_length=100) for item in x],
    'abstract': lambda x: sanitize_string(x),
    'keywords': lambda x: [sanitize_string(item, max_length=100) for item in x],
    'type_of_reference': lambda x: sanitize_string(x, max_length=50),
    'journal_name': lambda x: sanitize_string(x, max_length=100),
    'volume': lambda x: sanitize_string(x, max_length=20),
    'issue_number': lambda x: sanitize_string(x, max_length=20),
    'doi': lambda x: sanitize_string(x, max_length=100),
    'issn': lambda x: sanitize_string(x, max_length=20),
    'publisher': lambda x: sanitize_string(x, max_length=100),
    'language': lambda x: sanitize_string(x, max_length=50)
}


def sanitize_record(record):
    sanitized_record = {'other_fields': {}}
    for key, value in record.items():
        try:
            sanitized_record[key] = field_sanitizers[key](value)
        except KeyError:
            sanitized_record['other_fields'][key] = type_sanitizer(value, str)
    return sanitized_record

In [16]:
sanitize_record(record)

{'abstract': 'The amplitude Delta rho(T,H)/rho and temperature T-M, where the colossal\nmagnetoresistance (CMR) response of L(1-x)Ca(x)MnO(3) manganites are\nmaximum, are found to be controlled by the radius of the lanthanide\n(L(3+)) which modifies the bending of the Mn-O-Mn bond. Increasing the\nbond distortion lowers T-M and enhances Delta rho(T,H)/rho. Enhanced CMR\narises from (1) a shift to lon er temperatures of T-M, (2) a reduced\nmobility of the doping holes, and (3) an increase of the coupling\nbetween itinerant and localized electrons. The resistivity rho(H)\nfollows an approximate to BM(2)(H) law and the parameter B is also tuned\nby the Mn-O-Mn bond angle. The narrowing of the electronic bandwidth is\nthe fundamental parameter controlling the observed CMR.',
 'authors': ['Fontcuberta, J',
  'GarciaMunoz, JL',
  'Martinez, B',
  'Obradors, X',
  'Pinol, S',
  'Seffar, A'],
 'doi': '10.1103/PhysRevLett.76.1122',
 'issn': '0031-9007',
 'issue_number': '7',
 'journal_name': 'P

In [25]:
c = cipy.db.Citation(sanitize_record(record))
# c.validate()
c.to_native()

{'abstract': 'The amplitude Delta rho(T,H)/rho and temperature T-M, where the colossal\nmagnetoresistance (CMR) response of L(1-x)Ca(x)MnO(3) manganites are\nmaximum, are found to be controlled by the radius of the lanthanide\n(L(3+)) which modifies the bending of the Mn-O-Mn bond. Increasing the\nbond distortion lowers T-M and enhances Delta rho(T,H)/rho. Enhanced CMR\narises from (1) a shift to lon er temperatures of T-M, (2) a reduced\nmobility of the doping holes, and (3) an increase of the coupling\nbetween itinerant and localized electrons. The resistivity rho(H)\nfollows an approximate to BM(2)(H) law and the parameter B is also tuned\nby the Mn-O-Mn bond angle. The narrowing of the electronic bandwidth is\nthe fundamental parameter controlling the observed CMR.',
 'authors': ['Fontcuberta, J',
  'GarciaMunoz, JL',
  'Martinez, B',
  'Obradors, X',
  'Pinol, S',
  'Seffar, A'],
 'doi': '10.1103/PhysRevLett.76.1122',
 'insert_ts': datetime.datetime(2016, 6, 8, 23, 59, 37, 485215)