In [1]:
from future.builtins import next
import os
import csv
import re
import logging

import dedupe
from unidecode import unidecode

In [2]:
# ## Logging
logging.getLogger().setLevel(logging.WARN)

In [57]:
def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and Regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.
    """
    # If data is missing, indicate that by setting the value to `None`
    if not column:
        column = None
    return column
    
    try : # python 2/3 string differences
        column = column.decode('utf8')
    except AttributeError:
        pass
    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by setting the value to `None`
    if not column:
        column = None
    return column

def readData(filename, encoding, delimiter, header, keyfield, maxRecordCount=None):
    """
    Read in our data from a CSV file and create a dictionary of records, 
    where the key is a unique record ID and each value is dict
    """

    data_d = {}
    recordCount = 0
    with open(filename, encoding=encoding) as f:
        reader = csv.DictReader(f, fieldnames=header, delimiter=delimiter, quoting=csv.QUOTE_NONE)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row[keyfield])
            data_d[row_id] = dict(clean_row)
            recordCount += 1
            if (maxRecordCount and maxRecordCount == recordCount):
                return data_d

    return data_d

In [58]:
# ## Import training data
filename = r'D:/Geonames/geonames_modifications.tsv'
header = ['id', 'geonameid','name','asciiname','alternatenames','latitude','longitude','feature class','feature code','country code','cc2','admin1 code','admin2 code','admin3 code','admin4 code','population','elevation','dem','timezone','modification date']
keyfield = 'id'
maxRecordCount = 100
trainingData = readData(filename, 'utf-8', '\t', header, keyfield, maxRecordCount)

In [61]:
# ## Training
fields = [
    { 'field':'name', 'type':'String' },
    { 'field':'asciiname', 'type':'String' },
    { 'field':'latitude', 'type':'String' },
    { 'field':'longitude', 'type':'String' },
    { 'field':'country code', 'type':'Exact', 'has missing':True }
]
commonField = 'geonameid'

# Create labeled data
trainingSize = int(0.8*len(trainingData))
labeledData = dedupe.trainingDataDedupe(trainingData, commonField, trainingSize)
print(labeledData)

# Create the matcher
sampleSize = int(0.2*len(trainingData))
matcher = dedupe.Dedupe(fields)
matcher.sample(trainingData, sampleSize)
matcher.markPairs(labeledData)
matcher.train()

# When finished, save our training to disk
trainingFile = r'D:/Geonames/geonames_modifications_training.json'
with open(trainingFile, 'w') as tf:
    matcher.writeTraining(tf)
    
# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
settingsFile = r'D:/Geonames/geonames_modifications.settings'
with open(settingsFile, 'wb') as sf:
    matcher.writeSettings(sf)
    
matcher.cleanupTraining()
del matcher

{'match': [], 'distinct': [({'id': '7847566', 'geonameid': '7326366', 'name': 'Ankar Nala', 'asciiname': 'Ankar Nala', 'alternatenames': None, 'latitude': '32.803450000', 'longitude': '72.261370000', 'feature class': 'H ', 'feature code': 'STMI', 'country code': 'PK', 'cc2': None, 'admin1 code': '04', 'admin2 code': None, 'admin3 code': None, 'admin4 code': None, 'population': '0', 'elevation': None, 'dem': '560', 'timezone': 'Asia/Karachi', 'modification date': '2018-04-07'}, {'id': '7847403', 'geonameid': '7325440', 'name': 'Chāh Māchhiwālā', 'asciiname': 'Chah Machhiwala', 'alternatenames': None, 'latitude': '31.552400000', 'longitude': '72.395210000', 'feature class': 'P ', 'feature code': 'PPL', 'country code': 'PK', 'cc2': None, 'admin1 code': '04', 'admin2 code': None, 'admin3 code': None, 'admin4 code': None, 'population': '0', 'elevation': None, 'dem': '164', 'timezone': 'Asia/Karachi', 'modification date': '2010-08-06'}), ({'id': '7847553', 'geonameid': '7326291', 'name': 'Us

TypeError: descriptor 'union' of 'set' object needs an argument

In [None]:
# ## Import real data
filename = r'D:/Geonames/cities1000.txt'
header = ['geonameid','name','asciiname','alternatenames','latitude','longitude','feature class','feature code','country code','cc2','admin1 code','admin2 code','admin3 code','admin4 code','population','elevation','dem','timezone','modification date']
keyfield = 'geonameid'
geonames = readData(filename, 'utf-8', '\t', header, keyfield)

# Create the matcher from the settings file
with open(settingsFile, 'rb') as f:
    matcher = dedupe.StaticDedupe(f)
    threshold = matcher.threshold(geonames)
    matches = matcher.match(geonames, threshold)
    print('%s duplicates found.' % len(matches))

    del matcher

def printMatches(matches):
    for (clusterId, cluster) in enumerate(matches):
        ids, scored = cluster
        print(clusterId)
        for id in ids:
            print (geonames[id]['name'])