In [1]:
from future.builtins import next
import os
import csv
import re
import logging

import dedupe
from unidecode import unidecode

In [2]:
# ## Logging
# Raise the root logger's threshold so only warnings and above are emitted.
logging.getLogger().setLevel(logging.WARNING)

In [3]:
def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and Regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.

    The value is passed through `unidecode`, runs of two or more spaces are
    collapsed to one, newlines become spaces, surrounding whitespace and
    quote characters are stripped, and the text is lower-cased.

    Parameters
    ----------
    column : str, bytes or None
        Raw field value. Objects with a ``decode`` method (byte strings) are
        decoded as UTF-8 before cleaning.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` if the value is missing or nothing is
        left after cleaning.
    """
    # If data is missing, indicate that by returning `None`. Only this case
    # may return early: every other value must go through the cleaning below.
    if not column:
        return None

    try:  # python 2/3 string differences: only byte strings have .decode
        column = column.decode('utf8')
    except AttributeError:
        pass
    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # Cleaning can leave an empty string (e.g. a field holding only quotes);
    # treat that as missing data too.
    return column or None

def readData(filename, encoding, delimiter, header, keyfield, maxRecordCount=None):
    """
    Read in our data from a CSV file and create a dictionary of records,
    where the key is a unique record ID and each value is a dict.
    If no latitude or longitude exists the record is thrown away.

    Every field value is passed through `preProcess`. Records that are kept
    get an extra 'geometry' entry holding ``(latitude, longitude)`` as floats;
    the 'latitude' and 'longitude' entries themselves stay as cleaned text.

    Parameters
    ----------
    filename : str
        Path of the delimited text file to read.
    encoding : str
        Text encoding used to open the file.
    delimiter : str
        Field separator passed to the csv reader.
    header : list of str
        Column names, in file order; the file itself is read without a
        header row.
    keyfield : str
        Name of the column holding the integer record ID.
    maxRecordCount : int or None
        Stop after this many records have been accepted. ``None`` (or 0)
        means no limit.

    Returns
    -------
    dict
        Maps ``int`` record ID to the cleaned record dict.

    Raises
    ------
    ValueError
        If a latitude, longitude or key value is present but not numeric.
    TypeError
        If the key field is absent from a kept row.
    """
    data_d = {}
    recordCount = 0
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires for file objects.
    with open(filename, encoding=encoding, newline='') as f:
        reader = csv.DictReader(f, fieldnames=header, delimiter=delimiter, quoting=csv.QUOTE_NONE)
        for row in reader:
            clean_row = {k: preProcess(v) for (k, v) in row.items()}
            latitude = clean_row.get('latitude')
            longitude = clean_row.get('longitude')
            # Test for absence explicitly: a coordinate of exactly 0.0 is
            # valid and must not be mistaken for a missing value.
            if latitude is None or longitude is None:
                continue
            clean_row['geometry'] = (float(latitude), float(longitude))
            row_id = int(row[keyfield])
            data_d[row_id] = clean_row
            recordCount += 1
            if maxRecordCount and maxRecordCount == recordCount:
                break

    return data_d

In [4]:
# ## Import data
# Load the tab-separated file at `filename` into `data`, keyed by 'geonameid'.
filename = r'D:/Geonames/cities1000.txt'
# Column names in file order; they are supplied here rather than read from
# the file.
header = [
    'geonameid', 'name', 'asciiname', 'alternatenames',
    'latitude', 'longitude',
    'feature class', 'feature code',
    'country code', 'cc2',
    'admin1 code', 'admin2 code', 'admin3 code', 'admin4 code',
    'population', 'elevation', 'dem',
    'timezone', 'modification date',
]
keyfield = 'geonameid'
maxRecordCount = None  # no limit: read every record
data = readData(
    filename,
    encoding='utf-8',
    delimiter='\t',
    header=header,
    keyfield=keyfield,
    maxRecordCount=maxRecordCount,
)

In [5]:
# ## Read settings
# Build the matcher from the binary settings file on disk
# (presumably written by an earlier training run -- confirm).
settingsFile = r'D:/Geonames/train.settings'
with open(settingsFile, mode='rb') as f:
    matcher = dedupe.StaticDedupe(f)

In [6]:
# ## Find best threshold and do the clustering
# ## and write the clusters to a file
threshold = matcher.threshold(data)
matches = matcher.match(data, threshold)
matchFile = r'D:/Geonames/matches.txt'
# newline='' disables text-mode newline translation, so each row ends with
# exactly the '\n' requested through `lineterminator` on every platform.
with open(matchFile, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    for (clusterId, cluster) in enumerate(matches):
        # Each cluster unpacks into two parts; only the record ids are used
        # (the second part is presumably the match scores).
        ids, scored = cluster
        # One row with the cluster number, then one row per member record.
        writer.writerow([clusterId])
        # Named `record_id` so the builtin `id` is not overwritten in the
        # notebook namespace.
        for record_id in ids:
            writer.writerow(data[record_id].values())

