In [None]:
import pandas as pd
import geopandas as gpd
import dedupe
import re
import os

In [212]:
# Residential Land Use Codes from MA Dept of Revenue
# https://www.mass.gov/files/documents/2016/08/wr/classificationcodebook.pdf
# Codes are 101*-109*, 031*, and 013*
# Often include suffixes (letters, zeroes or no character), thus regex *?
USE_CODES = '^1[0-1][1-9]*?|^013*?|^031*?'
def read_res(file_list, uses = USE_CODES):
    df = pd.DataFrame()
    for file in file_list:
        df = df.append(gpd.read_file(file), ignore_index=True)
    df = df[df['USE_CODE'].str.contains(uses, regex=True)]
    return df

In [252]:
# Data from MassGIS Standardized Assessor's Parcels
# https://docs.digital.mass.gov/dataset/massgis-data-standardized-assessors-parcels
# Medford, Cambridge, and Somerville all last updated FY 2019
files = ['data/som_assess.dbf', 'data/cam_assess.dbf', 'data/med_assess.dbf']
df = read_res(files)

In [253]:
# Clean owner names!
# TODO: This doesn't seem to work in order, leading to 'EE', 'EES', etc.
# Replace in order!
replace_list = ['FAMILY', 'IRREVOCABLE', 'NOMINEE', 'REVOCABLE', 
                'REALTY', 'REAL ESTATE', 'TRUSTEES OF', 'TRUSTEE OF', 'TR.' 
                'TRUSTEE', 'TRST', 'TRUST', 'LTD', 'LLC', 'HOLDINGS', 'REALTORS', 'LIMITED PARTNERSHIP', 
                'FOR LIFE', 'LIFE ESTATE OF', 'ESTATE OF', 'ESTATE']
def clean(c):
    c = c.replace('|'.join(map(re.escape, replace_list)), '', regex=True)
    return c

df['OWN_NAME_CL'] = clean(df['OWNER1'])
# Create full concatenated address column.
df['ADDR_FULL'] = [', '.join(filter(None, (str(a), str(b), str(c), d))) for a, b, c, d in zip(df['OWN_ADDR'], df['OWN_CITY'], df['OWN_STATE'], df['OWN_CO'])]
# Set 'LOC_ID' as the index - unique identifier.
df.set_index('LOC_ID')
# Drop (empty) geometry column.
df.drop('geometry', inplace=True, axis=1)
# Convert to dictionary (expected by Dedupe)
df_dict = df.to_dict('index')

In [259]:
settings_file = 'training/learned_settings'
training_file = 'training/training.json'

# If settings exist, read from existing.
if os.path.exists(settings_file):
    print('Reading learned settings from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # Tell Dedupe which fields are used to identify duplicates.
    fields = [
        {'field': 'OWN_NAME_CL', 'type': 'String'},
        {'field': 'OWN_ADDR', 'type': 'String'},
        {'field': 'OWN_CITY', 'type': 'String'},
        {'field': 'OWN_STATE', 'type': 'String'}
        ]
    deduper = dedupe.Dedupe(fields)
    # If training file exists, read it...
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(df_dict, f)
    # Otherwise, prepare a training set...
    else:
        deduper.prepare_training(df_dict)
    # Start supervised labeling.
    dedupe.console_label(deduper)
    deduper.train()
    # Write settings and training sets.
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)
    with open(settings_file, 'wb') as sf:
        deduper.write_settings(sf)

# Identify clusters based on training dataset.
# Higher threshold is less tolerant of differences between names/addresses.
clustered_dupes = deduper.partition(df_dict, threshold = 0.5)

# How many sets are there?
print('Number of sets', len(clustered_dupes))

INFO:dedupe.api:((SimplePredicate: (sameSevenCharStartPredicate, OWN_NAME_CL), SimplePredicate: (sameThreeCharStartPredicate, OWN_CITY)), (SimplePredicate: (fingerprint, OWN_ADDR), SimplePredicate: (oneGramFingerprint, OWN_STATE)))
INFO:dedupe.blocking:10000, 0.1831782 seconds


Reading learned settings from training/learned_settings


INFO:dedupe.blocking:20000, 0.3771302 seconds
INFO:dedupe.blocking:30000, 0.5649152 seconds
INFO:dedupe.blocking:40000, 0.7501852 seconds
INFO:dedupe.blocking:50000, 0.9332142 seconds


Number of sets 47615


In [255]:
# Create empty arrays to hold results.
rid = []
clst = []
conf = []
count = []

# Iterate over results...
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        # How many properties does individual own?
        count.append(len(records))
        # Append record id
        # Corresponds to index of assessor dataframe.
        rid.append(record_id)
        # Append cluster ID.
        clst.append(cluster_id)
        # Append confidence score.
        conf.append(score)

# Build new dataframe using result arrays.
clust = pd.DataFrame(list(zip(clst, conf, count)), 
                  columns =['CLST', 'CONF', 'COUNT'],
                  index = rid
                 )

# Join clusters to assessors dataframe.
df = df.join(clust)

# Write assessor's dataframe to CSV.
df.to_csv('outputs/parcels_clustered.csv')

In [257]:
# Dissolve clusters into owners list.
owners = df.sort_values('CONF').groupby('CLST').agg(
    # Owner with highest confidence.
    own = ('OWNER1', 'first'),
    # List of all unique owners.
    own_list = ('OWNER1', 'unique'),
    # Address with highest confidence
    add = ('ADDR_FULL', 'first'),
    # List of all unique addresses.
    add_list = ('ADDR_FULL', 'unique')
)

# Write owners to CSV.
owners.to_csv('outputs/owners.csv')

In [258]:
# Read spatial data. Unified parcel data available from MassGIS
# https://docs.digital.mass.gov/dataset/massgis-data-standardized-assessors-parcels
parcels_gdf = gpd.read_file('data/mamas_parcels.shp')

# Merge parcels and dataframe on 'LOC_ID' column
# Right join because we want to duplicate parcels with multiple owners
# E.g., condos, multiply-owned triple-deckers...
parcels_joined = parcels_gdf.merge(df, on='LOC_ID', how='right')

# Remove records without geometries.
parcels_joined = parcels_joined[parcels_joined['geometry'] != None]

# Dissolve polygons to multi-polygons on cluster.
parcels_multi = parcels_joined[['CLST', 'geometry', 'COUNT']].dissolve(by='CLST')

# Write parcel multi-polygons to file.
parcels_multi.to_file("parcels_multi.shp")