In [None]:
import os

import pandas as pd
import geopandas as gpd

In [None]:
# Residential Land Use Codes from MA Dept of Revenue
# https://www.mass.gov/files/documents/2016/08/wr/classificationcodebook.pdf
# The pattern keeps codes that START with 101-109 or 111-119, or with the
# mixed-use codes 013 or 031. Codes often carry suffixes (letters, zeroes or
# nothing), so only the leading three digits are anchored and anything may
# follow them.
# Regex `*?` is a lazy "zero or more" quantifier, not a glob wildcard: putting
# it after the last digit would make that digit optional and let through
# every code beginning with 10, 11, 01 or 03 (e.g. 034, 100).
# NOTE(review): the 11x range comes from the [0-1] character class; confirm
# it is intended alongside 101-109.
USE_CODES = '^1[0-1][1-9]|^013|^031'
def read_res(file_list, uses = USE_CODES):
    """Load assessor tables and keep only the rows whose use code matches.

    Parameters
    ----------
    file_list : iterable of str
        Paths passed one by one to ``geopandas.read_file``. The tables are
        stacked in the given order.
    uses : str
        Regular expression tested against each row's use code with
        ``Series.str.contains``; matching rows are kept.

    Returns
    -------
    pandas.DataFrame
        The selected columns, renamed to ``loc_id``, ``own_name``,
        ``own_add``, ``own_city``, ``own_state``, ``own_zip``,
        ``own_country``, ``use``, ``city`` and ``year``. The index is the
        row position in the stacked table, so it is unique but has gaps
        where rows were filtered out.

    Raises
    ------
    ValueError
        If ``file_list`` is empty.
    KeyError
        If one of the expected source columns is absent from the stacked
        table.
    """
    # Source column -> output column, in output order.
    column_names = {
        'LOC_ID': 'loc_id',
        'OWNER1': 'own_name',
        'OWN_ADDR': 'own_add',
        'OWN_CITY': 'own_city',
        'OWN_STATE': 'own_state',
        'OWN_ZIP': 'own_zip',
        'OWN_CO': 'own_country',
        'USE_CODE': 'use',
        'CITY': 'city',
        'FY': 'year',
    }

    frames = [gpd.read_file(file) for file in file_list]
    if not frames:
        raise ValueError('file_list must contain at least one file')

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated rows on every iteration.
    df = pd.concat(frames, ignore_index=True)
    df = df[list(column_names)].rename(columns=column_names)

    # na=False: a missing use code counts as "no match" instead of producing
    # a NaN in the mask, which boolean indexing rejects.
    df = df[df['use'].str.contains(uses, regex=True, na=False)]
    return df

In [None]:
# Source: MassGIS Standardized Assessor's Parcels
# https://docs.digital.mass.gov/dataset/massgis-data-standardized-assessors-parcels
# Medford, Cambridge, and Somerville were all last updated for FY 2019.
files = [
    'data/som_assess.dbf',
    'data/cam_assess.dbf',
    'data/med_assess.dbf',
]
df = read_res(file_list=files)

In [None]:
import re
# Tokens stripped from owner names before matching. Order matters inside the
# alternation built below: longer phrases must precede their own prefixes
# (e.g. 'TRUSTEES OF' before 'TRUSTEE OF' before 'TRUSTEE' before 'TRUST').
replace_list = ['FAMILY', 'IRREVOCABLE', 'NOMINEE', 'REVOCABLE', 
                'REALTY', 'REAL ESTATE', 'TRUSTEES OF', 'TRUSTEE OF', 
                'TRUSTEE', 'TRST', 'TRUST', 'LTD', 'LLC', 'HOLDINGS', 'REALTORS', 'LIMITED PARTNERSHIP', 
                'FOR LIFE', 'ESTATE OF', 'ESTATE', 'TR.']
def clean(c):
    """Strip the tokens in ``replace_list`` from a Series of names.

    Parameters
    ----------
    c : pandas.Series
        Names to clean. Non-string entries (e.g. None/NaN) are returned
        unchanged.

    Returns
    -------
    pandas.Series
        A new Series in which every literal occurrence of a listed token is
        removed, runs of whitespace are collapsed to a single space and
        leading/trailing whitespace is dropped.
    """
    # Tokens are escaped so that e.g. the '.' in 'TR.' is matched literally.
    # NOTE(review): tokens are matched as substrings, not whole words.
    token_pattern = '|'.join(map(re.escape, replace_list))
    c = c.replace(token_pattern, '', regex=True)
    # Removing a token leaves the spaces that surrounded it; tidy them so
    # 'SMITH FAMILY TRUST' becomes 'SMITH' rather than 'SMITH  '.
    c = c.replace(r'\s+', ' ', regex=True)
    c = c.replace(r'^ | $', '', regex=True)
    return c

# Add the cleaned owner name as its own column, then preview the first rows.
df = df.assign(own_name_clean=clean(df['own_name']))
df.head()

In [None]:
import dedupe

# Columns dedupe compares, all as free-text strings.
fields = [
    {'field': 'own_name_clean', 'type': 'String'},
    {'field': 'own_add', 'type': 'String'},
    {'field': 'own_city', 'type': 'String'},
    {'field': 'own_state', 'type': 'String'}
    ]


def _missing_to_none(value):
    """Return ``value`` if it is a non-blank string, otherwise ``None``."""
    return value if isinstance(value, str) and value.strip() else None


# dedupe takes a {record_id: {column: value}} mapping; record ids are the
# DataFrame index. Its documentation asks for missing values to be None, so
# NaN and blank strings in the compared fields are normalised here. The other
# columns are passed through untouched.
# NOTE(review): if these fields are frequently missing, consider adding
# 'has missing': True to their definitions — confirm against the data.
field_names = [definition['field'] for definition in fields]
df_dict = {
    record_id: {
        **record,
        **{name: _missing_to_none(record[name]) for name in field_names},
    }
    for record_id, record in df.to_dict('index').items()
}

deduper = dedupe.Dedupe(fields)

deduper.prepare_training(df_dict)

In [None]:
# Label candidate record pairs for the deduper from the console. Per the
# dedupe docs this step prompts for user input, so the cell is interactive;
# the labels gathered here are what deduper.train() uses in the next cell.
dedupe.console_label(deduper)

In [None]:
# open() does not create missing directories; make sure the target exists
# before the (potentially slow) training step so a path problem surfaces early.
os.makedirs('training', exist_ok=True)

# Fit the matcher on the labelled pairs, then persist the labels (text/JSON)
# and the learned settings (binary).
deduper.train()
with open('training/training.json', 'w') as tf:
    deduper.write_training(tf)
with open('training/settings', 'wb') as sf:
    deduper.write_settings(sf)

In [None]:
# Partition every record into clusters of presumed-identical owners.
# The second argument is the threshold handed to deduper.partition.
CLUSTER_THRESHOLD = 0.5

print('Clustering...')
clustered_dupes = deduper.partition(df_dict, CLUSTER_THRESHOLD)
n_duplicate_sets = len(clustered_dupes)
print('# duplicate sets', n_duplicate_sets)

In [None]:
# Flatten the clusters into one row per record:
# (record id, cluster id, score for that record, number of records in cluster).
cluster_rows = [
    (record_id, cluster_id, score, len(records))
    for cluster_id, (records, scores) in enumerate(clustered_dupes)
    for record_id, score in zip(records, scores)
]

# Indexed by record id so it lines up with df's index in the join below.
clust = pd.DataFrame(
    [row[1:] for row in cluster_rows],
    columns=['clst', 'conf', 'count'],
    index=[row[0] for row in cluster_rows],
)

# Drop any cluster columns left over from a previous run of this cell first;
# joining onto them again would raise "columns overlap".
df = df.drop(columns=list(clust.columns), errors='ignore').join(clust)

In [None]:
# to_csv does not create missing directories.
os.makedirs('outputs', exist_ok=True)

# set_index returns a new frame rather than modifying df, so chain it into
# the write; the CSV is then keyed by loc_id instead of the positional index.
# df itself is left unchanged so the cell can be re-run safely.
df.set_index('loc_id').to_csv('outputs/parcels_clustered.csv')