In [None]:
import pandas as pd
import geopandas as gpd
import dedupe
import re
import os

In [None]:
# Residential land use codes from the MA Dept. of Revenue classification codebook:
# https://www.mass.gov/files/documents/2016/08/wr/classificationcodebook.pdf
# Intended codes are 101-109, 013 and 031, optionally followed by a suffix
# (letters, zeroes, or nothing).
# NOTE(review): the lazy `*?` quantifiers below always match zero characters,
# so this pattern effectively selects every code starting with 10, 11, 01 or
# 03 -- broader than the list above. Confirm whether that is intended before
# tightening it (e.g. to '^10[1-9]|^013|^031').
USE_CODES = '^1[0-1][1-9]*?|^013*?|^031*?'
def read_res(file_list, uses = USE_CODES):
    """Read assessor tables and keep only rows with a residential use code.

    Parameters
    ----------
    file_list : iterable of str
        Paths readable by ``geopandas.read_file``.
    uses : str
        Regular expression searched for in the ``USE_CODE`` column.

    Returns
    -------
    DataFrame
        All files stacked with a fresh 0..n-1 index, then filtered to the
        rows whose ``USE_CODE`` matches ``uses``. An empty ``DataFrame`` is
        returned when ``file_list`` is empty.
    """
    frames = [gpd.read_file(path) for path in file_list]
    if not frames:
        # pd.concat raises on an empty list; no input simply means no rows.
        return pd.DataFrame()
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0
    # and quadratic when called in a loop).
    parcels = pd.concat(frames, ignore_index=True)
    # na=False: rows without a use code are treated as non-residential rather
    # than producing a NaN mask that cannot be used for boolean indexing.
    is_residential = parcels['USE_CODE'].str.contains(uses, regex=True, na=False)
    return parcels[is_residential]

In [None]:
# Data from MassGIS Standardized Assessor's Parcels
# https://docs.digital.mass.gov/dataset/massgis-data-standardized-assessors-parcels
# Medford, Cambridge, and Somerville all last updated FY 2019
files = ['data/som_assess.dbf', 'data/cam_assess.dbf', 'data/med_assess.dbf']
# Residential parcels from all three assessor tables, stacked into one frame.
df = read_res(files)

In [None]:
# Legal-entity / trust boilerplate removed from owner names before matching.
# Order matters: regex alternation takes the first alternative that matches,
# so longer phrases ('TRUSTEES OF') are listed before their prefixes
# ('TRUSTEE', 'TRUST').
replace_list = ['FAMILY', 'IRREVOCABLE', 'NOMINEE', 'REVOCABLE', 
                'REALTY', 'REAL ESTATE', 'TRUSTEES OF', 'TRUSTEE OF', 
                'TRUSTEE', 'TRST', 'TRUST', 'LTD', 'LLC', 'HOLDINGS', 'REALTORS', 'LIMITED PARTNERSHIP', 
                'FOR LIFE', 'LIFE ESTATE', 'ESTATE OF', 'ESTATE', 'TR.']
def clean(c):
    """Strip boilerplate terms from a Series of owner names.

    Every occurrence of a term in ``replace_list`` is removed (terms are
    matched literally, as substrings). Whitespace left behind by the removal
    is then collapsed to single spaces and trimmed, so that e.g.
    'SMITH FAMILY TRUST' and 'SMITH' clean to the same string.

    Parameters
    ----------
    c : pandas.Series
        Owner names. Non-string entries (e.g. NaN) are passed through as-is.

    Returns
    -------
    pandas.Series
        Cleaned names, same index as ``c``.
    """
    boilerplate = '|'.join(map(re.escape, replace_list))
    stripped = c.replace(boilerplate, '', regex=True)
    # Series.replace (rather than .str) leaves non-string values untouched
    # and works even when the column holds no strings at all.
    collapsed = stripped.replace(r'\s+', ' ', regex=True)
    # After collapsing, any leading/trailing whitespace is a single space.
    return collapsed.replace(r'^ | $', '', regex=True)

df['OWN_NAME_CL'] = clean(df['OWNER1'])
# Records for dedupe, keyed by the frame's index label. Missing values (NaN)
# and empty strings are converted to None, which is how dedupe's documentation
# asks for missing data to be represented; NaN floats or '' would otherwise be
# handed to the string comparators.
df_dict = {
    record_id: {
        field: (None if pd.isna(value) or value == '' else value)
        for field, value in record.items()
    }
    for record_id, record in df.to_dict('index').items()
}
# Last expression of the cell so the preview is actually displayed.
df.head()

In [None]:
# Where the trained model and the hand-labeled examples are persisted.
settings_file = 'training/learned_settings'
training_file = 'training/training.json'
# Score threshold passed to deduper.partition when forming clusters.
cluster_threshold = 0.5

if os.path.exists(settings_file):
    # A previously trained model exists: reuse it and skip labeling entirely.
    print('Reading learned settings from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # Make sure the output directory exists *before* the interactive
    # labeling session, so the labels cannot be lost to a failed write.
    for path in (training_file, settings_file):
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)

    # Fields compared between records; all are treated as free-text strings.
    fields = [
        {'field': 'OWN_NAME_CL', 'type': 'String'},
        {'field': 'OWN_ADDR', 'type': 'String'},
        {'field': 'OWN_CITY', 'type': 'String'},
        {'field': 'OWN_STATE', 'type': 'String'}
        ]
    deduper = dedupe.Dedupe(fields)
    if os.path.exists(training_file):
        # Seed the session with labels saved by an earlier run.
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(df_dict, f)
    else:
        deduper.prepare_training(df_dict)
    print('Starting active labeling...')
    dedupe.console_label(deduper)
    deduper.train()
    # Persist both the labels and the learned settings for later runs.
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)
    with open(settings_file, 'wb') as sf:
        deduper.write_settings(sf)

print('Clustering...')
clustered_dupes = deduper.partition(df_dict, cluster_threshold)
print('Number of sets', len(clustered_dupes))

In [50]:
# Flatten the clustering result into one row per record:
# (record id, cluster id, that record's score, size of its cluster).
# Each element of clustered_dupes is unpacked as a (records, scores) pair.
members = [
    (record_id, cluster_id, score, len(records))
    for cluster_id, (records, scores) in enumerate(clustered_dupes)
    for record_id, score in zip(records, scores)
]

# Indexed by record id so it lines up with df's index in the join below.
clust = pd.DataFrame(
    [row[1:] for row in members],
    columns=['CLST', 'CONF', 'COUNT'],
    index=[row[0] for row in members],
)

# Drop cluster columns left over from a previous run of this cell first;
# joining twice would otherwise fail on overlapping column names.
df = df.drop(columns=clust.columns, errors='ignore').join(clust)

In [51]:
# set_index returns a new frame rather than modifying df, so chain it into
# the export; otherwise the CSV is written with the old integer index.
os.makedirs('outputs', exist_ok=True)
df.set_index('LOC_ID').to_csv('outputs/parcels_clustered.csv')