In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import dedupe
import re
import os

In [2]:
# Residential Land Use Codes from MA Dept of Revenue
# https://www.mass.gov/files/documents/2016/08/wr/classificationcodebook.pdf
# Codes are 101*-109*, 031*, and 013*
# Often include suffixes (letters, zeroes or no character), thus regex *?
USE_CODES = '^1[0-1][1-9]*?|^013*?|^031*?'
BOS_CODES = '^R[1-4]$|^RC$|^RL$|^CD$|^A$'
def read_res(file_dict):
    df = pd.DataFrame()
    for town, file, in file_dict.items():
        town_df = gpd.read_file(file).drop('geometry', axis='columns')
        town_df['town'] = town
        df = df.append(town_df, ignore_index=True)
    return df

In [3]:
# Data from MassGIS Standardized Assessor's Parcels
# https://docs.digital.mass.gov/dataset/massgis-data-standardized-assessors-parcels
# Medford, Cambridge, and Somerville all last updated FY 2019

In [4]:
# Somerville, Medford processing.

files = {'som': './data/assess/som_assess.dbf', 
          'med': './data/assess/med_assess.dbf'}
som_med = read_res(files)
# Rename column to lower-case.
som_med.columns = som_med.columns.str.lower()
# Filter for residential parcels.
som_med = som_med[som_med['use_code'].str.contains(USE_CODES, regex=True)]
# Identify rows with co-owner names erroneously listed in address column.
mask = som_med.own_addr.str.contains(pat = '|'.join(['^C/O', '^[A-Za-z]']), na=False) & ~som_med.own_addr.str.contains(pat = '|'.join(['^PO', '^P.O.', '^P. O.', '^ONE', '^BOX', '^ZERO']), na=False)
# Add co-owners identified to co column.
som_med['co'] = som_med.own_addr[mask]
som_med.loc[~mask, 'co'] = None
# Fill own_addr with none for above-identified rows.
som_med.loc[mask, 'own_addr'] = None
# Remame columns
som_med = som_med.rename(columns = {
    'prop_id': 'gisid',
    'owner1': 'own',
    'site_addr': 'prop_addr'
})
# Replace underscores with hyphens.
som_med.loc[:,'gisid'] = som_med.gisid.str.replace(r'_', '-', regex=True)
# Concatenate address.
som_med.loc[:,'own_addr'] = [', '.join((str(a), str(b), str(c))) for a, b, c in zip(som_med['own_addr'], som_med['own_city'], som_med['own_state'])]
som_med.loc[:,'own_addr'] = [' '.join((str(a), str(b))) for a, b in zip(som_med['own_addr'], som_med['own_zip'])]
# Remove concatenated Nones.
som_med = som_med.replace({r'None, ': ''}, regex=True)
# Filter columns.
som_med = som_med[['gisid', 'town', 'prop_addr', 'own', 'co', 'own_addr']]

In [5]:
bos = pd.read_csv('./data/assess/bos_assess.csv', dtype={'GIS_ID': str, 'MAIL_ZIPCODE': str, 'U_TOT_RMS': str})
bos.columns = bos.columns.str.lower()
bos = bos.rename(columns = {
    'gis_id': 'gisid',
    'owner': 'own',
    'mail_addressee': 'co'
})
bos['town'] = 'bos'
# Filter by residential property types.
bos = bos[bos['lu'].str.contains(BOS_CODES, regex=True)]
bos.loc[:, 'gisid'] = bos.gisid.str.strip().str.pad(width=10, side='left', fillchar='0')
# Pad ZIP code with zeroes, remove 4-digit suffix.
bos.loc[:,'mail_zipcode'] = bos.mail_zipcode.astype(str).str.strip().str.pad(width=5, side='left', fillchar='0')
# Add comma between city and state.
bos.loc[:,'mail cs'] = bos['mail cs'].str.rsplit(' ', 1).apply(lambda x: ', '.join(x))
# Concatenate property address components
bos.loc[:,'prop_addr'] = [' '.join((str(a), str(b), str(c))) for a, b, c in zip(bos['st_num'], bos['st_name'], bos['st_name_suf'])]
bos.loc[:,'prop_addr'] = [' #'.join((str(a), str(b))) for a, b in zip(bos['prop_addr'], bos['unit_num'])]
# Concatenate owner address components.
bos.loc[:,'own_addr'] = [', '.join((str(a), str(b))) for a, b in zip(bos['mail_address'], bos['mail cs'])]
bos.loc[:,'own_addr'] = [' '.join((str(a), str(b))) for a, b in zip(bos['own_addr'], bos['mail_zipcode'])]
# Filter columns
bos = bos[['gisid', 'town', 'prop_addr', 'own', 'co', 'own_addr']]
# Replace blank strings with None (necessary for dedupe).
bos = bos.replace({' ': None, '': None, r' #nan': None})
bos = bos.replace({r' #nan': ''}, regex=True)

In [6]:
cam = pd.read_csv('./data/assess/cam_assess.csv', dtype=str)
# cam = cam.astype(str).replace(r'nan', '', regex=True)
# rename all columns to lowercase
cam.columns = cam.columns.str.lower()
# Filter for residential properties.
cam = cam[cam['stateclasscode'].str.contains(USE_CODES, regex=True)]
# Pad zip to five digits and remove 4-digit zip suffix.
cam.loc[:,'owner_zip'] = cam['owner_zip'].str.rsplit('-', 1).str[0]
# Identify rows with co-owner names erroneously listed in address column.
mask = cam.owner_address.str.contains(pat = '|'.join(['^C/O', '^ATTN:']), na=False)
cam.loc[mask, 'owner_address'] = None
# Add co-owners identified to co column.
cam.loc[mask, 'owner_coownername'] = [', '.join((str(a), str(b)))  for a, b in zip(cam.loc[mask, 'owner_coownername'], cam.loc[mask, 'owner_address'])]
# Concatenate owner address components
cam.loc[:,'own_addr'] = [', '.join((str(a), str(b), str(c), str(d))) for a, b, c, d in zip(cam['owner_address'], cam['owner_address2'], cam['owner_city'], cam['owner_state'])]
cam.loc[:,'own_addr'] = [' '.join((str(a), str(b))) for a, b in zip(cam['own_addr'], cam['owner_zip'])]
# Clean property address column
cam['prop_addr'] = cam['address'].str.rsplit('(', 1).apply(lambda x: x[0].replace('\n', ' ').strip())
cam['town'] = 'cam' 
cam = cam.rename(columns = {
    'owner_name': 'own',
    'owner_coownername': 'co'
})
cam = cam.replace({r'^, ': '', r' ,': '', r', nan': '', r'None, ': '', r', None': ''}, regex=True)
cam = cam.replace({' ': None, '': None, np.nan: None})
cam = cam[['gisid', 'town', 'prop_addr', 'own', 'co', 'own_addr']]

In [9]:
brook = pd.read_csv('./data/assess/brook_assess.csv', dtype=str)
brook.columns = brook.columns.str.lower()
brook = brook[brook['usecd'].str.contains(USE_CODES, regex=True)]

brook.loc[:,'zip'] = brook['zip'].str.rsplit('-', 1).str[0]
# Name town.
brook['town'] = 'brk' 
# Concatenate address.
brook.loc[:,'own_addr'] = [', '.join((str(a), str(b), str(c))) for a, b, c, in zip(brook['address'], brook['city'], brook['state'])]
# Append zip to address with no comma.
brook.loc[:,'own_addr'] = [' '.join((str(a), str(b))) for a, b in zip(brook['own_addr'], brook['zip'])]
# Concatenate property address components
brook.loc[:,'prop_addr'] = [''.join((str(a), str(b))) for a, b in zip(brook['addno1'], brook['addno2'])]
brook.loc[:,'prop_addr'] = [' '.join((str(a), str(b))) for a, b in zip(brook['prop_addr'], brook['addst1'])]
brook.loc[:,'prop_addr'] = [' '.join((str(a), str(b))) for a, b in zip(brook['prop_addr'], brook['addst2'])]
# Append 
brook.loc[:,'own'] = [' '.join((str(a), str(b))) for a, b in zip(brook['firstname1'], brook['lastname1'])]
brook.loc[:,'co'] = [' '.join((str(a), str(b))) for a, b in zip(brook['firstname2'], brook['lastname2'])]
brook = brook.replace({' ': None, '': None})
brook = brook.rename(columns = {
    'parcel-id': 'gisid'
})
brook = brook.replace({r'^, ': '', r' ,': '', r', nan': '', r'nan': '', r'None, ': '', r', None': ''}, regex=True)
brook = brook.replace({' ': None, '': None})
brook = brook[['gisid', 'town', 'prop_addr', 'own', 'co', 'own_addr']]

In [10]:
all_assess = pd.concat([som_med, cam, bos, brook], ignore_index=True)
# Convert to dictionary (expected by Dedupe)
all_assess_dict = all_assess.to_dict('index')

In [11]:
settings_file = './training/learned_settings'
training_file = './training/training.json'

# If settings exist, read from existing.
if os.path.exists(settings_file):
    print('Reading learned settings from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # Tell Dedupe which fields are used to identify duplicates.
    fields = [
        {'field': 'own', 'variable name': 'own', 'type': 'Name'},
        {'field': 'co', 'variable name': 'co', 'type': 'Name'},
        {'field': 'own_addr', 'variable name': 'own_addr', 'type': 'Address'},
        {'type': 'Interaction', 'interaction variables': ['own', 'co']}
        ]
    deduper = dedupe.Dedupe(fields)
    # If training file exists, read it...
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(all_assess_dict, f)
    # Otherwise, prepare a training set...
    else:
        deduper.prepare_training(all_assess_dict)
    # Start supervised labeling.
    dedupe.console_label(deduper)
    deduper.train()
    # Write settings and training sets.
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)
    with open(settings_file, 'wb') as sf:
        deduper.write_settings(sf)

INFO:dedupe.api:((SimplePredicate: (sameSevenCharStartPredicate, own_addr), SimplePredicate: (twoGramFingerprint, own_addr)), (PartialPredicate: (sameFiveCharStartPredicate, co, CorporationName), SimplePredicate: (sortedAcronym, co)), (SimplePredicate: (oneGramFingerprint, own), SimplePredicate: (twoGramFingerprint, own)))


Reading learned settings from ./training/learned_settings


In [12]:
# Identify clusters based on training dataset.
# Higher threshold is less tolerant of differences between names/addresses.
clustered_dupes = deduper.partition(all_assess_dict, threshold = 0.3)

# How many sets are there?
print('Number of sets', len(clustered_dupes))

INFO:dedupe.blocking:10000, 0.2947482 seconds
INFO:dedupe.blocking:20000, 0.6059692 seconds
INFO:dedupe.blocking:30000, 0.9666942 seconds
INFO:dedupe.blocking:40000, 2.1819222 seconds
INFO:dedupe.blocking:50000, 3.5357282 seconds
INFO:dedupe.blocking:60000, 4.8025012 seconds
INFO:dedupe.blocking:70000, 6.0673062 seconds
INFO:dedupe.blocking:80000, 7.6661502 seconds
INFO:dedupe.blocking:90000, 9.3774622 seconds
INFO:dedupe.blocking:100000, 11.1317852 seconds
INFO:dedupe.blocking:110000, 12.5258602 seconds
INFO:dedupe.blocking:120000, 13.7896282 seconds
INFO:dedupe.blocking:130000, 14.9698532 seconds
INFO:dedupe.blocking:140000, 16.0769922 seconds
INFO:dedupe.blocking:150000, 17.0900552 seconds
INFO:dedupe.blocking:160000, 18.1798332 seconds
INFO:dedupe.blocking:170000, 19.3415222 seconds
INFO:dedupe.blocking:180000, 20.6225272 seconds
INFO:dedupe.blocking:190000, 22.0231662 seconds
INFO:dedupe.blocking:200000, 23.2216452 seconds
INFO:dedupe.blocking:210000, 24.2516682 seconds


Number of sets 178679


In [13]:
# Create empty arrays to hold results.
rid = []
clst = []
conf = []
count = []

# Iterate over results...
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        # How many properties does individual own?
        count.append(len(records))
        # Append record id
        # Corresponds to index of assessor dataframe.
        rid.append(record_id)
        # Append cluster ID.
        clst.append(cluster_id)
        # Append confidence score.
        conf.append(score)

# Build new dataframe using result arrays.
clust = pd.DataFrame(list(zip(clst, conf, count)), 
                  columns =['clst', 'conf', 'count'],
                  index = rid
                 )

In [14]:
# Join clusters to assessors dataframe.
all_assess = all_assess.join(clust)

In [15]:
# Dissolve clusters into owners list.
owners = all_assess.sort_values('conf').groupby('clst').agg(
    # Owner with highest confidence.
    own = ('own', lambda x: x.iloc[0]),
    # List of all unique owners.
    own_list = ('own', lambda x: list(x.unique())),
    # Owner with highest confidence.
    coown = ('co', lambda x: x.iloc[0]),
    # List of all unique owners.
    coown_list = ('co', lambda x: list(x.unique())),
    # Address with highest confidence
    own_add = ('own_addr', lambda x: x.iloc[0]),
    # List of all unique addresses.
    own_adds = ('own_addr', lambda x: list(x.unique())),
    # List of all unique property addresses.
    prop_list = ('prop_addr', lambda x: list(x.unique())),
)

In [16]:
# Read spatial data
parcels_gdf = gpd.read_file('./data/parcels/mamas_parcels.shp')

In [17]:
# Merge parcels and dataframe on 'LOC_ID' column
# Right join because we want to duplicate parcels with multiple owners
# E.g., condos, multiply-owned triple-deckers...
parcels_joined = parcels_gdf.merge(all_assess, left_on=['town', 'pid'], right_on=['town','gisid'])

# Remove records without geometries.
parcels_joined = parcels_joined[parcels_joined.geometry != None]
parcels_joined.loc[:,'geometry'] = parcels_joined.geometry.centroid
# parcels_joined.to_file("parcels_joined.geojson", driver='GeoJSON')

# Dissolve polygons to multi-polygons on cluster.
parcels_multi = parcels_joined[['clst', 'geometry', 'count', '']].dissolve(by='clst')
parcels_multi = parcels_multi.merge(owners, on='clst')

In [18]:
parcels_multi.head()

Unnamed: 0_level_0,geometry,count,own,own_list,co,co_list,own_add,own_adds,prop_list
clst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4,"MULTIPOINT (-71.12107 42.42081, -71.05426 42.2...",3,SWEENEY JOHN J,[SWEENEY JOHN J],C/O KATHLEEN M SWEENEY,"[C/O KATHLEEN M SWEENEY, None]","14 ASHMONT CT, DORCHESTER, MA 02122","[14 ASHMONT CT, DORCHESTER, MA 02122, 256 HIGH...","[14 ASHMONT CT, 00256 HIGH ST, 03920 MYSTIC VL..."
7,POINT (-71.15726 42.34981),2,DUTA NICOLAE,[DUTA NICOLAE],,[None],"31 FOSTER ST, BRIGHTON, MA 02135","[31 FOSTER ST, BRIGHTON, MA 02135]","[31 FOSTER ST, 03920 MYSTIC VLLY PY U523]"
13,POINT (-71.12679 42.42487),3,JUDY MASTROCOLA REVOCABLE TRUST,"[JUDY MASTROCOLA REVOCABLE TRUST, MASTROCOLA E...",C/O MASTROCOLA MANAGEMENT INC,"[C/O MASTROCOLA MANAGEMENT INC, C/O MASTROCOLA...","W MEDFORD, MA 02156","[W MEDFORD, MA 02156, W. MEDFORD, MA 02156]","[00151 MYSTIC ST, 03920 MYSTIC VLLY PY U504, ..."
14,POINT (-71.13819 42.42232),2,GARRITY DAVID J,"[GARRITY DAVID J, GARRITY DAVID J & KAREN]",,[None],"29 SAGAMORE AVE, MEDFORD, MA 02155","[29 SAGAMORE AVE, MEDFORD, MA 02155]","[00029 SAGAMORE AV, 03920 MYSTIC VLLY PY U507]"
16,"MULTIPOINT (-71.14400 42.43173, -71.14273 42.4...",118,MACISAAC MARY A,"[MACISAAC MARY A, EVOS SOCRATES, GALANTE ALICE...",MACISAAC REALTY TRUST,"[MACISAAC REALTY TRUST, C/O ALICE PAPULLIS, BE...","MEDFORD, MA 02155","[MEDFORD, MA 02155]","[00027 MADISON ST, 00595 WINTHROP ST, 00126 FU..."


In [19]:
# to_file fails to write lists.
with open('test.geojson', 'w') as f:
    f.write(parcels_multi.to_json())