In [1]:
import pandas as pd
import geopandas as gpd
from sqlalchemy import create_engine
import numpy as np
import dedupe
import re
from dotenv import load_dotenv
import os

In [None]:
PG_CONNECT = os.getenv("PG_CONNECT")

In [2]:
# Residential Land Use Codes from MA Dept of Revenue
# https://www.mass.gov/files/documents/2016/08/wr/classificationcodebook.pdf
# Codes are 101*-109*, 031*, and 013*
# Often include suffixes (letters, zeroes or no character), thus regex *?
USE_CODES = '^1[0-1][1-9]*?|^013*?|^031*?'
BOS_CODES = '^R[1-4]$|^RC$|^RL$|^CD$|^A$'
def read_res(file_dict):
    df = pd.DataFrame()
    for town, file, in file_dict.items():
        town_df = gpd.read_file(file).drop('geometry', axis='columns')
        town_df['town'] = town
        df = df.append(town_df, ignore_index=True)
    return df

In [3]:
# Data from MassGIS Standardized Assessor's Parcels
# https://docs.digital.mass.gov/dataset/massgis-data-standardized-assessors-parcels
# Medford, Cambridge, and Somerville all last updated FY 2019

In [4]:
# Somerville, Medford processing.
files = {'som': './data/assess/som_assess.dbf', 
          'med': './data/assess/med_assess.dbf'}
som_med = read_res(files)
# Rename column to lower-case.
som_med.columns = som_med.columns.str.lower()
# Filter for residential parcels.
som_med = som_med[som_med['use_code'].str.contains(USE_CODES, regex=True)]
# Identify rows with co-owner names erroneously listed in address column.
mask = som_med.own_addr.str.contains(pat = '|'.join(['^C/O', '^[A-Za-z]']), na=False) & ~som_med.own_addr.str.contains(pat = '|'.join(['^PO', '^P.O.', '^P. O.', '^P O ', '^ONE', '^BOX', '^ZERO']), na=False)
# Add co-owners identified to co column.
som_med['co'] = som_med.own_addr[mask]
som_med.loc[~mask, 'co'] = None
# Fill own_addr with none for above-identified rows.
som_med.loc[mask, 'own_addr'] = None
# Remame columns
som_med = som_med.rename(columns = {
    'prop_id': 'gisid',
    'owner1': 'own',
    'site_addr': 'prop_addr'
})
som_med.loc[:,'prop_addr'] = som_med.prop_addr.str.strip()
# Replace underscores with hyphens.
som_med.loc[:,'gisid'] = som_med.gisid.str.replace(r'_', '-', regex=True)
# Concatenate address.
som_med.loc[:,'own_addr'] = [', '.join((str(a), str(b), str(c))) for a, b, c in zip(som_med['own_addr'], som_med['own_city'], som_med['own_state'])]
som_med.loc[:,'own_addr'] = [' '.join((str(a), str(b))) for a, b in zip(som_med['own_addr'], som_med['own_zip'])]
# Remove concatenated Nones.
som_med = som_med.replace({r'None, ': ''}, regex=True)
# Filter columns.
som_med = som_med[['gisid', 'town', 'prop_addr', 'own', 'co', 'own_addr']]

In [5]:
bos = pd.read_csv('./data/assess/bos_assess.csv', dtype={'GIS_ID': str, 'MAIL_ZIPCODE': str, 'U_TOT_RMS': str})
bos.columns = bos.columns.str.lower()
bos = bos.rename(columns = {
    'gis_id': 'gisid',
    'owner': 'own',
    'mail_addressee': 'co'
})
bos['town'] = 'bos'
# Filter by residential property types.
bos = bos[bos['lu'].str.contains(BOS_CODES, regex=True)]
bos.loc[:, 'gisid'] = bos.gisid.str.strip().str.pad(width=10, side='left', fillchar='0')
# Pad ZIP code with zeroes, remove 4-digit suffix.
bos.loc[:,'mail_zipcode'] = bos.mail_zipcode.astype(str).str.strip().str.pad(width=5, side='left', fillchar='0')
# Add comma between city and state.
bos.loc[:,'mail cs'] = bos['mail cs'].str.rsplit(' ', 1).apply(lambda x: ', '.join(x))
# Concatenate property address components
bos.loc[:,'prop_addr'] = [' '.join((str(a), str(b), str(c))) for a, b, c in zip(bos['st_num'], bos['st_name'], bos['st_name_suf'])]
bos.loc[:,'prop_addr'] = [' #'.join((str(a), str(b))) for a, b in zip(bos['prop_addr'], bos['unit_num'])]
bos.loc[:,'prop_addr'] = bos.prop_addr.str.strip()
# Concatenate owner address components.
bos.loc[:,'own_addr'] = [', '.join((str(a), str(b))) for a, b in zip(bos['mail_address'], bos['mail cs'])]
bos.loc[:,'own_addr'] = [' '.join((str(a), str(b))) for a, b in zip(bos['own_addr'], bos['mail_zipcode'])]
bos.loc[:,'own_addr'] = bos.own_addr.str.strip()
# Filter columns
bos = bos[['gisid', 'town', 'prop_addr', 'own', 'co', 'own_addr']]
# Replace blank strings with None (necessary for dedupe).
bos = bos.replace({' ': None, '': None, r' #nan': None})
bos = bos.replace({r' #nan': ''}, regex=True)

In [6]:
cam = pd.read_csv('./data/assess/cam_assess.csv', dtype=str)
# cam = cam.astype(str).replace(r'nan', '', regex=True)
# rename all columns to lowercase
cam.columns = cam.columns.str.lower()
# Filter for residential properties.
cam = cam[cam['stateclasscode'].str.contains(USE_CODES, regex=True)]
# Pad zip to five digits and remove 4-digit zip suffix.
cam.loc[:,'owner_zip'] = cam['owner_zip'].str.rsplit('-', 1).str[0]
# Identify rows with co-owner names erroneously listed in address column.
mask = cam.owner_address.str.contains(pat = '|'.join(['^C/O', '^ATTN:']), na=False)
cam.loc[mask, 'owner_address'] = None
# Add co-owners identified to co column.
cam.loc[mask, 'owner_coownername'] = [', '.join((str(a), str(b)))  for a, b in zip(cam.loc[mask, 'owner_coownername'], cam.loc[mask, 'owner_address'])]
# Concatenate owner address components
cam.loc[:,'own_addr'] = [', '.join((str(a), str(b), str(c), str(d))) for a, b, c, d in zip(cam['owner_address'], cam['owner_address2'], cam['owner_city'], cam['owner_state'])]
cam.loc[:,'own_addr'] = [' '.join((str(a), str(b))) for a, b in zip(cam['own_addr'], cam['owner_zip'])]
cam.loc[:,'own_addr'] = cam.own_addr.str.strip()
# Clean property address column
cam['prop_addr'] = cam['address'].str.rsplit('(', 1).apply(lambda x: x[0].replace('\n', ' ').strip())
cam['town'] = 'cam' 
cam = cam.rename(columns = {
    'owner_name': 'own',
    'owner_coownername': 'co'
})
cam = cam.replace({r'^, ': '', r' ,': '', r', nan': '', r'None, ': '', r', None': ''}, regex=True)
cam = cam.replace({' ': None, '': None, np.nan: None})
cam = cam[['gisid', 'town', 'prop_addr', 'own', 'co', 'own_addr']]

In [7]:
brook = pd.read_csv('./data/assess/brook_assess.csv', dtype=str)
brook.columns = brook.columns.str.lower()
brook = brook[brook['usecd'].str.contains(USE_CODES, regex=True)]

brook.loc[:,'zip'] = brook['zip'].str.rsplit('-', 1).str[0]
# Name town.
brook['town'] = 'brk' 
# Concatenate address.
brook.loc[:,'own_addr'] = [', '.join((str(a), str(b), str(c))) for a, b, c, in zip(brook['address'], brook['city'], brook['state'])]
# Append zip to address with no comma.
brook.loc[:,'own_addr'] = [' '.join((str(a), str(b))) for a, b in zip(brook['own_addr'], brook['zip'])]
brook.loc[:,'own_addr'] = brook.own_addr.str.strip()
# Concatenate property address components
brook.loc[:,'prop_addr'] = [''.join((str(a), str(b))) for a, b in zip(brook['addno1'], brook['addno2'])]
brook.loc[:,'prop_addr'] = [' '.join((str(a), str(b))) for a, b in zip(brook['prop_addr'], brook['addst1'])]
brook.loc[:,'prop_addr'] = [' '.join((str(a), str(b))) for a, b in zip(brook['prop_addr'], brook['addst2'])]
brook.loc[:,'prop_addr'] = brook.prop_addr.str.strip()
# Append 
brook.loc[:,'own'] = [' '.join((str(a), str(b))) for a, b in zip(brook['firstname1'], brook['lastname1'])]
brook.loc[:,'co'] = [' '.join((str(a), str(b))) for a, b in zip(brook['firstname2'], brook['lastname2'])]
brook = brook.replace({' ': None, '': None})
brook = brook.rename(columns = {
    'parcel-id': 'gisid'
})
brook = brook.replace({r'^, ': '', r' ,': '', r', nan': '', r'nan': '', r'None, ': '', r', None': ''}, regex=True)
brook = brook.replace({' ': None, '': None})
brook = brook[['gisid', 'town', 'prop_addr', 'own', 'co', 'own_addr']]

In [8]:
all_assess = pd.concat([som_med, cam, bos, brook], ignore_index=True)
all_assess.loc[:,'prop_addr'] = all_assess.prop_addr.str.lstrip('0').str.strip()
all_assess.loc[:,'own_addr'] = all_assess.own_addr.str.lstrip('0').str.strip()
all_assess = all_assess.replace({r'None': '', 'nan': ''}, regex=True)
all_assess = all_assess.replace({' ': None, '': None})
all_assess = all_assess[~all_assess.own_addr.str.contains('|'.join(['^BOSTON, MA', '^MEDFORD, MA', '^SOMERVILLE, MA']), regex=True, na=False)]
all_assess = all_assess[~pd.isnull(all_assess['gisid'])]

In [9]:
# Convert to dictionary (expected by Dedupe)
all_assess_dict = all_assess.to_dict('index')

In [10]:
settings_file = './training/learned_settings'
training_file = './training/training.json'

# If settings exist, read from existing.
if os.path.exists(settings_file):
    print('Reading learned settings from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # Tell Dedupe which fields are used to identify duplicates.
    fields = [
        {'field': 'own', 'variable name': 'own', 'type': 'Name'},
        {'field': 'co', 'variable name': 'co', 'type': 'Name'},
        {'field': 'own_addr', 'variable name': 'own_addr', 'type': 'Address'},
        {'type': 'Interaction', 'interaction variables': ['own', 'co']}
        ]
    deduper = dedupe.Dedupe(fields)
    # If training file exists, read it...
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(all_assess_dict, f)
    # Otherwise, prepare a training set...
    else:
        deduper.prepare_training(all_assess_dict)
    # Start supervised labeling.
    dedupe.console_label(deduper)
    deduper.train()
    # Write settings and training sets.
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)
    with open(settings_file, 'wb') as sf:
        deduper.write_settings(sf)

INFO:dedupe.api:((SimplePredicate: (oneGramFingerprint, own_addr), SimplePredicate: (sameThreeCharStartPredicate, own_addr)), (PartialPredicate: (commonTwoTokens, own, CorporationName), PartialPredicate: (twoGramFingerprint, own, CorporationName)), (PartialPredicate: (commonTwoTokens, co, CorporationName), SimplePredicate: (commonThreeTokens, own_addr)), (PartialPredicate: (commonFourGram, own, Surname), SimplePredicate: (twoGramFingerprint, own)))


Reading learned settings from ./training/learned_settings


In [11]:
# Identify clusters based on training dataset.
# Higher threshold is less tolerant of differences between names/addresses.
clustered_dupes = deduper.partition(all_assess_dict, threshold = 0.7)

# How many sets are there?
print('Number of sets', len(clustered_dupes))

INFO:dedupe.blocking:10000, 3.9768902 seconds
INFO:dedupe.blocking:20000, 7.8477522 seconds
INFO:dedupe.blocking:30000, 11.4433792 seconds
INFO:dedupe.blocking:40000, 16.3532982 seconds
INFO:dedupe.blocking:50000, 21.4108232 seconds
INFO:dedupe.blocking:60000, 25.9323602 seconds
INFO:dedupe.blocking:70000, 29.8725542 seconds
INFO:dedupe.blocking:80000, 34.1600332 seconds
INFO:dedupe.blocking:90000, 38.4827952 seconds
INFO:dedupe.blocking:100000, 42.6273742 seconds
INFO:dedupe.blocking:110000, 46.5384842 seconds
INFO:dedupe.blocking:120000, 50.2511372 seconds
INFO:dedupe.blocking:130000, 53.7834232 seconds
INFO:dedupe.blocking:140000, 57.4280022 seconds
INFO:dedupe.blocking:150000, 61.1980012 seconds
INFO:dedupe.blocking:160000, 65.0863892 seconds
INFO:dedupe.blocking:170000, 69.1169082 seconds
INFO:dedupe.blocking:180000, 73.1619042 seconds
INFO:dedupe.blocking:190000, 76.8786992 seconds
INFO:dedupe.blocking:200000, 81.0180292 seconds
INFO:dedupe.blocking:210000, 85.3040122 seconds


Number of sets 174932


In [12]:
# Create empty arrays to hold results.
rid = []
clst = []
conf = []
count = []

# Iterate over results...
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        # Append record id
        # Corresponds to index of assessor dataframe.
        rid.append(record_id)
        # Append cluster ID.
        clst.append(cluster_id)
        # Append confidence score.
        conf.append(score)

# Build new dataframe using result arrays.
clust = pd.DataFrame(list(zip(clst, conf)), 
                  columns =['clst', 'conf'],
                  index = rid
                 )

In [13]:
# Join clusters to assessors dataframe.
all_assess = all_assess.join(clust)

In [14]:
# Read spatial data
parcels_gdf = gpd.read_file('./data/parcels/mamas_parcels.shp')
parcels_gdf = parcels_gdf.rename(columns = {
    'pid': 'gisid'
}).drop_duplicates(subset=['gisid', 'town'])
parcels_gdf = parcels_gdf[~pd.isnull(parcels_gdf['gisid'])]
parcels_gdf = parcels_gdf[~pd.isnull(parcels_gdf['geometry'])]
parcels_gdf.loc[:,'geometry'] = parcels_gdf.geometry.centroid
parcels_gdf.loc[:,'lat'] = parcels_gdf.geometry.y
parcels_gdf.loc[:,'lon'] = parcels_gdf.geometry.x


  parcels_gdf.loc[:,'geometry'] = parcels_gdf.geometry.centroid


In [15]:
all_assess = parcels_gdf.merge(all_assess, on=['town', 'gisid'], how='right')
all_assess = all_assess[~np.isnan(all_assess.lat)]

In [17]:
pg_engine = create_engine(PG_CONNECT)
all_assess.to_postgis("props", con=pg_engine, schema='public', if_exists='fail', index=True, index_label='id')