In [3]:
import pandas as pd
import geopandas as gpd

In [59]:
# Residential Land Use Codes from MA Dept of Revenue
# https://www.mass.gov/files/documents/2016/08/wr/classificationcodebook.pdf
# Codes are 101*-109*, 031*, and 013*
# Often include suffixes (letters, zeroes or no character), thus regex *?
USE_CODES = '^1[0-1][1-2]*?|^013*?|^031*?'
def read_res(file_list, uses = USE_CODES):
    df = pd.DataFrame()
    for file in file_list:
        df = df.append(gpd.read_file(file), ignore_index=True)
    df = df[[
        'PROP_ID',
        'OWNER1',
        'OWN_ADDR',
        'OWN_CITY',
        'OWN_STATE',
        'OWN_ZIP',
        'OWN_CO',
        'USE_CODE',
        'CITY',
        'FY']]
    df = df.rename({
        'PROP_ID': 'pid',
        'OWNER1': 'own_name', 
        'OWN_ADDR': 'own_add', 
        'OWN_CITY': 'own_city', 
        'OWN_STATE': 'own_state',
        'OWN_ZIP': 'own_zip', 
        'OWN_CO': 'own_country',
        'CITY': 'city',
        'USE_CODE': 'use',
        'FY': 'year'}, 
        axis='columns')
    df = df[df['use'].str.contains(uses, regex=True)]
    return df

In [85]:
# Data from MassGIS Standardized Assessor's Parcels
# https://docs.digital.mass.gov/dataset/massgis-data-standardized-assessors-parcels
# Medford, Cambridge, and Somerville all last updated FY 2019
files = ['som_assess.dbf', 'cam_assess.dbf', 'med_assess.dbf']
df = read_res(files)

In [86]:
import re
from unidecode import unidecode
replace_list = ['FAMILY', 'IRREVOCABLE', 'NOMINEE', 'REVOCABLE', 
                'REALTY', 'REAL ESTATE', 'TRUSTEES OF', 'TRUSTEE OF', 
                'TRUSTEE', 'TRST', 'TRUST', 'LTD',
                'LLC', 'HOLDINGS', 'REALTORS', 'LIMITED PARTNERSHIP', 
                'FOR LIFE', 'ESTATE OF', 'ESTATE', 'TR.']
def clean(c):
    c = c.replace('|'.join(map(re.escape, replace_list)), '', regex=True)
    return c

df['own_name_clean'] = clean(df['own_name'])
df.head()

Unnamed: 0,pid,own_name,own_add,own_city,own_state,own_zip,own_country,use,city,year,own_name_clean
16,102_A_1,DEIRMENJIAN REAL ESTATE LLC,6 WINCHESTER ST,MEDFORD,MA,2155,,1110,SOMERVILLE,2019,DEIRMENJIAN
17,102_A_2,FALCETANO WILLIAM TRUSTEE,337 SOMERVILLE AVE 2ND FLOOR,SOMERVILLE,MA,2143,USA,1110,SOMERVILLE,2019,FALCETANO WILLIAM
18,102_A_3,GALLUZZO ELLEN FOR LIFE,9 AUSTIN ST,SOMERVILLE,MA,2145,USA,1040,SOMERVILLE,2019,GALLUZZO ELLEN
19,102_A_4,SOUSA JOSE C & MATILDE,23 BENEDICT ST,SOMERVILLE,MA,2145,USA,1050,SOMERVILLE,2019,SOUSA JOSE C & MATILDE
20,102_B_1,VALENCIA MILAGRO TRUSTEE,18 BENEDICT ST,SOMERVILLE,MA,2145,,1050,SOMERVILLE,2019,VALENCIA MILAGRO


In [72]:
import dedupe

# settings_file = 'csv_example_learned_settings'
# training_file = 'csv_example_training.json'

fields = [
    {'field': 'own_name_clean', 'type': 'String'},
#     {'field': 'Phone', 'type': 'String', 'has missing': True},
    ]

df_dict = df.to_dict('index')

deduper = dedupe.Dedupe(fields)

deduper.prepare_training(df_dict)

INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, own_name_clean), SimplePredicate: (twoGramFingerprint, own_name_clean))


In [73]:
dedupe.console_label(deduper)

own_name_clean : NORCROSS BOYD A & MARY J EE

own_name_clean : NORCROSS BOYD A & MARY J E

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


y


own_name_clean : LENNON CATHERINE EE

own_name_clean : LENNON CATHERINE 

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameSevenCharStartPredicate, own_name_clean), SimplePredicate: (sortedAcronym, own_name_clean))
own_name_clean : TOURO INVESTMENTS #9 

own_name_clean : TOURO INVESTMENTS 

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (oneGramFingerprint, own_name_clean), SimplePredicate: (sameFiveCharStartPredicate, own_name_clean))
own_name_clean : NISSENBAUM ALAN JAY

own_name_clean : NISSENBAUM ALAN J

3/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, own_name_clean), SimplePredicate: (sameSevenCharStartPredicate, own_name_clean))
own_name_clean : PRESIDENT & FELLOWS OF HARVARD

own_name_clean : PRESIDENT & FELLOWS OF HARVARD COLLEGE

4/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : MCFADDEN FRANK & BRIDGEEN

own_name_clean : MCFADDEN FRANK & BRIDGEEN M EES

5/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, own_name_clean), SimplePredicate: (firstTokenPredicate, own_name_clean))
own_name_clean : DKA  

own_name_clean : DKA  

6/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : CRAIGIE 

own_name_clean : CRAIGIE 

7/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, own_name_clean), SimplePredicate: (sameSevenCharStartPredicate, own_name_clean))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, own_name_clean), SimplePredicate: (firstTokenPredicate, own_name_clean))
own_name_clean : GUIGLI, MICHAEL

own_name_clean : GUIGLI, MICHAEL S.

8/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : 53 CURTIS AVE 

own_name_clean : 53 CURTIS AVENUE 

9/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, own_name_clean), SimplePredicate: (firstTokenPredicate, own_name_clean))
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, own_name_clean), SimplePredicate: (twoGramFingerprint, own_name_clean))
own_name_clean : DIBENEDETTO MARIE 

own_name_clean : DIBENEDETTO MARIO A

10/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : WAHN SCHAFFT OLIVER M

own_name_clean : WAHNSCHAFFT OLIVER M

10/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : DIMASCIO IDA

own_name_clean : DIMASCIO  

11/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, own_name_clean), SimplePredicate: (sameSevenCharStartPredicate, own_name_clean))
INFO:dedupe.training:(SimplePredicate: (sameThreeCharStartPredicate, own_name_clean), SimplePredicate: (twoGramFingerprint, own_name_clean))
own_name_clean : LIN FENG

own_name_clean : LIN PENG & CAI MI

11/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : COSTAS PATRICIA A

own_name_clean : COSTA  

12/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, own_name_clean), SimplePredicate: (sameThreeCharStartPredicate, own_name_clean))
INFO:dedupe.training:(SimplePredicate: (sameThreeCharStartPredicate, own_name_clean), SimplePredicate: (twoGramFingerprint, own_name_clean))
own_name_clean : ANDERSON, GEORGE MCCULLOUGH IV &

own_name_clean : ANDERS, GEORGE M.

12/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : 1008 MASSACHUSETTS AVENUE .

own_name_clean : 1979 MASSACHUSETTS AVENUE, 

13/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameThreeCharStartPredicate, own_name_clean), SimplePredicate: (tokenFieldPredicate, own_name_clean))
own_name_clean : 1008 MASSACHUSETTS AVENUE .

own_name_clean : 1979 MASSACHUSETTS AVENUE, 

13/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : CHOU YEE

own_name_clean : CHO  

13/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : MCCOY PROPERTIES 

own_name_clean : MAC PROPERTIES 

13/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : FERU FREDERIC

own_name_clean : LEVY, FREDERICK R. & JUDITH PERLSTEIN

13/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : YANG MO

own_name_clean : YANG, MO

13/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : MCLAUGHLIN, KEVIN

own_name_clean : MCLAUGHLIN KEVIN J

14/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : ROBERTS, MARTIN

own_name_clean : ROBERTS, KATHERINE D. , JOHN & ALISON

15/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : KORDA CHRISTOPHER V

own_name_clean : KEILY CHRISTOPHER J & ELISE M

15/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : HARRINGTON KAREN 

own_name_clean : HARRINGTON, DANIEL T. & LINDA TARANTINO

15/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : MIRZA MUSHTAQUE ALI KHAN EE

own_name_clean : MIRZA, MUSHTAQUE ALIKHAN, 

15/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : SILVA  2 

own_name_clean : SILVA FERNANDO & MARIA

16/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


own_name_clean : GRAHAM, WILLIAM A. & BARBARA S. GRAHAM

own_name_clean : GRAHAM LINDA

16/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


own_name_clean : MURPHY WILLIAM

own_name_clean : MURPHY LIAM

16/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : CAMBRIDGE AFFORDABLE HOUSING CORP

own_name_clean : CAMBRIDGE DAWN 

16/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : MILLER CHRISTOPHER & SAMANTHA ROSS MILLE

own_name_clean : MILLER JOAN M

16/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : SOUSA LAURA 

own_name_clean : SOUSA, MARIA I. & MANUEL R. SOUSA

16/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : KIERCE   

own_name_clean : KIERCE PATRICK J JR

16/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


own_name_clean : LIU, HONG & TIEMAE ROQUERRE

own_name_clean : LIU, HONG & TIEQUIN ROQUERRE

16/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : CARREIRO LAURA

own_name_clean : CARREIRO GIL

17/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : YOGEL,  DAVID M. 

own_name_clean : YOGEL, DAVID M  OF

17/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : ANDERSON ERIK A

own_name_clean : ANDERSON, THOMAS D. & EVELYN G. ANDERSON

18/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : LIN, YI

own_name_clean : LIN, MARIA

18/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : GALLAGHER HOLLY AVE 

own_name_clean : GALLAGHER COREY ANN

18/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : 138 ANTRIM STREET 

own_name_clean : 138 WINTHROP STREET CONDOMINIUM

18/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : FERNANDEZ LUIS

own_name_clean : FERNANDEZ STEVEN

18/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : ALVES JOHN P & MARIA M S

own_name_clean : ALVES JOSEPH M

18/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : CHAVES JORGE 

own_name_clean : CHAVES EDUARDO J & ERMELINDA L

18/10 positive, 20/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : JOHNSON ENA G

own_name_clean : JOHNSON, NATASHA

18/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : PAPALUCA PATRICIA M 

own_name_clean : PAPPAS PATRICIA M

18/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : TOLMAN, WARREN E. & CAROLYN B. TOLMAN

own_name_clean : TOLMAN, CAROLYN B. & WARREN E. TOLMAN

18/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


own_name_clean : MARTINS, ALFREDO D.

own_name_clean : MARTINS, ALFRED JR.

19/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


own_name_clean : HARDING, DONALD &  SHIRLEY A HARDING

own_name_clean : HARDING GLENN S

19/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : MCCARTHY MAURA

own_name_clean : MCCARTHY CHRISTOPHER

19/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : THE ROSANNE IWANICKI LIVING 

own_name_clean : THE SUN 1 

19/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : BELL, STEPHEN A. & AMAYA O. BELL

own_name_clean : BELL PEGGY

19/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : LACROIX DAVID W

own_name_clean : LACROIX, DIDIER & AGNES LACROIX

19/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : SMITH DAVID M

own_name_clean : SMITH, ALEXANDRA G. & STEPHEN R. ODOM

19/10 positive, 28/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : DOMINIQUE PATTIN LIVING 

own_name_clean : DOMINIQUE JEAN W

19/10 positive, 29/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : HUANG, JING FENG & JING YAO ZHANG

own_name_clean : HUANG, CHANGLIN

19/10 positive, 30/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : GILMAN, ARTHUR L., 

own_name_clean : GILMAN, RICHARD C.,  OF

19/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : WANG, KAIYAN

own_name_clean : WANG, KATHRYN

19/10 positive, 32/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : CHOW, SHEUNG H., HUI-LING CHOW,

own_name_clean : CHOW WENDY

19/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : DYER, RICHARD

own_name_clean : DYER, GEORGE & MARTHA DYER

19/10 positive, 34/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : YOGEL, DAVID M. 

own_name_clean : YOGEL, ERIC 

19/10 positive, 35/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : DOHERTY PETER & ABRAHAMS JODI N

own_name_clean : DOHERTY JOHN J JR

19/10 positive, 36/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : HILL, CAROL

own_name_clean : HILL, CLAUDIA

19/10 positive, 37/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : HARPER, G. NEIL & ANNE YOST HARPER

own_name_clean : HARPER, ROBIN D.

19/10 positive, 38/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : SINGH, GEETA & DEEPAK SINGH

own_name_clean : SINGH, MOHAN, 

19/10 positive, 39/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : LAMBERT, PETER J.& YUYING XIE

own_name_clean : LAMBERT, DEBRA BIBA

19/10 positive, 40/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : THE CATHERINE T LAHIFF 

own_name_clean : THE SHINE  

19/10 positive, 41/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : RHOADS VICTOR

own_name_clean : RHOADS, JOHN G. & SUSAN S. BEAN

19/10 positive, 42/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : MCSHEFFERY, JAMES, &

own_name_clean : MCSHEFFERY LAWRENCE & CHRISTINA

19/10 positive, 43/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : SULLIVAN MICHAEL A & DIANNE D

own_name_clean : SULLIVAN RICHARD C & ROSALIE C

19/10 positive, 44/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : OLEARY, SUSAN

own_name_clean : OLEARY, ROBERT &

19/10 positive, 45/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : HOFFMAN, ANNE

own_name_clean : HOFFMAN ALFRED J

19/10 positive, 46/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : OBRIEN, KATHERINE T. & DANIEL E. O'BRIE

own_name_clean : OBRIEN ROBERT LEO

19/10 positive, 47/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : HARRIS MARY 

own_name_clean : HARRIS, JOY

19/10 positive, 48/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : CHEN, LINAN

own_name_clean : CHEN WILLIAM K & HENRY G

19/10 positive, 49/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


own_name_clean : THE 60 FOUNTAIN STREET  

own_name_clean : THE SUN 1 

19/10 positive, 50/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [74]:
deduper.train()
with open('csv_example_training.json', 'w') as tf:
    deduper.write_training(tf)
with open('settings', 'wb') as sf:
    deduper.write_settings(sf)

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.001000, score 0.763677932268224
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, own_name_clean), SimplePredicate: (tokenFieldPredicate, own_name_clean))


In [75]:
print('Clustering...')
clustered_dupes = deduper.partition(df_dict, 0.5)
print('# duplicate sets', len(clustered_dupes))

Clustering...


INFO:dedupe.blocking:10000, 0.1782162 seconds
INFO:dedupe.blocking:20000, 0.3309222 seconds
INFO:dedupe.blocking:30000, 0.4865912 seconds
INFO:dedupe.blocking:40000, 0.6323362 seconds
INFO:dedupe.blocking:50000, 0.7865862 seconds


# duplicate sets 47133


In [87]:
rid = []
clst = []
conf = []
count = []
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        count.append(len(records))
        rid.append(record_id)
        clst.append(cluster_id)
        conf.append(score)
        
clust = pd.DataFrame(list(zip(clst, conf, count)), 
                  columns =['clst', 'conf', 'count'],
                  index = rid
                 )
df = df.join(clust)

In [82]:
df[df['own_name']  5]

Unnamed: 0,pid,own_name,own_add,own_city,own_state,own_zip,own_country,use,city,year,own_name_clean,clst,conf,count
149,103_C_20,VENEZIANO JAMES M,PO BOX 556,SOMERVILLE,MA,02143,USA,1050,SOMERVILLE,2019,VENEZIANO JAMES M,159,0.977959,9
150,103_C_21,VENEZIANO JAMES M TRUSTEE,PO BOX 556,SOMERVILLE,MA,02143,USA,1110,SOMERVILLE,2019,VENEZIANO JAMES M,159,0.980873,9
151,103_C_22,VENEZIANO JAMES TRUSTEE,PO BOX 556,SOMERVILLE,MA,02143,USA,1050,SOMERVILLE,2019,VENEZIANO JAMES,159,0.976838,9
158,103_C_26,VENEZIANO JAMES M TRUSTEE,PO BOX 556,SOMERVILLE,MA,02143,USA,1110,SOMERVILLE,2019,VENEZIANO JAMES M,159,0.980873,9
234,103_G_13,DIRENZO R & IOLANDA & MCKENZIE D TRUSTEE,17 ROYALL ST,MEDFORD,MA,02155,,1110,SOMERVILLE,2019,DIRENZO R & IOLANDA & MCKENZIE D,242,0.982204,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63975,X-10_128,BARROS JOSE D & MARIA,252 HIGH ST,MEDFORD,MA,02155,USA,1040,MEDFORD,2019,BARROS JOSE D & MARIA,197,0.687651,6
64070,X-10_96,SANTOS MANUEL C,20 HIGGINS AVE,MEDFORD,MA,02155,USA,1040,MEDFORD,2019,SANTOS MANUEL C,42,0.915227,6
64272,Y-10_23,LACOURT FOUNDATION LLC,30 COLLEGE AVE,SOMERVILLE,MA,02144,USA,1040,MEDFORD,2019,LACOURT FOUNDATION,1254,0.976714,20
64324,Y-10_59,MAHER DAVID F,962 BROADWAY,SOMERVILLE,MA,02144,USA,1040,MEDFORD,2019,MAHER DAVID F,1202,0.825011,9


In [91]:
df['pid'] = df['pid'].replace('_', '-', regex=True)
df.set_index('pid')
df.head()
df.to_csv('parcels_clustered.csv')