In [1]:
import pandas as pd
import numpy as np

import gc, math, numbers, operator, os, re, time
from collections import defaultdict, Counter
from statistics import stdev

from fuzzywuzzy import fuzz
from jellyfish import jaro_winkler, metaphone
from py_common_subseq import find_common_subsequences

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import networkx as nx

def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

  from numpy.core.umath_tests import inner1d


In [2]:
#load new training records here

new_training_filepath = 'data/Training Data Patcher/NCMA training examples.csv'
file_utf = new_training_filepath[:-(len(new_training_filepath)-(new_training_filepath.find('.',len(new_training_filepath)-6)))] + '_utf' + new_training_filepath[-(len(new_training_filepath)-(new_training_filepath.find('.',len(new_training_filepath)-6))):]
file_utf

import io
import shutil

with io.open(new_training_filepath, encoding='utf-8', errors='ignore') as source:
    with io.open(file_utf, mode='w', encoding='utf-8') as target:
        shutil.copyfileobj(source,target)
        
new_training = pd.read_csv(file_utf,dtype=object,converters={'zip_l': lambda x: str(x),'zip_r': lambda x: str(x)}
                         #,nrows=1000
                         )
'''
#could use this code in case we need to add a feature to training data, but it's not needed otherwise given intersection()
org_training_cols = ['id','id_l','id_r','id_l count', 'is_match']
#in case we add any other features to training data, add them to below list
for col in ['org_name','city','state','zip','web','phone','email']
    org_training_cols.append:
        col + '_l'
        col + '_r'

person_training_cols = ['id','id_l','id_r','org_person_id_l','org_person_id_r', 'is_match']
#in case we add any other features to training data, add them to below list
for col in ['prefix','fname','mname','lname','suffix','gender','org_name','position','city','state','zip','email','phone','web']:
    person_training_cols.append:
        col + '_l'
        col + '_r'
'''

#determine if incoming data is for organization matching or person matching
if 'fname_l' in new_training:
    organization_only = False
else:
    organization_only = True
        
if organization_only:
    current_training_records = pd.read_csv('org_training_featureless.csv',dtype=object,converters={'zip_l': lambda x: str(x),'zip_r': lambda x: str(x)})
    #combine current training records with new training records    
    featureless = pd.concat([current_training_records,new_training[intersection(list(current_training_records.columns),list(new_training.columns))]],sort=False)
    featureless.drop_duplicates(inplace=True) #make sure we haven't added multiple rows for the same two records
    featureless.to_csv('org_training_featureless.csv',index=False) #save a new set of featureless for the next time
else:
    with io.open('person_training_featureless.csv', encoding='utf-8', errors='ignore') as source:
        with io.open('person_training_featureless_utf.csv', mode='w', encoding='utf-8') as target:
            shutil.copyfileobj(source,target)
    current_training_records = pd.read_csv('person_training_featureless_utf.csv',dtype=object,converters={'zip_l': lambda x: str(x),'zip_r': lambda x: str(x)})
    #combine current training records with new training records    
    featureless = pd.concat([current_training_records,new_training[intersection(list(current_training_records.columns),list(new_training.columns))]],sort=False)
    featureless.drop_duplicates(inplace=True) #make sure we haven't added multiple rows for the same two records
    featureless.to_csv('person_training_featureless.csv',index=False) #save a new set of featureless for the next time

  
  


In [3]:
#run to load featureless data and not add any new records, used for tweaking the features
try:
    new_training
    new=True
except:
    import io
    import shutil
    #I think this file needs to be updated to reference the current person_training_featureless
    with io.open('person_training_featureless_reduced.csv', encoding='utf-8', errors='ignore') as source:
        with io.open('person_training_featureless_utf.csv', mode='w', encoding='utf-8') as target:
            shutil.copyfileobj(source,target)
    featureless = pd.read_csv('person_training_featureless_utf.csv',dtype=object,converters={'zip_l': lambda x: str(x),'zip_r': lambda x: str(x)})
    new=False
    organization_only = False

In [4]:
if new:
    intersection(list(new_training.columns),list(current_training_records.columns))

In [5]:
featureless.head()

Unnamed: 0,id,id_l,org_person_id_l,id_r,org_person_id_r,is_match,prefix_l,fname_l,mname_l,lname_l,...,gender_r,org_name_r,position_r,city_r,state_r,zip_r,email_r,phone_r,web_l,web_r
0,874427,106963,96887,216622,209040,1,,nathalie,,leplat,...,F,ca applicants' attorney association,"Administrative Assistant, Hotel Contact and CA...",sacramento,ca,95814.0,nathalie@caaa.org,9164445155.0,,
1,1404715,206379,198238,254014,520751,1,,karianne,,fallow,...,F,dairy west,Chief Executive Officer,meridian,id,83642.0,kfallow@dairywest.com,2083277050.0,,
2,94393,277502,358545,328968,322757,1,,amy,,latessa,...,,american farm bureau federation,"Director, Show",washington,dc,20024.0,amy.latessa@ideaggroup.com,8156213254.0,,
3,1419406,288744,281663,288745,281664,1,,james m.,,gates,...,,ny on shrine association,Treasurer,,,,jmgates46@twcny.rr.com,,,
4,409311,278524,306589,357430,271213,1,,evan,,lynch,...,M,international clarinet association,Interim Executive Director,columbus,oh,43214.0,evanlynchica@gmail.com,8889835441.0,,


In [6]:
featureless.shape

(99871, 34)

In [7]:
#Preprocessing items that need to happen before we can do scoring

featureless = featureless.dropna(how='all')                    # Drop all empty rows
featureless = featureless.fillna('')                           # Make any NA an empty string
featureless = featureless.applymap(str.lower)                  # Lowercase all fields
featureless = featureless.applymap(str.strip)                  # Strip whitespace from all fields

"""
Prepare the places lookup table

"""
# Get and lowercase the state codes data
df_states = pd.read_csv('data/state_lkup.csv', keep_default_na=False)
df_states = df_states.applymap(str.lower)
df_states = df_states.set_index("state")

# Get and lowercase the country codes data
df_countries = pd.read_csv('data/country codes.csv', keep_default_na=False)
df_countries = df_countries.applymap(str.lower)
df_countries = df_countries.set_index("COUNTRY")

place_acronyms = defaultdict(str,
                             {**df_states["acronym"].to_dict(),
                              **df_countries["ISO2"].to_dict()})

for col in ['state_l','state_r', 'org_name_l','org_name_r']:
    featureless[col] = featureless[col].replace(regex=place_acronyms)
    
## Patch in nicknames
if 'fname_l' in featureless:
    # Load in nicknames
    df_nicks = pd.read_csv('data/nicknames.csv', sep='\n', header=None, names=["names"])
    df_nicks["names"] = df_nicks["names"].str.lower().str.split(',')
    nicks = defaultdict(set)  # A dictionary with a default value of an empty set
    df_nicks.apply(lambda row: list(map(lambda name: nicks[name].add(row.name), row["names"])),
                   axis = "columns")
    
    featureless['nicks_groups_l'] = featureless["fname_l"].apply(lambda n: " ".join(map(lambda grp: "nick" + str(grp), nicks[n])))
    featureless['nicks_groups_r'] = featureless["fname_r"].apply(lambda n: " ".join(map(lambda grp: "nick" + str(grp), nicks[n])))

    #adding full name cols
    featureless['full_name_l'] = featureless['fname_l'] + ' ' + featureless['lname_l']
    featureless['full_name_r'] = featureless['fname_r'] + ' ' + featureless['lname_r']

#adding clean phone cols
if 'clean_phone_l' not in featureless:
    featureless['clean_phone_l'] = featureless['phone_l'].replace('[^0-9]', '',regex=True)
    featureless['clean_phone_r'] = featureless['phone_r'].replace('[^0-9]', '',regex=True)

#adding before email & domain cols
personal_domains = pd.read_csv('data/personal email domains.csv')['domain']

if 'email_l' in featureless:
    featureless[['before_domain_l', 'domain_l']] = featureless['email_l'].str.split('@', expand=True)[[0, 1]]
    featureless['domain_l'] = featureless['domain_l'].map(lambda d: '' if d in personal_domains else d)
    
    featureless[['before_domain_r', 'domain_r']] = featureless['email_r'].str.split('@', expand=True)[[0, 1]]
    featureless['domain_r'] = featureless['domain_r'].map(lambda d: '' if d in personal_domains else d)

if 'web_l' in featureless and organization_only == True:
    for col in ['web_l','web_r']:   
        domains = []
        for web in featureless[col]:
            if '@' in web: #handle domain extraction in case we want to use an email address
                domain = web.split('@')[-1]
                if domain in list(personal_domains): 
                    domains.append('')
                else:
                    domains.append(domain)
            elif '/' in web or 'www' in web: #assuming we have a URL
                domains.append(web.split('//')[-1].split('/')[0].strip('www.'))
            elif '.' in web: #handling cases where URL is already a domain
                domains.append(web)
            else:
                domains.append('')
        col_name = 'domain_' + col[-1]
        if col_name in list(featureless.columns): #handle if we already have domain column from emails
            complete_domains = []
            domain_df = pd.DataFrame(domains,columns=['website_domain'])
            domain_df['email_domain'] = featureless[col_name]
            for index, row in domain_df.iterrows():
                if len(row['website_domain']) > 0:
                    complete_domains.append(row['website_domain']) #prefer website domain
                else:
                    complete_domains.append(row['email_domain'])
            featureless[col_name] = complete_domains
        else: #in case there is no email column, then we take all domains from website
            featureless[col_name] = domains 

In [8]:
#scoring match candidates based on edit distance of org names
def score_compare(left, right, method):
    """
    Given two strings, returns the comparison score based on the specified methodology.
    """
    if len(left) > 0 and len(right) > 0:
        if (method == "jaro"): return jaro_winkler(left, right)
        if (method == "fuzz-partial"): return fuzz.partial_ratio(left, right) / 100
        if (method == "fuzz-sort"): return fuzz.token_sort_ratio(left, right) / 100
        if (method == "fuzz-set"): return fuzz.token_set_ratio(left, right) / 100
        raise ValueError("Method {} unknown; " \
                         "must be 'jaro', 'fuzz-partial', 'fuzz-sort' or 'fuzz-set'".format(method))
    else: return np.nan

def get_field(data, field, left_suffix='_l', right_suffix='_r'):
    """
    For a list of series, this function returns a list of fields from those series.
    So field_of('name', [row_bob, row_joe]) might return ["Bob", "Joe"].
    """
    return (data[field + left_suffix], data[field + right_suffix])

def org_name_score(df):
    jaro_time = time.time()
    df['jaro_score'] = df.apply(lambda row:
        score_compare(*get_field(row, 'org_name'), 'jaro'),
        axis="columns")
    print("\tjaro scores done --- %s seconds ---" % (time.time() - jaro_time))
    
    # consider skipping these; they have no major effect on outcomes
    '''
    partial_time = time.time()
    df['fuzz_partial_score'] = df.apply(lambda row:
        score_compare(*get_field(row, 'org_name'), 'fuzz-partial'),
        axis="columns")
    print("\tfuzz partial scores done --- %s seconds ---" % (time.time() - partial_time))

    sort_time = time.time()
    df['fuzz_sort_score'] = df.apply(lambda row:
        score_compare(*get_field(row, 'org_name'), 'fuzz-sort'),
        axis="columns")
    print("\tfuzz sort scores done --- %s seconds ---" % (time.time() - sort_time))

    set_time = time.time()
    df['fuzz_set_score'] = df.apply(lambda row:
        score_compare(*get_field(row, 'org_name'), 'fuzz-set'),
        axis="columns")
    print("\tfuzz set scores done --- %s seconds ---" % (time.time() - set_time))
    '''

    return df

org_name_score(featureless)

	jaro scores done --- 11.462361097335815 seconds ---


Unnamed: 0,id,id_l,org_person_id_l,id_r,org_person_id_r,is_match,prefix_l,fname_l,mname_l,lname_l,...,nicks_groups_r,full_name_l,full_name_r,clean_phone_l,clean_phone_r,before_domain_l,domain_l,before_domain_r,domain_r,jaro_score
0,874427,106963,96887,216622,209040,1,,nathalie,,leplat,...,,nathalie leplat,nathalie leplat,,9164445155,nathalie,caaa.org,nathalie,caaa.org,1.000000
1,1404715,206379,198238,254014,520751,1,,karianne,,fallow,...,,karianne fallow,karianne fallow,,2083277050,kfallow,udidaho.org,kfallow,dairywest.com,0.560317
2,94393,277502,358545,328968,322757,1,,amy,,latessa,...,nick64,amy latessa,amy latessa,,8156213254,amy.latessa,ideaggroup.com,amy.latessa,ideaggroup.com,1.000000
3,1419406,288744,281663,288745,281664,1,,james m.,,gates,...,,james m. gates,james m. gates,,,jmgates46,twcny.rr.com,jmgates46,twcny.rr.com,1.000000
4,409311,278524,306589,357430,271213,1,,evan,,lynch,...,nick390,evan lynch,evan lynch,,8889835441,evanlynchica,gmail.com,evanlynchica,gmail.com,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99866,,e17bca27-82c2-4510-907a-847dcd1f968b,,1484c243-1d4e-43d1-bf6a-5aeb6d039677,,0,,george,r.,powers,...,nick430 nick431,george powers,george powell,7032988425,4152815412,george.r.powers,gmail.com,george.powell,stantec.com,
99867,,5077450f-d025-471c-ac08-fef9165e638a,,b6649696-43de-41f9-8914-cb8e6b20ebd5,,0,,william,c.,leslie,...,nick1062 nick1063 nick1065 nick153 nick154,william leslie,william leslie,,9372552783,wcl1043,aol.com,william.leslie2,wpafb.af.mil,
99868,,1caba537-86de-4d2c-8d58-031c41b179fd,,b7a5ad71-ddfb-4731-96c6-9be13f9b4576,,0,,steven,l,ingle,...,nick969 nick970,steven ingle,steven ingle,9564954653,2085262720,pingse,aol.com,steven.ingle,inl.gov,
99869,,3efcda76-3115-4bdb-8b27-c7947058c659,,,,0,,andrew,t.,habina,...,nick68 nick69 nick70 nick311,andrew habina,andrew habina,6033510779,6172582127,ahabina,draper.com,,,


In [9]:
org_names_cnt = Counter()
person_names_cnt = Counter()

def tokenize(cell):
    """
    Reduces cell string contents to lowercase
    alphanumeric characters, then splits into a list on space.
    
    So given one string "  Aaa bBb ccC ", this returns ["aaa", "bbb", "ccc"]
    """
    if cell is None: return cell
    # For every lowercased word (from .lower.split), filter out non-alphanumeric chars
    # and then ''.join it back together to get a list of tokens.
    return map(lambda string: ''.join(filter(str.isalnum, string)), re.split('\W+',cell.lower()))

all_orgs = pd.DataFrame(list(featureless.org_name_l) + list(featureless.org_name_r),columns=['org_name'])

def count_orgnames(df):
    df['org_name'].dropna().apply(lambda c: org_names_cnt.update(tokenize(c)))

count_orgnames(all_orgs)
    
def org_name_similarity(left, right):
    def org_sequence_uniqueness(seq):
        try:
            return sum(1 / org_names_cnt[t.lower()] ** 0.5 for t in seq)
        except:
            print(seq, org_names_cnt)
            raise ValueError()

    if len(left) > 0 and len(right) > 0:
        left_toks = set(tokenize(left))
        right_toks = set(tokenize(right))

        left_uniq = org_sequence_uniqueness(left_toks)
        right_uniq = org_sequence_uniqueness(right_toks)

        return org_sequence_uniqueness(left_toks & right_toks) / (left_uniq * right_uniq) ** 0.5
    else: return np.nan

if not organization_only:
    all_persons = pd.DataFrame(list(featureless.full_name_l) + list(featureless.full_name_r),columns=['full_name'])

    def count_personnames(df):
        df['full_name'].dropna().apply(lambda c: person_names_cnt.update(tokenize(c)))
    count_personnames(all_persons)
       
    def person_name_similarity(left, right):
        def person_sequence_uniqueness(seq):
            try:
                return sum(1 / person_names_cnt[t.lower()] ** 0.5 for t in seq)
            except:
                print(seq, person_names_cnt)
                raise ValueError()

        if len(left) > 0 and len(right) > 0:
            left_toks = set(tokenize(left))
            right_toks = set(tokenize(right))

            left_uniq = person_sequence_uniqueness(left_toks)
            right_uniq = person_sequence_uniqueness(right_toks)

            return person_sequence_uniqueness(left_toks & right_toks) / (left_uniq * right_uniq) ** 0.5
        else: return np.nan

def calculate_unique(df):
    df['org_uniq'] = df.apply(lambda row:
        org_name_similarity(*get_field(row, 'org_name')),
        axis="columns")
    if not organization_only:
        df['person_uniq'] = df.apply(lambda row:
            person_name_similarity(*get_field(row, 'full_name')),
            axis="columns")
    return df

calculate_unique(featureless)

Unnamed: 0,id,id_l,org_person_id_l,id_r,org_person_id_r,is_match,prefix_l,fname_l,mname_l,lname_l,...,full_name_r,clean_phone_l,clean_phone_r,before_domain_l,domain_l,before_domain_r,domain_r,jaro_score,org_uniq,person_uniq
0,874427,106963,96887,216622,209040,1,,nathalie,,leplat,...,nathalie leplat,,9164445155,nathalie,caaa.org,nathalie,caaa.org,1.000000,1.0,1.000000
1,1404715,206379,198238,254014,520751,1,,karianne,,fallow,...,karianne fallow,,2083277050,kfallow,udidaho.org,kfallow,dairywest.com,0.560317,0.0,1.000000
2,94393,277502,358545,328968,322757,1,,amy,,latessa,...,amy latessa,,8156213254,amy.latessa,ideaggroup.com,amy.latessa,ideaggroup.com,1.000000,1.0,1.000000
3,1419406,288744,281663,288745,281664,1,,james m.,,gates,...,james m. gates,,,jmgates46,twcny.rr.com,jmgates46,twcny.rr.com,1.000000,1.0,1.000000
4,409311,278524,306589,357430,271213,1,,evan,,lynch,...,evan lynch,,8889835441,evanlynchica,gmail.com,evanlynchica,gmail.com,1.000000,1.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99866,,e17bca27-82c2-4510-907a-847dcd1f968b,,1484c243-1d4e-43d1-bf6a-5aeb6d039677,,0,,george,r.,powers,...,george powell,7032988425,4152815412,george.r.powers,gmail.com,george.powell,stantec.com,,,0.298004
99867,,5077450f-d025-471c-ac08-fef9165e638a,,b6649696-43de-41f9-8914-cb8e6b20ebd5,,0,,william,c.,leslie,...,william leslie,,9372552783,wcl1043,aol.com,william.leslie2,wpafb.af.mil,,,1.000000
99868,,1caba537-86de-4d2c-8d58-031c41b179fd,,b7a5ad71-ddfb-4731-96c6-9be13f9b4576,,0,,steven,l,ingle,...,steven ingle,9564954653,2085262720,pingse,aol.com,steven.ingle,inl.gov,,,1.000000
99869,,3efcda76-3115-4bdb-8b27-c7947058c659,,,,0,,andrew,t.,habina,...,andrew habina,6033510779,6172582127,ahabina,draper.com,,,,,1.000000


In [10]:
def name_match(left, right):
    left_isalnum = ''.join(e for e in left if e.isalnum()) #removing spaces and special characters from left and right
    right_isalnum = ''.join(e for e in right if e.isalnum())
    if len(left_isalnum) > 0 and len(right_isalnum) > 0:
        return score_compare(left_isalnum, right_isalnum, 'jaro')
    #elif len(left) > 0 and len(right) > 0: #we may as well still score two orgs named '$$#' and '#$$'
        #return score_compare(left, right, 'jaro')
    else: return np.nan

def word_match(left, right): #same as name match but does not remove spaces and special characters
    if len(left) > 0 and len(right) > 0:
        return score_compare(left, right, 'jaro')
    else: return np.nan
    
def initial_match(left, right):
    if len(left) > 0 and len(right) > 0:
        if left[0] == right[0]:
            return 1
        else:
            return 0
    else:
        return np.nan
    
personal_suffixes = ['ii','iii','iv','v','jr','sr']
def suffix_match(left, right):
    if len(left) > 0 and len(right) > 0:
        l_suffix = left.replace('.','').replace(',','').split()
        l_personal_suffix = set.intersection(set(l_suffix),personal_suffixes)
        if len(l_personal_suffix) > 0:
            r_suffix = right.replace('.','').replace(',','').split()
            r_personal_suffix = set.intersection(set(r_suffix),personal_suffixes)
            if len(r_personal_suffix) > 0:
                if (len(set.intersection(l_personal_suffix,r_personal_suffix)) > 0):
                    return 1
                else:
                    return 0
            else:
                return np.nan
        else:
            return np.nan
    else:
        return np.nan

featureless = featureless.fillna('')   
def match_fields(candidata):
    if 'state_l' in candidata:
        start_time = time.time()
        print ("\tCHECKING FOR STATE CODE MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

        def state_match(left, right):
            if len(left) >=2 and len(right) >= 2:
                if left == right:
                    return 1
                else:
                    return 0
            else:
                return np.nan
            
        candidata['state_match'] = candidata.apply(lambda row:
            state_match(*get_field(row, 'state')),
            axis="columns")
        
        print("\tstate codes checked --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'fname_l' in candidata and not organization_only:
        start_time = time.time()
        print ("\tCHECKING FOR FIRST NAME MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

        candidata['fname_match'] = candidata.apply(lambda row:
            name_match(*get_field(row, 'fname')),
            axis="columns")

        #check if first initials match
        candidata['f_initial_match'] = candidata.apply(lambda row:
            initial_match(*get_field(row, 'fname')),
            axis='columns')
        
        print("\tfname values compared --- %s seconds ---" % round(time.time() - start_time, 2))
        
        # If l_fname is present then the nick group should be too
        start_time = time.time()
        print("\tCHECKING FOR NICKNAME GROUP MATCHES...")

        # This is a little complicated, but essentially it counts how many nicks groups are shared:
        #
        # get_field(row, 'nicks_groups'):
        #     look up the data for each row, and get the nicks_groups from the left and right.
        # map(lambda nicks: set(nicks.split()), _):
        #     for the left and right nicks in nicks_groups, split it and create a set of nick groups
        # set.intersection(*_):
        #     pass these two sets to set.intersection to get the intersection of the two
        # len(_):
        #     take the length of the intersection
        
        #changed to a 0/1 flag rather then len of intersection
        def nick_match(left, right):
            if len(left) > 4 and len(right) > 4:
                if len(set.intersection(set(left.split()),set(right.split()))) > 0:
                    return 1
                else:
                    return 0
            else:
                return np.nan
        candidata['nick_matches'] = candidata.apply(lambda row:
                                                         nick_match(*get_field(row, 'nicks_groups')),
                                                         axis='columns')
        print('\tnickname group matches compared --- %s seconds ---' % round(time.time() - start_time, 2))

    if 'mname_l' in candidata and not organization_only:
        start_time = time.time()
        print ("\tCHECKING FOR MIDDLE INITIAL MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        
        candidata['m_initial_match'] = candidata.apply(lambda row:
            initial_match(*get_field(row, 'mname')),
            axis='columns')
        
        print("\tmname values compared --- %s seconds ---" % round(time.time() - start_time, 2))
        
    if 'lname_l' in candidata and not organization_only:
        start_time = time.time()
        print ("\tCHECKING FOR LAST NAME MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

        candidata['lname_match'] = candidata.apply(lambda row:
            name_match(*get_field(row, 'lname')),
            axis="columns")

        print("\tlname values compared --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'suffix_l' in candidata and not organization_only:
        start_time = time.time()
        print ("\tCHECKING FOR SUFFIX MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        
        candidata['suffix_match'] = candidata.apply(lambda row:
            suffix_match(*get_field(row, 'suffix')),
            axis='columns')
        
        print("\tsuffix values compared --- %s seconds ---" % round(time.time() - start_time, 2))
    if 'city_l' in candidata:
        start_time = time.time()
        print ("\tCHECKING FOR CITY MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

        candidata['city_match'] = candidata.apply(lambda row:
            name_match(*get_field(row, 'city')),
            axis="columns")
        print("\tcity values compared --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'zip_l' in candidata:
        start_time = time.time()
        print ("\tCHECKING FOR POSTAL CODE MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

        def postal_similarity(left, right):
            # if the number is too short, means it's fubar
            if len(left) >= 5 and len(right) >= 5:
                if max(len(sub) for sub in find_common_subsequences(left, right)) / 5 >= 1:
                    return 1
                else:
                    return 0
            else:
                return np.nan

        candidata['zip_match'] = candidata.apply(lambda row:
            postal_similarity(*get_field(row, 'zip')),
            axis="columns")

        print("\tpostal codes checked --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'before_domain_l' in candidata and not organization_only:
        start_time = time.time()
        print ("\tCHECKING FOR BEFORE EMAIL DOMAIN MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

        candidata['before_domain_match'] = candidata.apply(lambda row:
            word_match(*get_field(row, 'before_domain')),
            axis="columns")

        print("\temail before domains checked --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'domain_l' in candidata:
        start_time = time.time()
        print ("\tCHECKING FOR WEB DOMAIN MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

        def domain_match(left, right):
            if len(left) > 4 and len(right) > 4:
                if left == right:
                    return 1
                else:
                    return 0
            else:
                return np.nan

        candidata['domain_match'] = candidata.apply(lambda row:
            domain_match(*get_field(row, 'domain')),
            axis="columns")

        print("\tweb domains checked --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'clean_phone_l' in candidata:
        start_time = time.time()
        print ("\tCHECKING FOR PHONE MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

        #scoring match candidates based on matching phone
        def phone_simularity(left, right):
            if len(left) > 9 and len(right) > 9:
                if max(len(sub) for sub in find_common_subsequences(left, right)) / 10 >= 1:
                    return 1
                else:
                    return 0
            else:
                return np.nan

        candidata['phone_match'] = candidata.apply(lambda row:
            phone_simularity(*get_field(row, 'clean_phone')),
            axis="columns")

        print("\tphones checked --- %s seconds ---" % round(time.time() - start_time, 2))
    
    return candidata

match_fields(featureless)

featureless.to_csv('training_records_w_features.csv')

	CHECKING FOR STATE CODE MATCHES...
	state codes checked --- 2.44 seconds ---
	CHECKING FOR FIRST NAME MATCHES...
	fname values compared --- 6.79 seconds ---
	CHECKING FOR NICKNAME GROUP MATCHES...
	nickname group matches compared --- 3.3 seconds ---
	CHECKING FOR MIDDLE INITIAL MATCHES...
	mname values compared --- 2.83 seconds ---
	CHECKING FOR LAST NAME MATCHES...
	lname values compared --- 4.22 seconds ---
	CHECKING FOR SUFFIX MATCHES...
	suffix values compared --- 2.97 seconds ---
	CHECKING FOR CITY MATCHES...
	city values compared --- 5.4 seconds ---
	CHECKING FOR POSTAL CODE MATCHES...
	postal codes checked --- 6.0 seconds ---
	CHECKING FOR BEFORE EMAIL DOMAIN MATCHES...
	email before domains checked --- 3.65 seconds ---
	CHECKING FOR WEB DOMAIN MATCHES...
	web domains checked --- 2.73 seconds ---
	CHECKING FOR PHONE MATCHES...
	phones checked --- 7.89 seconds ---


In [11]:
#impute null training feature values
possible_features = [
    'jaro_score',
    'fuzz_partial_score',
    'fuzz_sort_score',
    'fuzz_set_score',
    'org_uniq',
    'city_match',
    'state_match',
    'zip_match',
    'domain_match',
    'fname_match',
    'f_initial_match',
    'm_initial_match',
    'lname_match',
    'suffix_match',
    'person_uniq',
    'nick_matches',
    'phone_match',
    'before_domain_match',
]

# Get the columns that are features and are also present in our dataset right now
present_columns = pd.read_csv('training_records_w_features.csv', nrows=1).columns

present_features = present_columns[present_columns.isin(possible_features)]

#imputing np.nan values in ML features.  we need to do this because we want the model to treat differently
#cases where two records have a data type present and it DOES NOT MATCH and where one record is simply MISSING that data

#creating a dictionary we will use to determine the value to impute into feature NULLs
feature_avg_dict= {}
for feature in present_features:
    feature_df = pd.read_csv('training_records_w_features.csv',usecols=[feature])
    avg = feature_df[feature_df[feature].notnull()].mean()[0]
    feature_avg_dict.update({feature:avg})

def impute_nulls(candidata):
    for feature in present_features:
        candidata[feature] = candidata[feature].fillna(feature_avg_dict[feature])
        #candidata[feature] = candidata[feature].replace(np.nan,feature_avg_dict[feature])

    return candidata

impute_nulls(featureless)

if organization_only:
    featureless.to_csv('org_training_records_w_imputed_features.csv',index=False)
else:
    featureless.to_csv('person_training_records_w_imputed_features.csv',index=False)

In [12]:
features = featureless[['jaro_score', 'org_uniq', 'person_uniq', 'state_match',
       'fname_match', 'f_initial_match', 'nick_matches', 'm_initial_match',
       'lname_match', 'suffix_match', 'city_match', 'zip_match',
       'before_domain_match', 'domain_match', 'phone_match']]
features

Unnamed: 0,jaro_score,org_uniq,person_uniq,state_match,fname_match,f_initial_match,nick_matches,m_initial_match,lname_match,suffix_match,city_match,zip_match,before_domain_match,domain_match,phone_match
0,1,1,1.000000,0.406778,1.0,1,0.097181,0.122594,1.000000,0.082011,0.624292,0.154514,1.000000,1.000000,0.004071
1,0.560317,0,1.000000,1.000000,1.0,1,0.097181,0.122594,1.000000,0.082011,1.000000,1.000000,1.000000,0.000000,0.004071
2,1,1,1.000000,1.000000,1.0,1,1.000000,0.122594,1.000000,0.082011,1.000000,1.000000,1.000000,1.000000,0.004071
3,1,1,1.000000,0.406778,1.0,1,0.097181,0.122594,1.000000,0.082011,0.624292,0.154514,1.000000,1.000000,0.004071
4,1,1,1.000000,0.406778,1.0,1,1.000000,0.122594,1.000000,0.082011,0.624292,0.154514,1.000000,1.000000,0.004071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99866,,,0.298004,0.000000,1.0,1,1.000000,0.000000,0.866667,0.082011,0.405556,1.000000,0.915897,0.000000,0.000000
99867,,,1.000000,0.000000,1.0,1,1.000000,0.000000,1.000000,0.082011,0.500000,0.000000,0.473016,0.000000,0.004071
99868,,,1.000000,0.000000,1.0,1,1.000000,0.000000,1.000000,0.082011,0.573810,0.000000,0.416667,0.000000,0.000000
99869,,,1.000000,0.000000,1.0,1,1.000000,0.000000,1.000000,0.082011,0.671958,0.154514,0.473077,0.021356,0.000000


In [13]:
features = features[features.jaro_score !='']

In [14]:
features['jaro_score'] = features['jaro_score'].astype(float)
features['org_uniq'] = features['org_uniq'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
features.dtypes

jaro_score             float64
org_uniq               float64
person_uniq            float64
state_match            float64
fname_match            float64
f_initial_match          int64
nick_matches           float64
m_initial_match        float64
lname_match            float64
suffix_match           float64
city_match             float64
zip_match              float64
before_domain_match    float64
domain_match           float64
phone_match            float64
dtype: object

In [16]:
from sklearn import cluster
k=7
kmeans = cluster.KMeans(n_clusters=k)
kmeans.fit(features)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=7, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [17]:
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_

print('Labels:', labels)
print('')
print('Centroids:', centroids)
print('')
print('Inertia:', inertia)

Labels: [6 6 6 ... 3 6 6]

Centroids: [[ 6.32186560e-01  4.93504081e-02  1.61408666e-02  3.34886186e-01
   6.10746614e-01  1.00000000e+00  4.68583690e-02  1.21126663e-01
   4.91928615e-01  8.20319573e-02  5.79800606e-01  5.56481072e-02
   5.07556387e-01  8.09169711e-03  2.56963140e-03]
 [ 5.97437281e-01  2.98571933e-02  2.24129534e-03  1.00000000e+00
   5.08079314e-01 -3.07809334e-14  4.93481628e-02  1.20628726e-01
   5.01490607e-01  8.21044673e-02  7.39842793e-01  1.21348707e-03
   4.62200845e-01  8.64873684e-03  1.96925487e-03]
 [ 6.32499569e-01  4.62711401e-02  4.38978872e-03 -1.43940415e-13
   5.06477458e-01 -4.97102359e-14  4.83309175e-02  1.20862661e-01
   5.03602747e-01  8.21160486e-02  4.86274670e-01  9.80785129e-03
   4.57034694e-01  7.37906471e-03  1.94828685e-03]
 [ 5.72964388e-01  1.48187242e-02  2.08545860e-01  2.84612205e-01
   9.80140530e-01  9.99742135e-01  1.00000000e+00  1.28423878e-01
   4.41934577e-01  7.62747939e-02  5.90874529e-01  9.42323248e-02
   5.14874328e-01

In [18]:
featureless_reduced = featureless[featureless.jaro_score !=''] #need to do this to avoid shape error
featureless_reduced['cluster_k7'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [19]:
featureless_reduced.cluster_k7.value_counts()

2    36522
5    22850
1    16935
4     9531
0     8416
3     3878
6     1665
Name: cluster_k7, dtype: int64

In [20]:
featureless_reduced.cluster.value_counts()

AttributeError: 'DataFrame' object has no attribute 'cluster'

In [None]:
featureless_reduced[featureless_reduced.is_match == '1'].cluster_k7.value_counts()

In [None]:
featureless_reduced.to_csv('person training w imputed features & clustered k15.csv',index=False)