In [1]:
import pandas as pd
import numpy as np
import math
from jellyfish import jaro_winkler
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from py_common_subseq import find_common_subsequences
import numbers
import time
from collections import Counter 
from fuzzywuzzy import fuzz
import re

In [23]:
import io
import shutil

with io.open('all cupola orgs w phone.csv', encoding='utf-8', errors='ignore') as source:
    with io.open('all cupola orgs w phone_utf.csv', mode='w', encoding='utf-8') as target:
        shutil.copyfileobj(source,target)
        
with io.open('all_mint_orgs_2_19_2019.txt', encoding='utf-8', errors='ignore') as source:
    with io.open('all_mint_orgs_2_19_2019_utf.txt', mode='w', encoding='utf-8') as target:
        shutil.copyfileobj(source,target)

In [173]:
#define column names
l_id = 'organization_id'
l_name = 'org_name'
l_address1 = 'address1'
l_city = 'city'
l_state = 'state'
l_zip = 'postal_code'
l_phone = 'phone'
l_web = 'website'
l_acronym = 'acronym'
l_alt_name = 'alt_name'

r_id = 'rec_id'
r_name = 'org_name'
r_address1 = 'street1'
r_city = 'CITY'
r_state = 'STPROV'
r_zip = 'zip'
r_phone = 'PHONE'
r_web = 'url'
r_acronym = 'CBI_Acronym'

#set parameters
token_match_min = 3 # minimum number of matched tokens to be considered a match
token_limiter = .99 # percent of non-single tokens to tokenize, where rare tokens are at the bottom and common at the top
name_weight = .75 #note that this is really .75 * 4 because there are 4 org name simularity metrics
state_weight = 1
zip_weight = 1
phone_weight = 2
composite_score_min = 3.5 #minimum composite match score to be considered a match

start_time = time.time()
print "LOADING INITIAL DATAFRAMES..."

left_df = pd.read_csv('all cupola orgs w phone_utf.csv',keep_default_na=False)
right_df = pd.read_table('all_mint_orgs_2_19_2019_utf.txt',keep_default_na=False,error_bad_lines=False)

left_df.rename(columns={l_id:'id',l_name:'l_org_name',l_address1:'l_address1',l_city:'l_city',l_state:'l_state',l_zip:'l_postal_code',l_web:'l_web',l_phone:'l_phone',l_acronym:'l_acronym',l_alt_name:'l_alt_name'}, inplace=True)
right_df.rename(columns={r_id:'id',r_name:'r_org_name',r_address1:'r_address1',r_city:'r_city',r_state:'r_state',r_zip:'r_postal_code',r_web:'r_web',r_phone:'r_phone',r_acronym:'r_acronym'}, inplace=True)

print("Dataframes loaded --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "PRE-PROCESSING..."
#normalize state codes
state_lkup = pd.read_csv('state_lkup.csv',keep_default_na=False)

from collections import defaultdict
state_dict = defaultdict(list)
for state, acronym in zip(state_lkup.state.values,state_lkup.acronym.values):
    state_dict[state].append(acronym)

left_df.l_state = left_df.l_state.str.lower()
left_df.l_state = left_df.l_state.replace(state_dict)
right_df.r_state = right_df.r_state.str.lower()
right_df.r_state = right_df.r_state.replace(state_dict)

#clean up non numeric characters in phones
l_clean_phones = []
for phone in left_df.l_phone:
    l_clean_phones.append(re.sub('[^0-9]','', phone))
    
left_df['l_clean_phone'] = l_clean_phones

r_clean_phones = []
for phone in right_df.r_phone:
    r_clean_phones.append(re.sub('[^0-9]','', phone))

right_df['r_clean_phone'] = r_clean_phones
    
print("states, phones normalized --- %s seconds ---" % (time.time() - start_time))
print ""


start_time = time.time()
print "TOKENIZING, IDENTIFYING CANDIDATE MATCH PAIRS..."

left_unique_token_columns = [
    'l_acronym',
    'l_state', 
    'l_postal_code',
    'l_clean_phone'
]

left_delta_token_columns = [
    'l_org_name',
    'l_alt_name',
    #'l_address1',
    #'l_address2',
    'l_city', 
    'l_web'
]

right_unique_token_columns = [
    'r_acronym',
    'r_state', 
    'r_postal_code',
    'r_clean_phone'
]

right_delta_token_columns = [
    'r_org_name',
    #'r_alt_name',
    #'r_address1',
    #'r_address2',
    'r_city', 
    'r_web'
]

# lowercase the name and split on spaces, remove non-alphanumeric chars
def tokenize_name(name):
    if isinstance(name, basestring) is True:
        clean_name = ''.join(c if c.isalnum() else ' ' for c in name)
        return clean_name.lower().split()
    else:
        return name
    
unique_tokens = []    
for col in left_unique_token_columns:
    for word in left_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
for col in right_unique_token_columns:
    for word in right_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
unique_flat_list = [item for sublist in unique_tokens for item in sublist]

#instantiate counter and use to count word frequencies in flat list
u_cnt = Counter()
for token in unique_flat_list:
    u_cnt[token] += 1

u_cnt_dict = dict(u_cnt) #convert to dictionary

unique_tokens_df = pd.DataFrame(u_cnt_dict.items(), columns=['token', 'count'])
unique_tokens_df = unique_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

unique_token_flag = []
for index, value in enumerate(unique_tokens_df['count']):
    if value == 1:
        unique_token_flag.append(0)  #for any tokens occuring only once, we exclude
    else:
        unique_token_flag.append(1)

unique_tokens_df['flag'] = unique_token_flag        

all_other_words = []
for col in left_delta_token_columns:
    for word in left_df[col]:
        if isinstance(word, float) is False:
            all_other_words.append(tokenize_name(str(word)))
            
for col in right_delta_token_columns:
    for word in right_df[col]:
        if isinstance(word, float) is False:
            all_other_words.append(tokenize_name(str(word)))
            
flat_list = [item for sublist in all_other_words for item in sublist] #flatten list so it can be counted

#instantiate counter and use to count word frequencies in flat list
cnt = Counter()
for token in flat_list:
    cnt[token] += 1

cnt_dict = dict(cnt) #convert to dictionary

main_tokens_df = pd.DataFrame(cnt_dict.items(), columns=['token', 'count'])
main_tokens_df = main_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

main_token_flag = []
for index, value in enumerate(main_tokens_df['count']):
    if value == 1:
        main_token_flag.append(0)  #for any tokens occuring only once, we exclude
    elif index < int(main_tokens_df.shape[0] * token_limiter): #important line, we are cutting the top x% of frequently occuring tokens
        main_token_flag.append(1)
    else:
        main_token_flag.append(0)  #for the most common tokens, we exclude

main_tokens_df['flag'] = main_token_flag

all_tokens = pd.concat([unique_tokens_df, main_tokens_df])

all_tokens.drop('count',axis=1,inplace=True)
all_tokens['flag'] = all_tokens.flag.astype(int) #converting flags to int
tokens_dct = all_tokens.to_dict('split') #converting tokens_df to dictionary
tokens_dct=dict(tokens_dct['data']) #honestly can't remember why this works, something to do with conversion to dictionary

#preparing token_ids which will be used for joining left and right dfs
all_tokens.sort_values(by='flag',ascending=False,inplace=True)
all_tokens.drop_duplicates(subset='token',keep='first',inplace=True)
token_ids = all_tokens.index.get_level_values(0)
all_tokens['token_id'] = token_ids

all_tokens.drop('flag',axis=1,inplace=True)
all_tokens['token_id'] = all_tokens.token_id.astype(int)
token_id_dct = all_tokens.to_dict('split')
tokens_id_dct=dict(token_id_dct['data'])

vocabulary = np.array([w for w, c in tokens_dct.items() if c ==1]) #this works even without the ==1 and I don't know why
cv = CountVectorizer( vocabulary=vocabulary)

#now we are ready to tokenize left and right dataframes
all_left_cols = left_unique_token_columns + left_delta_token_columns

left_frame_list = []
for colname in all_left_cols:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    left_frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T, columns = ['token', 'id_l']))

left_keyed = pd.concat(left_frame_list)
left_keyed.drop_duplicates(inplace=True)

#append token_id to token as this will be more efficient to join with
left_token_ids = []
for token in left_keyed.token:
    left_token_ids.append(tokens_id_dct[token])

left_keyed['token_id'] = left_token_ids
left_keyed.sort_values(by='token_id',inplace=True)
left_keyed.set_index('token_id',inplace=True)
left_keyed.drop('token',axis=1,inplace=True)

all_right_cols = right_unique_token_columns + right_delta_token_columns

right_frame_list = []
for colname in all_right_cols:
    tokenmapping = cv.fit_transform(right_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    right_frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], right_df['id'].values[df_row]]).T, columns = ['token', 'id_r']))

right_keyed = pd.concat(right_frame_list)
right_keyed.drop_duplicates(inplace=True)

#append token_id to token as this will be more efficient to join with
right_token_ids = []
for token in right_keyed.token:
    right_token_ids.append(tokens_id_dct[token])

right_keyed['token_id'] = right_token_ids
right_keyed.sort_values(by='token_id',inplace=True)
right_keyed.set_index('token_id',inplace=True)
right_keyed.drop('token',axis=1,inplace=True)

aggregations = {
    'id_l': 'count'
}

joined = left_keyed.join(right_keyed, how='inner',lsuffix='_l',rsuffix='_r')
keys_grouped = joined.groupby(by=['id_l', 'id_r']).agg(aggregations)
keys_grouped.rename(columns={'id_l':'id_l count'}, inplace=True)
matched_records = keys_grouped[keys_grouped['id_l count'] >= token_match_min]
matched_records.reset_index(inplace=True)

print("match candidates identified --- %s seconds ---" % (time.time() - start_time))

left_df.rename(columns={'id':'id_l'},inplace=True)
right_df.rename(columns={'id':'id_r'},inplace=True)

left_match_data = left_df[['id_l','l_org_name','l_city','l_state','l_postal_code','l_web','l_clean_phone']].copy()
right_match_data = right_df[['id_r','r_org_name','r_city','r_state','r_postal_code','r_web','r_clean_phone']].copy()

#making sure keys are str, results in blank df otherwise
left_match_data.id_l = left_match_data.id_l.astype('str')
right_match_data.id_r = right_match_data.id_r.astype('str')
matched_records.id_l = matched_records.id_l.astype('str')
matched_records.id_r = matched_records.id_r.astype('str')

#merging matched_records df with original record data for ease of review
l_conc = pd.merge(matched_records, left_match_data, on='id_l')
full_conc = pd.merge(l_conc, right_match_data, on='id_r')

print("original data concatenated with matches --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "SCORING ORG NAME SIMULARITY..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#scoring match candidates based on edit distance of org names
def jaro_simularity(left_record, right_record):
    if len(left_record) > 0 and len(right_record) > 0:
        if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
            return jaro_winkler(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '')
def fuzz_partial(left_record, right_record):
    if len(left_record) > 0 and len(right_record) > 0:
        if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
            return fuzz.partial_ratio(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '') / float(100)
def fuzz_sort(left_record, right_record):
    if len(left_record) > 0 and len(right_record) > 0:
        if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
            return fuzz.token_sort_ratio(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '') / float(100)
def fuzz_set(left_record, right_record):
    if len(left_record) > 0 and len(right_record) > 0:
        if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
            return fuzz.token_set_ratio(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '') / float(100)

full_conc['l_org_name'] = full_conc['l_org_name'].astype('str')
full_conc['r_org_name'] = full_conc['r_org_name'].astype('str')

jaro_time = time.time()
full_conc['jaro_score'] = full_conc.apply(lambda x: jaro_simularity(x.l_org_name, x.r_org_name), axis=1)
print("jaro scores done --- %s seconds ---" % (time.time() - jaro_time))
partial_time = time.time()
full_conc['fuzz_partial_score'] = full_conc.apply(lambda x: fuzz_partial(x.l_org_name, x.r_org_name), axis=1)
print("fuzz partial scores done --- %s seconds ---" % (time.time() - partial_time))
sort_time = time.time()
full_conc['fuzz_sort_score'] = full_conc.apply(lambda x: fuzz_sort(x.l_org_name, x.r_org_name), axis=1)
print("fuzz sort scores done --- %s seconds ---" % (time.time() - sort_time))
set_time = time.time()
full_conc['fuzz_set_score'] = full_conc.apply(lambda x: fuzz_set(x.l_org_name, x.r_org_name), axis=1)
print("fuzz set scores done --- %s seconds ---" % (time.time() - set_time))
print ""

print("name simularity scored --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "CHECKING FOR STATE CODE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

def sanitize_state(state):
    if isinstance(state,basestring) is True:
        return ''.join(c for c in (state or '') if c in 'abcdefghijklmnopqrstuvwxyz')
    else:
        return ''
    
def state_match(state_a, state_b):
    sanitized_state_a = str(sanitize_state(state_a))
    sanitized_state_b = str(sanitize_state(state_b))

    # if the value is too short, means it's fubar
    if len(sanitized_state_a) < 2 or len(sanitized_state_b) < 2:
        return 0
    if state_a == state_b:
        return 1
    else:
        return 0    

full_conc['state_match'] = full_conc.apply(lambda x: state_match(x.l_state, x.r_state), axis=1)

print("state codes checked --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "CHECKING FOR POSTAL CODE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#scoring match candidates based on matching postal code

def sanitize_postal(postal):
    if isinstance(postal, basestring) is True:
        return ''.join(c for c in (postal or '') if c in '1234567890')
    if isinstance(postal, float) is False:
        return postal

def postal_simularity(postal_a, postal_b):
    sanitized_postal_a = str(sanitize_postal(postal_a))
    sanitized_postal_b = str(sanitize_postal(postal_b))

    # if the number is too short, means it's fubar
    if len(sanitized_postal_a) < 5 or len(sanitized_postal_b) < 5:
        return 0
    if float(max(len(sub) for sub in find_common_subsequences(sanitized_postal_a, sanitized_postal_b))) / 5 >= 1:
        return 1
    else:
        return 0
    
full_conc['zip_match'] = full_conc.apply(lambda x: postal_simularity(x.l_postal_code, x.r_postal_code), axis=1)
    
print("postal codes checked --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "CHECKING FOR PHONE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#scoring match candidates based on matching phone
def phone_simularity(phone_a, phone_b):

#if the number is too short, means it's fubar
    if len(phone_a) < 10 or len(phone_b) < 10:
        return 0
    if float(max(len(sub) for sub in find_common_subsequences(phone_a, phone_b))) / 10 >= 1:
        return 1
    else:
        return 0
    
full_conc['phone_match'] = full_conc.apply(lambda x: phone_simularity(x.l_clean_phone, x.r_clean_phone), axis=1)
    
print("phones checked --- %s seconds ---" % (time.time() - start_time))
print ""

#test this.  may need to make more efficient but I think it should work
start_time = time.time()
print "DISTILLING STRONG ORG DUPLICATES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#calculate composite match score based on component scores and weights
full_conc['composite_match_score'] = full_conc.jaro_score * name_weight \
+ full_conc.fuzz_partial_score * name_weight \
+ full_conc.fuzz_sort_score * name_weight \
+ full_conc.fuzz_set_score * name_weight \
+ full_conc.zip_match * zip_weight \
+ full_conc.state_match * state_weight \
+ full_conc.phone_match * phone_weight

org_matches = full_conc[full_conc.composite_match_score >= composite_score_min]

print("final duplicates isolated --- %s seconds ---" % (time.time() - start_time))
print ""

#full_conc[full_conc.composite_match_score < 3].sort_values(by='composite_match_score', ascending=False)
org_matches.sort_values(by='composite_match_score', ascending=False)

LOADING INITIAL DATAFRAMES...


Skipping line 944: expected 12 fields, saw 13
Skipping line 1901: expected 12 fields, saw 13
Skipping line 2385: expected 12 fields, saw 13
Skipping line 9323: expected 12 fields, saw 13
Skipping line 12962: expected 12 fields, saw 13
Skipping line 20622: expected 12 fields, saw 13
Skipping line 22166: expected 12 fields, saw 13
Skipping line 22459: expected 12 fields, saw 13
Skipping line 23323: expected 12 fields, saw 13
Skipping line 24264: expected 12 fields, saw 13
Skipping line 24413: expected 12 fields, saw 13
Skipping line 24984: expected 12 fields, saw 13
Skipping line 25965: expected 12 fields, saw 13
Skipping line 26570: expected 12 fields, saw 13
Skipping line 26617: expected 12 fields, saw 13
Skipping line 29375: expected 12 fields, saw 13
Skipping line 31384: expected 12 fields, saw 13
Skipping line 32471: expected 12 fields, saw 13
Skipping line 36022: expected 12 fields, saw 13
Skipping line 37449: expected 12 fields, saw 13
Skipping line 37651: expected 12 fields, saw 

Dataframes loaded --- 0.478000164032 seconds ---

PRE-PROCESSING...
states, phones normalized --- 1.27999997139 seconds ---

TOKENIZING, IDENTIFYING CANDIDATE MATCH PAIRS...
match candidates identified --- 81.861000061 seconds ---
original data concatenated with matches --- 82.9360001087 seconds ---

SCORING ORG NAME SIMULARITY...
jaro scores done --- 7.08500003815 seconds ---
fuzz partial scores done --- 13.2869999409 seconds ---
fuzz sort scores done --- 9.62400007248 seconds ---
fuzz set scores done --- 12.2149999142 seconds ---

name simularity scored --- 42.2509999275 seconds ---

CHECKING FOR STATE CODE MATCHES...
state codes checked --- 4.79999995232 seconds ---

CHECKING FOR POSTAL CODE MATCHES...
postal codes checked --- 15.1150000095 seconds ---

CHECKING FOR PHONE MATCHES...
phones checked --- 47.8380000591 seconds ---

DISTILLING STRONG ORG DUPLICATES...
final duplicates isolated --- 0.12299990654 seconds ---



Unnamed: 0,id_l,id_r,id_l count,l_org_name,l_city,l_state,l_postal_code,l_web,l_clean_phone,r_org_name,...,r_web,r_clean_phone,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,state_match,zip_match,phone_match,composite_match_score
66342,50430,5324,5,Glass Art Society,Seattle,wa,98107,glassart.org,2063821305,Glass Art Society,...,glassart.org,2063821305,1.000000,1.00,1.00,1.00,1,1,1,7.000000
75306,54951,39304,6,American Institute of Architects - Illinois,Springfield,il,62701-1323,aiail.org,2175222309,American Institute of Architects - Illinois,...,aiail.org,2175222309,1.000000,1.00,1.00,1.00,1,1,1,7.000000
75304,54949,39302,3,American Institute of Architects - California ...,Sacramento,ca,95814,aiacc.org,9164489082,American Institute of Architects - California ...,...,aiacc.org,9164489082,1.000000,1.00,1.00,1.00,1,1,1,7.000000
75298,54948,39301,6,American College of Emergency Physicians New Y...,Webster,ny,14580-2986,nyacep.org,5858722417,American College of Emergency Physicians New Y...,...,nyacep.org,5858722417,1.000000,1.00,1.00,1.00,1,1,1,7.000000
75297,54947,39300,4,American College of Emergency Physicians - Ohi...,Columbus,oh,43235,ohacep.org,6147926506,American College of Emergency Physicians - Ohi...,...,ohacep.org,6147926506,1.000000,1.00,1.00,1.00,1,1,1,7.000000
75296,54946,39299,4,American College of Emergency Physicians - Ind...,Carmel,in,46032,inacep.org,3178462977,American College of Emergency Physicians - Ind...,...,inacep.org,3178462977,1.000000,1.00,1.00,1.00,1,1,1,7.000000
75295,54945,39298,5,Florida College of Emergency Physicians,Orlando,fl,32812-7607,emlrc.org/fcep/,4072817396,Florida College of Emergency Physicians,...,emlrc.org/fcep/,4072817396,1.000000,1.00,1.00,1.00,1,1,1,7.000000
75293,54941,39295,6,Aluminum Association of Florida,Orlando,fl,32803-2935,www.aaof.org,4078988286,Aluminum Association of Florida,...,www.aaof.org,4078988286,1.000000,1.00,1.00,1.00,1,1,1,7.000000
75292,54940,39294,6,Alaska Power Association,Anchorage,ak,99503-6650,alaskapower.org,9077715700,Alaska Power Association,...,alaskapower.org,9077715700,1.000000,1.00,1.00,1.00,1,1,1,7.000000
75289,54935,39292,5,Alabama Association for Justice,Montgomery,al,36101,alabamajustice.org,3342624974,Alabama Association for Justice,...,alabamajustice.org,3342624974,1.000000,1.00,1.00,1.00,1,1,1,7.000000


In [178]:
#calculate composite match score based on component scores and weights
full_conc['composite_match_score2'] = (full_conc.jaro_score + full_conc.fuzz_partial_score + full_conc.fuzz_sort_score \
+ full_conc.fuzz_set_score)/4 * name_weight \
+ full_conc.zip_match * zip_weight \
+ full_conc.state_match * state_weight \
+ full_conc.phone_match * phone_weight


In [179]:
full_conc.head()

Unnamed: 0,id_l,id_r,id_l count,l_org_name,l_city,l_state,l_postal_code,l_web,l_clean_phone,r_org_name,...,r_clean_phone,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,state_match,zip_match,phone_match,composite_match_score,composite_match_score2
0,1000,10224,3,Miami University of Ohio,Oxford,oh,45056,www.muohio.edu,5135291809.0,PHI DELTA THETA FRATERNITY,...,5135236345,0.413462,0.13,0.36,0.36,1,1,0,2.947596,2.236899
1,36293,10224,3,Learning Forward,Oxford,oh,45056,learningforward.org,5135236029.0,PHI DELTA THETA FRATERNITY,...,5135236345,0.323184,0.19,0.33,0.33,1,1,0,2.879888,2.219972
2,36293,10224,3,Learning Forward,Alexandria,va,22302,learningforward.org,5135236029.0,PHI DELTA THETA FRATERNITY,...,5135236345,0.323184,0.19,0.33,0.33,0,0,0,0.879888,0.219972
3,49533,10224,3,Academy of Legal Studies in Business,Oxford,oh,45056,alsb.org,,PHI DELTA THETA FRATERNITY,...,5135236345,0.310399,0.15,0.29,0.29,1,1,0,2.780299,2.195075
4,50306,10224,3,Delta Sigma Pi,Oxford,oh,45056-2405,deltasigmapi.org,5135231907.0,PHI DELTA THETA FRATERNITY,...,5135236345,0.39652,0.21,0.5,0.53,1,1,0,3.22739,2.306848


In [170]:
len(full_conc.iloc[58702].r_org_name)

0

In [129]:
type('')

str

In [167]:
full_conc.isnull().sum()

id_l             0
id_r             0
id_l count       0
l_org_name       0
l_city           0
l_state          0
l_postal_code    0
l_web            0
l_clean_phone    0
r_org_name       0
r_city           0
r_state          0
r_postal_code    0
r_web            0
r_clean_phone    0
dtype: int64

In [135]:
left_df.l_phone

0                #N/A
1        609-771-0101
2                #N/A
3                #N/A
4                #N/A
5                #N/A
6                #N/A
7                #N/A
8                #N/A
9        847-605-6000
10       800-987-3373
11       202-585-3100
12               #N/A
13               #N/A
14       978-692-4900
15       702-221-4780
16       202-298-8660
17       202-778-1800
18               #N/A
19               #N/A
20       972-671-8885
21               #N/A
22               #N/A
23               #N/A
24       407-644-6300
25       512-364-0656
26               #N/A
27               #N/A
28               #N/A
29               #N/A
             ...     
88487    212-922-1500
88488    443-391-7235
88489    504-522-4850
88490            #N/A
88491    408-536-6000
88492            #N/A
88493    407-581-1560
88494    862-261-7000
88495    319-337-1000
88496    405-737-2676
88497    240-450-0075
88498    410-347-7700
88499    408-400-1900
88500    484-653-3300
88501    9

In [138]:
left_reduced = left_df.head(1000).copy()

In [153]:
start_time = time.time()
print "CHECKING FOR PHONE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#clean up non numeric characters in phones
def sanitize_phone(phone):
    if isinstance(phone, basestring) is True:
        return ''.join(c for c in (phone or '') if c in '1234567890')
    if isinstance(phone, float) is False:
        return phone
    
for phone in left_reduced.l_phone:
    left_reduced.l_phone.replace(sanitize_phone(phone))
    
print("phones checked --- %s seconds ---" % (time.time() - start_time))
print ""

CHECKING FOR PHONE MATCHES...
phones checked --- 0.444000005722 seconds ---



In [160]:
start_time = time.time()
print "CHECKING FOR PHONE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#clean up non numeric characters in phones
for phone in left_df.l_phone:
    left_df.l_phone.replace(re.sub('[^0-9]','', phone))
    
print("phones checked --- %s seconds ---" % (time.time() - start_time))
print ""

CHECKING FOR PHONE MATCHES...
phones checked --- 0.408999919891 seconds ---



In [161]:
start_time = time.time()
print "CHECKING FOR PHONE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

clean_phones = []
for phone in left_df.l_phone:
    clean_phones.append(re.sub('[^0-9]','', phone))
    
left_df['new_phone'] = clean_phones

print("phones checked --- %s seconds ---" % (time.time() - start_time))
print ""

CHECKING FOR PHONE MATCHES...
phones checked --- 0.319999933243 seconds ---



In [162]:
left_df.head()

Unnamed: 0,id_l,l_org_name,l_acronym,l_address1,address2,l_city,l_state,l_postal_code,l_alt_name,l_web,l_phone,new_phone
0,3640,Zyvex Performance,,,,Columbus,oh,,,www.zyvexpro.com,,
1,95526,Zytron,,20 Lexington Ave.,,Trenton,nj,8618.0,,www.zytron.com,609-771-0101,6097710101.0
2,70481,"Zynga, Inc.",,699 Eighth St.,,San Francisco,ca,94103.0,,,,
3,70482,Zynga (a client of Bay Bridge Strategies),,1300 Connecticut Ave. NW,Suite 600,Washington,dc,20036.0,,,,
4,81106,Zynerba Pharmaceuticals Inc,,80 W. Lancaster Ave.,Suite 300,Devon,pa,19333.0,,,,


In [95]:
duplicate_candidates.shape

(402544, 3)

In [92]:
duplicate_candidates.drop_duplicates(subset=('id_l','id_r'),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [84]:
tokens_dct['corporation']

KeyError: 'corporation'

In [87]:
main_tokens_df

Unnamed: 0,token,count,flag


In [41]:
left_keyed.shape

(699341, 1)

In [42]:
right_keyed.shape

(355793, 1)

In [43]:
left_keyed.head()

Unnamed: 0_level_0,id_l
token_id,Unnamed: 1_level_1
0,64174
0,64174
1,90066
2,89826
2,89826


In [44]:
#define column names
l_id = 'organization_id'
l_name = 'org_name'
l_address1 = 'address1'
l_city = 'city'
l_state = 'state'
l_zip = 'postal_code'
l_phone = 'phone'
l_web = 'website'
l_acronym = 'acronym'
l_alt_name = 'alt_name'

r_id = 'rec_id'
r_name = 'org_name'
r_address1 = 'street1'
r_city = 'CITY'
r_state = 'STPROV'
r_zip = 'zip'
r_phone = 'PHONE'
r_web = 'url'
r_acronym = 'CBI_Acronym'

#set parameters
token_match_min = 2 # minimum number of matched tokens to be considered a match
token_limiter = .999 # percent of non-single tokens to tokenize, where rare tokens are at the bottom and common at the top
name_weight = .75 #note that this is really .75 * 4 because there are 4 org name simularity metrics
state_weight = 1
zip_weight = 1
phone_weight = 1
composite_score_min = 3.5 #minimum composite match score to be considered a match

start_time = time.time()
print "LOADING INITIAL DATAFRAMES..."

left_df = pd.read_csv('all cupola orgs w phone_utf.csv',keep_default_na=False)
right_df = pd.read_table('all_mint_orgs_2_19_2019_utf.txt',keep_default_na=False,error_bad_lines=False)

left_df.rename(columns={l_id:'id',l_name:'l_org_name',l_address1:'l_address1',l_city:'l_city',l_state:'l_state',l_zip:'l_postal_code',l_web:'l_web',l_phone:'l_phone',l_acronym:'l_acronym',l_alt_name:'l_alt_name'}, inplace=True)
right_df.rename(columns={r_id:'id',r_name:'r_org_name',r_address1:'r_address1',r_city:'r_city',r_state:'r_state',r_zip:'r_postal_code',r_web:'r_web',r_phone:'r_phone',r_acronym:'r_acronym'}, inplace=True)

print("Dataframes loaded --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "PRE-PROCESSING: NORMALIZE STATES..."
#normalize state codes
state_lkup = pd.read_csv('state_lkup.csv',keep_default_na=False)

from collections import defaultdict
state_dict = defaultdict(list)
for state, acronym in zip(state_lkup.state.values,state_lkup.acronym.values):
    state_dict[state].append(acronym)

left_df.l_state = left_df.l_state.str.lower()
left_df.l_state = left_df.l_state.replace(state_dict)
right_df.r_state = right_df.r_state.str.lower()
right_df.r_state = right_df.r_state.replace(state_dict)

print("states normalized --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "TOKENIZING, IDENTIFYING CANDIDATE MATCH PAIRS..."

left_unique_token_columns = [
    'l_acronym',
    'l_state', 
    'l_postal_code',
    'l_phone'
]

left_delta_token_columns = [
    'l_org_name',
    'l_alt_name',
    #'l_address1',
    #'l_address2',
    'l_city', 
    'l_web'
]

right_unique_token_columns = [
    'r_acronym',
    'r_state', 
    'r_postal_code',
    'r_phone'
]

right_delta_token_columns = [
    'r_org_name',
    #'r_alt_name',
    #'r_address1',
    #'r_address2',
    'r_city', 
    'r_web'
]

# lowercase the name and split on spaces, remove non-alphanumeric chars
def tokenize_name(name):
    if isinstance(name, basestring) is True:
        clean_name = ''.join(c if c.isalnum() else ' ' for c in name)
        return clean_name.lower().split()
    else:
        return name
    
unique_tokens = []    
for col in left_unique_token_columns:
    for word in left_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
for col in right_unique_token_columns:
    for word in right_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
unique_flat_list = [item for sublist in unique_tokens for item in sublist]

#instantiate counter and use to count word frequencies in flat list
u_cnt = Counter()
for token in unique_flat_list:
    u_cnt[token] += 1

u_cnt_dict = dict(u_cnt) #convert to dictionary

unique_tokens_df = pd.DataFrame(u_cnt_dict.items(), columns=['token', 'count'])
unique_tokens_df = unique_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

unique_token_flag = []
for index, value in enumerate(unique_tokens_df['count']):
    if value == 1:
        unique_token_flag.append(0)  #for any tokens occuring only once, we exclude
    else:
        unique_token_flag.append(1)

unique_tokens_df['flag'] = unique_token_flag        

all_other_words = []
for col in left_delta_token_columns:
    for word in left_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
for col in right_delta_token_columns:
    for word in right_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
flat_list = [item for sublist in all_other_words for item in sublist] #flatten list so it can be counted

#instantiate counter and use to count word frequencies in flat list
cnt = Counter()
for token in flat_list:
    cnt[token] += 1

cnt_dict = dict(cnt) #convert to dictionary

main_tokens_df = pd.DataFrame(cnt_dict.items(), columns=['token', 'count'])
main_tokens_df = main_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

main_token_flag = []
for index, value in enumerate(main_tokens_df['count']):
    if value == 1:
        main_token_flag.append(0)  #for any tokens occuring only once, we exclude
    elif index < int(main_tokens_df.shape[0] * token_limiter): #important line, we are cutting the top x% of frequently occuring tokens
        main_token_flag.append(1)
    else:
        main_token_flag.append(0)  #for the most common tokens, we exclude

main_tokens_df['flag'] = main_token_flag

all_tokens = pd.concat([unique_tokens_df, main_tokens_df])

all_tokens.drop('count',axis=1,inplace=True)
all_tokens['flag'] = all_tokens.flag.astype(int) #converting flags to int
tokens_dct = all_tokens.to_dict('split') #converting tokens_df to dictionary
tokens_dct=dict(tokens_dct['data']) #honestly can't remember why this works, something to do with conversion to dictionary

#preparing token_ids which will be used for joining left and right dfs
all_tokens.sort_values(by='flag',ascending=False,inplace=True)
all_tokens.drop_duplicates(subset='token',keep='first',inplace=True)
token_ids = all_tokens.index.get_level_values(0)
all_tokens['token_id'] = token_ids

all_tokens.drop('flag',axis=1,inplace=True)
all_tokens['token_id'] = all_tokens.token_id.astype(int)
token_id_dct = all_tokens.to_dict('split')
tokens_id_dct=dict(token_id_dct['data'])

vocabulary = np.array([w for w, c in tokens_dct.items() if c ==1]) #this works even without the ==1 and I don't know why
cv = CountVectorizer( vocabulary=vocabulary)

#now we are ready to tokenize left and right dataframes
all_left_cols = left_unique_token_columns + left_delta_token_columns

left_frame_list = []
for colname in all_left_cols:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    left_frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T, columns = ['token', 'id_l']))

left_keyed = pd.concat(left_frame_list)

#append token_id to token as this will be more efficient to join with
left_token_ids = []
for token in left_keyed.token:
    left_token_ids.append(tokens_id_dct[token])

left_keyed['token_id'] = left_token_ids

LOADING INITIAL DATAFRAMES...


Skipping line 944: expected 12 fields, saw 13
Skipping line 1901: expected 12 fields, saw 13
Skipping line 2385: expected 12 fields, saw 13
Skipping line 9323: expected 12 fields, saw 13
Skipping line 12962: expected 12 fields, saw 13
Skipping line 20622: expected 12 fields, saw 13
Skipping line 22166: expected 12 fields, saw 13
Skipping line 22459: expected 12 fields, saw 13
Skipping line 23323: expected 12 fields, saw 13
Skipping line 24264: expected 12 fields, saw 13
Skipping line 24413: expected 12 fields, saw 13
Skipping line 24984: expected 12 fields, saw 13
Skipping line 25965: expected 12 fields, saw 13
Skipping line 26570: expected 12 fields, saw 13
Skipping line 26617: expected 12 fields, saw 13
Skipping line 29375: expected 12 fields, saw 13
Skipping line 31384: expected 12 fields, saw 13
Skipping line 32471: expected 12 fields, saw 13
Skipping line 36022: expected 12 fields, saw 13
Skipping line 37449: expected 12 fields, saw 13
Skipping line 37651: expected 12 fields, saw 

Dataframes loaded --- 0.944999933243 seconds ---

PRE-PROCESSING: NORMALIZE STATES...
states normalized --- 1.28299999237 seconds ---

TOKENIZING, IDENTIFYING CANDIDATE MATCH PAIRS...


In [53]:
left_keyed.head()

Unnamed: 0,token,id_l
0,,3640
1,,70481
2,,70482
3,,81106
4,,38836


In [55]:
left_keyed.shape

(572708, 2)

In [51]:
left_keyed = pd.concat(left_frame_list)

In [54]:
left_keyed.drop_duplicates(inplace=True)