In [1]:
import pandas as pd
import numpy as np
import math
from jellyfish import jaro_winkler
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from py_common_subseq import find_common_subsequences
import numbers
import time
from collections import Counter 
from fuzzywuzzy import fuzz
import re

In [2]:
asae = pd.read_table('asae.txt')

In [189]:
import io
import shutil

with io.open('all cupola orgs w phone_db.csv', encoding='utf-8', errors='ignore') as source:
    with io.open('all cupola orgs w phone_db_utf.csv', mode='w', encoding='utf-8') as target:
        shutil.copyfileobj(source,target)
        
with io.open('ASAE neighborworks test.txt', encoding='utf-8', errors='ignore') as source:
    with io.open('asae_utf.txt', mode='w', encoding='utf-8') as target:
        shutil.copyfileobj(source,target)

In [2]:
#define column names
l_id = 'organization_id'
l_name = 'org_name'
l_address1 = 'address1'
l_city = 'city'
l_state = 'state'
l_zip = 'postal_code'
l_phone = 'phone'
l_web = 'website'
l_acronym = 'acronym'
l_alt_name = 'alt_name'

r_id = 'CompanyID'
r_name = 'Name'
r_address1 = 'Address Line 1'
r_city = 'City'
r_state = 'State'
r_zip = 'Zip Code'
#r_phone = 'PHONE'
r_web = 'WebSite'
#r_acronym = 'CBI_Acronym'

#set parameters
token_match_min = 2 # minimum number of matched tokens to be considered a match
token_limiter = .999 # percent of non-single tokens to tokenize, where rare tokens are at the bottom and common at the top
name_weight = .75 #note that this is really .75 * 4 because there are 4 org name simularity metrics
state_weight = 1
zip_weight = 1
phone_weight = 2
composite_score_min = 3.5 #minimum composite match score to be considered a match

start_time = time.time()
print "LOADING INITIAL DATAFRAMES..."

left_df = pd.read_csv('all cupola orgs w phone_db_utf.csv',keep_default_na=False)
right_df = pd.read_table('asae_utf.txt',keep_default_na=False,error_bad_lines=False)

left_df.rename(columns={l_id:'id',l_name:'l_org_name',l_address1:'l_address1',l_city:'l_city',l_state:'l_state',l_zip:'l_postal_code',l_web:'l_web',l_phone:'l_phone',l_acronym:'l_acronym',l_alt_name:'l_alt_name'}, inplace=True)
right_df.rename(columns={r_id:'id',r_name:'r_org_name',r_address1:'r_address1',r_city:'r_city',r_state:'r_state',r_zip:'r_postal_code',r_web:'r_web'}, inplace=True)

print("Dataframes loaded --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "PRE-PROCESSING..."
#normalize state codes
state_lkup = pd.read_csv('state_lkup.csv',keep_default_na=False)

from collections import defaultdict
state_dict = defaultdict(list)
for state, acronym in zip(state_lkup.state.values,state_lkup.acronym.values):
    state_dict[state].append(acronym)

left_df.l_state = left_df.l_state.str.lower()
left_df.l_state = left_df.l_state.replace(state_dict)
right_df.r_state = right_df.r_state.str.lower()
right_df.r_state = right_df.r_state.replace(state_dict)

#clean up non numeric characters in phones
l_clean_phones = []
for phone in left_df.l_phone:
    l_clean_phones.append(re.sub('[^0-9]','', phone))
    
left_df['l_clean_phone'] = l_clean_phones

#r_clean_phones = []
#for phone in right_df.r_phone:
#    r_clean_phones.append(re.sub('[^0-9]','', phone))

#right_df['r_clean_phone'] = r_clean_phones
    
print("states, phones normalized --- %s seconds ---" % (time.time() - start_time))
print ""


start_time = time.time()
print "TOKENIZING, IDENTIFYING CANDIDATE MATCH PAIRS..."

left_unique_token_columns = [
    'l_state', 
    'l_postal_code',
    #'l_clean_phone'
]

left_delta_token_columns = [
    'l_acronym',
    'l_org_name',
    'l_alt_name',
    #'l_address1',
    #'l_address2',
    'l_city', 
    'l_web'
]

right_unique_token_columns = [
    'r_state', 
    'r_postal_code',
    #'r_clean_phone'
]

right_delta_token_columns = [
    #'r_acronym',
    'r_org_name',
    #'r_alt_name',
    #'r_address1',
    #'r_address2',
    'r_city', 
    'r_web'
]

# lowercase the name and split on spaces, remove non-alphanumeric chars
def tokenize_name(name):
    if isinstance(name, basestring) is True:
        clean_name = ''.join(c if c.isalnum() else ' ' for c in name)
        return clean_name.lower().split()
    else:
        return name
    
unique_tokens = []    
for col in left_unique_token_columns:
    for word in left_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
for col in right_unique_token_columns:
    for word in right_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
unique_flat_list = [item for sublist in unique_tokens for item in sublist]

#instantiate counter and use to count word frequencies in flat list
u_cnt = Counter()
for token in unique_flat_list:
    u_cnt[token] += 1

u_cnt_dict = dict(u_cnt) #convert to dictionary

unique_tokens_df = pd.DataFrame(u_cnt_dict.items(), columns=['token', 'count'])
unique_tokens_df = unique_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

#consider waiting to do the count flag thing later, instead use some type of "token type" code
unique_token_flag = []
for index, value in enumerate(unique_tokens_df['count']):
    if value == 1:
        unique_token_flag.append(0)  #for any tokens occuring only once, we exclude
    else:
        unique_token_flag.append(1)

unique_tokens_df['unique_flag'] = unique_token_flag        

all_other_words = []
for col in left_delta_token_columns:
    for word in left_df[col]:
        if isinstance(word, float) is False:
            all_other_words.append(tokenize_name(str(word)))
            
for col in right_delta_token_columns:
    for word in right_df[col]:
        if isinstance(word, float) is False:
            all_other_words.append(tokenize_name(str(word)))
            
flat_list = [item for sublist in all_other_words for item in sublist] #flatten list so it can be counted

#instantiate counter and use to count word frequencies in flat list
cnt = Counter()
for token in flat_list:
    cnt[token] += 1

cnt_dict = dict(cnt) #convert to dictionary

main_tokens_df = pd.DataFrame(cnt_dict.items(), columns=['token', 'count'])
main_tokens_df = main_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

#wait to do count until joined with unique tokens?
main_token_flag = []
for index, value in enumerate(main_tokens_df['count']):
    if value == 1:
        main_token_flag.append(0)  #for any tokens occuring only once, we exclude
    elif index < int(main_tokens_df.shape[0] * token_limiter): #important line, we are cutting the top x% of frequently occuring tokens
        main_token_flag.append(1)
    else:
        main_token_flag.append(0)  #for the most common tokens, we exclude

main_tokens_df['flag'] = main_token_flag

all_tokens = pd.concat([unique_tokens_df, main_tokens_df])

all_tokens.drop('count',axis=1,inplace=True)
all_tokens['flag'] = all_tokens.flag.astype(int) #converting flags to int
tokens_dct = all_tokens.to_dict('split') #converting tokens_df to dictionary
tokens_dct=dict(tokens_dct['data']) #honestly can't remember why this works, something to do with conversion to dictionary

#preparing token_ids which will be used for joining left and right dfs
all_tokens.sort_values(by='flag',ascending=False,inplace=True)
all_tokens.drop_duplicates(subset='token',keep='first',inplace=True)
token_ids = all_tokens.index.get_level_values(0)
all_tokens['token_id'] = token_ids

all_tokens.drop('flag',axis=1,inplace=True)
all_tokens['token_id'] = all_tokens.token_id.astype(int)
token_id_dct = all_tokens.to_dict('split')
tokens_id_dct=dict(token_id_dct['data'])

vocabulary = np.array([w for w, c in tokens_dct.items() if c ==1]) #this works even without the ==1 and I don't know why
cv = CountVectorizer( vocabulary=vocabulary)

#now we are ready to tokenize left and right dataframes
all_left_cols = left_unique_token_columns + left_delta_token_columns

left_frame_list = []
for colname in all_left_cols:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    left_frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T, columns = ['token', 'id_l']))

left_keyed = pd.concat(left_frame_list)
left_keyed.drop_duplicates(inplace=True)

#append token_id to token as this will be more efficient to join with
left_token_ids = []
for token in left_keyed.token:
    left_token_ids.append(tokens_id_dct[token])

left_keyed['token_id'] = left_token_ids
left_keyed.sort_values(by='token_id',inplace=True)
left_keyed.set_index('token_id',inplace=True)
left_keyed.drop('token',axis=1,inplace=True)

all_right_cols = right_unique_token_columns + right_delta_token_columns

right_frame_list = []
for colname in all_right_cols:
    tokenmapping = cv.fit_transform(right_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    right_frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], right_df['id'].values[df_row]]).T, columns = ['token', 'id_r']))

right_keyed = pd.concat(right_frame_list)
right_keyed.drop_duplicates(inplace=True)

#append token_id to token as this will be more efficient to join with
right_token_ids = []
for token in right_keyed.token:
    right_token_ids.append(tokens_id_dct[token])

right_keyed['token_id'] = right_token_ids
right_keyed.sort_values(by='token_id',inplace=True)
right_keyed.set_index('token_id',inplace=True)
right_keyed.drop('token',axis=1,inplace=True)

aggregations = {
    'id_l': 'count'
}

joined = left_keyed.join(right_keyed, how='inner',lsuffix='_l',rsuffix='_r')
keys_grouped = joined.groupby(by=['id_l', 'id_r']).agg(aggregations)
keys_grouped.rename(columns={'id_l':'id_l count'}, inplace=True)
matched_records = keys_grouped[keys_grouped['id_l count'] >= token_match_min]
matched_records.reset_index(inplace=True)

print("match candidates identified --- %s seconds ---" % (time.time() - start_time))

left_df.rename(columns={'id':'id_l'},inplace=True)
right_df.rename(columns={'id':'id_r'},inplace=True)

left_match_data = left_df[['id_l','l_org_name','l_city','l_state','l_postal_code','l_web','l_clean_phone']].copy()
right_match_data = right_df[['id_r','r_org_name','r_city','r_state','r_postal_code','r_web']].copy()

#making sure keys are str, results in blank df otherwise
left_match_data.id_l = left_match_data.id_l.astype('str')
right_match_data.id_r = right_match_data.id_r.astype('str')
matched_records.id_l = matched_records.id_l.astype('str')
matched_records.id_r = matched_records.id_r.astype('str')

#merging matched_records df with original record data for ease of review
l_conc = pd.merge(matched_records, left_match_data, on='id_l')
full_conc = pd.merge(l_conc, right_match_data, on='id_r')

print("original data concatenated with matches --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "SCORING ORG NAME SIMULARITY..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#scoring match candidates based on edit distance of org names
def jaro_simularity(left_record, right_record):
    if len(left_record) > 0 and len(right_record) > 0:
        if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
            return jaro_winkler(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '')
def fuzz_partial(left_record, right_record):
    if len(left_record) > 0 and len(right_record) > 0:
        if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
            return fuzz.partial_ratio(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '') / float(100)
def fuzz_sort(left_record, right_record):
    if len(left_record) > 0 and len(right_record) > 0:
        if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
            return fuzz.token_sort_ratio(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '') / float(100)
def fuzz_set(left_record, right_record):
    if len(left_record) > 0 and len(right_record) > 0:
        if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
            return fuzz.token_set_ratio(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '') / float(100)

full_conc['l_org_name'] = full_conc['l_org_name'].astype('str')
full_conc['r_org_name'] = full_conc['r_org_name'].astype('str')

jaro_time = time.time()
full_conc['jaro_score'] = full_conc.apply(lambda x: jaro_simularity(x.l_org_name, x.r_org_name), axis=1)
print("jaro scores done --- %s seconds ---" % (time.time() - jaro_time))
partial_time = time.time()
full_conc['fuzz_partial_score'] = full_conc.apply(lambda x: fuzz_partial(x.l_org_name, x.r_org_name), axis=1)
print("fuzz partial scores done --- %s seconds ---" % (time.time() - partial_time))
sort_time = time.time()
full_conc['fuzz_sort_score'] = full_conc.apply(lambda x: fuzz_sort(x.l_org_name, x.r_org_name), axis=1)
print("fuzz sort scores done --- %s seconds ---" % (time.time() - sort_time))
set_time = time.time()
full_conc['fuzz_set_score'] = full_conc.apply(lambda x: fuzz_set(x.l_org_name, x.r_org_name), axis=1)
print("fuzz set scores done --- %s seconds ---" % (time.time() - set_time))
print ""

print("name simularity scored --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "CHECKING FOR STATE CODE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

def sanitize_state(state):
    if isinstance(state,basestring) is True:
        return ''.join(c for c in (state or '') if c in 'abcdefghijklmnopqrstuvwxyz')
    else:
        return ''
    
def state_match(state_a, state_b):
    sanitized_state_a = str(sanitize_state(state_a))
    sanitized_state_b = str(sanitize_state(state_b))

    # if the value is too short, means it's fubar
    if len(sanitized_state_a) < 2 or len(sanitized_state_b) < 2:
        return 0
    if state_a == state_b:
        return 1
    else:
        return 0    

full_conc['state_match'] = full_conc.apply(lambda x: state_match(x.l_state, x.r_state), axis=1)

print("state codes checked --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "CHECKING FOR POSTAL CODE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#scoring match candidates based on matching postal code

def sanitize_postal(postal):
    if isinstance(postal, basestring) is True:
        return ''.join(c for c in (postal or '') if c in '1234567890')
    if isinstance(postal, float) is False:
        return postal

def postal_simularity(postal_a, postal_b):
    sanitized_postal_a = str(sanitize_postal(postal_a))
    sanitized_postal_b = str(sanitize_postal(postal_b))

    # if the number is too short, means it's fubar
    if len(sanitized_postal_a) < 5 or len(sanitized_postal_b) < 5:
        return 0
    if float(max(len(sub) for sub in find_common_subsequences(sanitized_postal_a, sanitized_postal_b))) / 5 >= 1:
        return 1
    else:
        return 0
    
full_conc['zip_match'] = full_conc.apply(lambda x: postal_simularity(x.l_postal_code, x.r_postal_code), axis=1)
    
print("postal codes checked --- %s seconds ---" % (time.time() - start_time))
print ""

#start_time = time.time()
#print "CHECKING FOR PHONE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#scoring match candidates based on matching phone
#def phone_simularity(phone_a, phone_b):

#if the number is too short, means it's fubar
#    if len(phone_a) < 10 or len(phone_b) < 10:
#        return 0
#    if float(max(len(sub) for sub in find_common_subsequences(phone_a, phone_b))) / 10 >= 1:
#        return 1
#    else:
#        return 0
    
#full_conc['phone_match'] = full_conc.apply(lambda x: phone_simularity(x.l_clean_phone, x.r_clean_phone), axis=1)
    
#print("phones checked --- %s seconds ---" % (time.time() - start_time))
#print ""

#test this.  may need to make more efficient but I think it should work
start_time = time.time()
print "DISTILLING STRONG ORG MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#calculate composite match score based on component scores and weights
full_conc['composite_match_score'] = full_conc.jaro_score * name_weight \
+ full_conc.fuzz_partial_score * name_weight \
+ full_conc.fuzz_sort_score * name_weight \
+ full_conc.fuzz_set_score * name_weight \
+ full_conc.zip_match * zip_weight \
+ full_conc.state_match * state_weight \
#+ full_conc.phone_match * phone_weight

org_matches = full_conc[full_conc.composite_match_score >= composite_score_min]

print("final matches isolated --- %s seconds ---" % (time.time() - start_time))
print ""

#full_conc[full_conc.composite_match_score < 3].sort_values(by='composite_match_score', ascending=False)
org_matches.sort_values(by='composite_match_score', ascending=False)

LOADING INITIAL DATAFRAMES...
Dataframes loaded --- 0.274999856949 seconds ---

PRE-PROCESSING...
states, phones normalized --- 0.544000148773 seconds ---

TOKENIZING, IDENTIFYING CANDIDATE MATCH PAIRS...
match candidates identified --- 11.003000021 seconds ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


original data concatenated with matches --- 11.5759999752 seconds ---

SCORING ORG NAME SIMULARITY...
jaro scores done --- 7.02999997139 seconds ---
fuzz partial scores done --- 19.1050000191 seconds ---
fuzz sort scores done --- 15.0749998093 seconds ---
fuzz set scores done --- 19.7090001106 seconds ---

name simularity scored --- 60.9760000706 seconds ---

CHECKING FOR STATE CODE MATCHES...
state codes checked --- 7.25200009346 seconds ---

CHECKING FOR POSTAL CODE MATCHES...
postal codes checked --- 22.4609999657 seconds ---

DISTILLING STRONG ORG DUPLICATES...
final duplicates isolated --- 0.0750000476837 seconds ---



Unnamed: 0,id_l,id_r,id_l count,l_org_name,l_city,l_state,l_postal_code,l_web,l_clean_phone,r_org_name,...,r_state,r_postal_code,r_web,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,state_match,zip_match,composite_match_score
71909,49496,29800,6,American Association of School Personnel Admin...,Overland Park,ks,66210,aaspa.org,9133271222,American Association of School Personnel Admin...,...,ks,66210,www.aaspa.org,1.000000,1.00,1.00,1.00,1,1,5.000000
36565,93399,698100,7,Tau Kappa Epsilon International Fraternity,Indianapolis,in,46278,www.tke.org,3178726533,Tau Kappa Epsilon International Fraternity,...,in,46278-1940,www.tke.org,1.000000,1.00,1.00,1.00,1,1,5.000000
201096,59256,46316000,8,Eastern Winter Sports Representatives Association,White Haven,pa,18661,ewsra.org,5704437180,Eastern Winter Sports Representatives Association,...,pa,18661,,1.000000,1.00,1.00,1.00,1,1,5.000000
198438,56380,6644200,7,Maine Motor Transport Association,Augusta,me,04332-0857,mmta.com,2076234128,Maine Motor Transport Association,...,me,04332-0857,,1.000000,1.00,1.00,1.00,1,1,5.000000
198391,53020,7903700,7,Women's Basketball Coaches Association,Lilburn,ga,30047,wbca.org,7702798027,Women's Basketball Coaches Association,...,ga,30047,www.wbca.org,1.000000,1.00,1.00,1.00,1,1,5.000000
197995,84754,100021444,5,Association of International Petroleum Negotia...,Houston,tx,77079,aipn.org,2815587715,Association of International Petroleum Negotia...,...,tx,77079,www.aipn.org,1.000000,1.00,1.00,1.00,1,1,5.000000
116129,20716,533400,6,Navy League of the United States,Arlington,va,22201,navyleague.org,7035281775,Navy League of the United States,...,va,22201-3308,www.navyleague.org,1.000000,1.00,1.00,1.00,1,1,5.000000
185583,58289,29592100,4,Indiana Veterinary Medical Association,Indianapolis,in,46205-2898,invma.org,3179740888,Indiana Veterinary Medical Association,...,in,46205,www.avma.org/statevma/invna,1.000000,1.00,1.00,1.00,1,1,5.000000
102824,50731,6633400,5,Mathematical Association of America,Washington,dc,20036-1358,maa.org,2023875200,Mathematical Association of America,...,dc,20036-1358,www.maa.org,1.000000,1.00,1.00,1.00,1,1,5.000000
174966,55162,6605600,8,American Council of Engineering Companies of M...,Jefferson City,mo,65101-3113,acecmo.org,5736344080,American Council of Engineering Companies of M...,...,mo,65101-3113,www.acecmo.org,1.000000,1.00,1.00,1.00,1,1,5.000000


In [10]:
8 < len(cnt_dict)/10000

False

In [52]:
#this is an attempt to isolate tokens which are unique and should be double-counted to circumvent the token_match_min count
#needs to be tweaked to fine tune the output.  Intention is to concat this to joined prior to aggregation
bonus_point_tokens = []
for token in main_tokens_df[(main_tokens_df['count'] > 1) & (main_tokens_df['count']<=5)].token:
    bonus_point_tokens.append(tokens_id_dct[token])

intersection_bonus_tokens = set(bonus_point_tokens).intersection(set(list(joined.index)))    
    
bonus_token_joins = []
for token_id in intersection_bonus_tokens:
    bonus_token_joins.append(joined.loc[token_id])
    
bonus_joins = pd.concat(bonus_token_joins)
bonus_joins

Unnamed: 0,0,id_l,id_r
id_l,64220,,
id_r,6656100,,
2051,,57853,100015545
2051,,79529,100015545
2051,,92400,100015545
2051,,39927,100015545
2051,,19411,100015545
2051,,51652,100015545
2051,,38749,100015545
2051,,32719,100015545


In [50]:
bonus_joins = pd.concat(bonus_token_joins)

In [51]:
bonus_joins

Unnamed: 0,0,id_l,id_r
id_l,64220,,
id_r,6656100,,
2051,,57853,100015545
2051,,79529,100015545
2051,,92400,100015545
2051,,39927,100015545
2051,,19411,100015545
2051,,51652,100015545
2051,,38749,100015545
2051,,32719,100015545


In [28]:
bonus_point_tokens = []
for token in main_tokens_df[(main_tokens_df['count'] > 1) & (main_tokens_df['count']<=5)].token:
    bonus_point_tokens.append(tokens_id_dct[token])

In [30]:
len(bonus_point_tokens)

18600

In [40]:
type(set(list(joined.index)))

set

In [45]:
intersection_bonus_tokens = set(bonus_point_tokens).intersection(set(list(joined.index)))

In [46]:
type(intersection_bonus_tokens)

set

In [35]:
len(set(joined.index))

1754

In [256]:
def sequence_uniqueness(seq):
    return sum(1/org_cnt_dict[str.lower(t)]**0.5 for t in seq)

def name_similarity(a, b):
    a_tokens = set(tokenize_name(a))
    b_tokens = set(tokenize_name(b))
    a_uniq = sequence_uniqueness(a_tokens)
    b_uniq = sequence_uniqueness(b_tokens)

    return sequence_uniqueness(a_tokens.intersection(b_tokens))/(a_uniq * b_uniq) ** 0.5

In [232]:
org_tokens = []    
for word in left_df['l_org_name']:
    if isinstance(word, float) is False:
        org_tokens.append(tokenize_name(str(word)))

for word in right_df['r_org_name']:
    if isinstance(word, float) is False:
        org_tokens.append(tokenize_name(str(word)))
            
org_flat_list = [item for sublist in org_tokens for item in sublist]

#instantiate counter and use to count word frequencies in flat list
org_cnt = Counter()
for token in org_flat_list:
    org_cnt[token] += 1

org_cnt_dict = dict(org_cnt) #convert to dictionary

In [246]:
org_cnt_dict['hmm']

2

In [231]:
left_df.shape

(88517, 12)

In [295]:
start_time = time.time()
set('this is a name name hmm-161'.split())
print("finished --- %s seconds ---" % (time.time() - start_time))

finished --- 0.0019998550415 seconds ---


In [288]:
start_time = time.time()
set(tokenize_name('this is a name name hmm-161'))
print("finished --- %s seconds ---" % (time.time() - start_time))

finished --- 0.00200009346008 seconds ---


In [235]:
str.lower('Limited')

'limited'

In [234]:
cnt_dict['limited']

114

In [257]:
full_conc['uniq'] = full_conc.apply(lambda x: name_similarity(x.l_org_name, x.r_org_name), axis=1)

In [261]:
full_conc[full_conc.composite_match_score < 5].sort_values(by='uniq',ascending=False).head()

Unnamed: 0,id_l,id_r,id_l count,l_org_name,l_city,l_state,l_postal_code,l_web,l_clean_phone,r_org_name,...,r_postal_code,r_web,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,state_match,zip_match,composite_match_score,uniq
1546542,55224,6209500,5,"Florida Transportation Builders Association, Inc.",Tallahassee,fl,32301,ftba.com,8509421404,Florida Transportation Builders Association Inc,...,32302-1208,www.ftba.com,0.991837,0.98,1.0,1.0,1,1,4.978878,1.0
1793265,62716,7059000,8,Professional Insurance Agents of Louisiana,Baton Rouge,la,70816,piaoflouisiana.com,2257667770,Professional Insurance Agents of Louisiana,...,70809-3423,www.piaoflouisiana.com,1.0,1.0,1.0,1.0,1,0,4.0,1.0
1167772,20818,6443900,5,NTCA The Rural Broadband Association,Arlington,va,22203,ntca.org,7033512000,NTCA - The Rural Broadband Association,...,22203-1801,www.ntca.org,0.933918,0.94,1.0,1.0,1,1,4.905439,1.0
2033604,64078,36888900,9,American Water Works Association - California/...,Rancho Cucamonga,ca,91730,ca-nv-awwa.org,9094817200,American Water Works Association California Ne...,...,91730,www.ca-nv-awwa.org,0.976092,0.95,1.0,1.0,1,1,4.944569,1.0
2050481,60552,100054118,7,Utah Highway Patrol Association,Salt Lake City,ut,84157,utahtrooper.com,8015543711,UTAH Highway Patrol Association,...,84157,www.utahtrooper.com,0.813364,0.9,1.0,1.0,1,1,4.785023,1.0


In [263]:
full_conc[full_conc.id_l=='90981']

Unnamed: 0,id_l,id_r,id_l count,l_org_name,l_city,l_state,l_postal_code,l_web,l_clean_phone,r_org_name,...,r_postal_code,r_web,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,state_match,zip_match,composite_match_score,uniq
53050,90981,100007379,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,Industrial Minerals Association - North America,...,20036,www.ima-na.org,0.478035,0.57,0.39,0.5,1,0,2.453526,0.047244
66183,90981,100009793,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,Financial Services Institute Inc,...,20004,www.financialservices.org,0.458829,0.29,0.3,0.3,1,0,2.011622,0.0
79294,90981,100015009,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,Ironworker Management Progressive Action Coope...,...,20006,www.impact-net.org,0.563621,0.38,0.3,0.3,1,0,2.157716,0.0
92258,90981,100024433,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,Magnet Schools of America,...,20006,www.magnet.edu,0.623175,0.62,0.61,0.61,1,0,2.847381,0.035682
105146,90981,100044292,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,Cancer Support Community,...,20036,www.cancersupportcommunity.org,0.529762,0.33,0.36,0.36,1,0,2.184821,0.0
117926,90981,100053867,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,IT Alliance for Public Sector,...,20005,www.itic.org,0.522852,0.29,0.36,0.36,1,0,2.149639,0.0
131186,90981,100056550,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,National Association of Certified Professional...,...,20005,www.nacpm.org,0.521356,0.33,0.29,0.29,1,0,2.073517,0.0
143984,90981,100057977,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,New Leaders Council,...,20036,,0.560011,0.32,0.35,0.35,1,0,2.185008,0.0
156707,90981,100058477,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,National Association for Fixed Annuities,...,20004,www.nafa.com,0.484921,0.33,0.3,0.3,1,0,2.06119,0.0
169412,90981,100058650,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,SIFMA,...,20004,www.sifma.org,0.0,0.2,0.15,0.15,1,0,1.375,0.0


In [240]:
A = {2, 3, 5, 4}
B = {2, 5, 100}
C = {2, 3, 8, 9, 10}

print(B.intersection(A))
print(B.intersection(C))
print(A.intersection(C))
print(C.intersection(A, B))

set([2, 5])
set([2])
set([2, 3])
set([2])


In [241]:
type(A)

set

In [4]:
all_tokens = pd.concat([unique_tokens_df, main_tokens_df])
all_tokens.to_csv('all_tokens.csv')

In [212]:
full_conc[(full_conc.id_l == '90981') & (full_conc.id_r =='6165700')]

Unnamed: 0,id_l,id_r,id_l count,l_org_name,l_city,l_state,l_postal_code,l_web,l_clean_phone,r_org_name,...,r_state,r_postal_code,r_web,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,state_match,zip_match,composite_match_score
1739641,90981,6165700,1,NeighborWorks America,Washington,dc,20002,www.neighborworks.org,2027604000,neighborworks America,...,sc,29455,www.nw.org,0.936508,0.9,1.0,1.0,0,0,2.877381


In [201]:
right_df[right_df.id_r =='6165700']

Unnamed: 0,Source,id_r,Cup Org ID,r_org_name,r_address1,r_city,r_state,r_postal_code,Zip Code 5 Numeric,CompanyType,r_web,MemberType,#ASAEMembers,Industry,Budget,EIN,#Staff
225,"IndividualMember, Not OrgMember",6165700,90981,neighborworks America,1684 Brownswood Road,Johns Island,sc,29455,29455,Philanthropic/Non-Profit/Charitable/Foundation...,www.nw.org,Non Member - Company Prospect,1,HOU - Housing,F - $10 million but less than $25 million,364194807,255


In [198]:
right_df[right_df.r_org_name == 'neighborworks America']

Unnamed: 0,Source,id_r,Cup Org ID,r_org_name,r_address1,r_city,r_state,r_postal_code,Zip Code 5 Numeric,CompanyType,r_web,MemberType,#ASAEMembers,Industry,Budget,EIN,#Staff
225,"IndividualMember, Not OrgMember",6165700,90981,neighborworks America,1684 Brownswood Road,Johns Island,sc,29455,29455,Philanthropic/Non-Profit/Charitable/Foundation...,www.nw.org,Non Member - Company Prospect,1,HOU - Housing,F - $10 million but less than $25 million,364194807,255


In [296]:
org_matches = full_conc[full_conc.composite_match_score >= composite_score_min]
org_matches.to_csv('1 token org matches w uniq col.csv')

In [107]:
left_df = pd.read_csv('all cupola orgs w phone_utf.csv',keep_default_na=False)
right_df = pd.read_table('asae_utf.txt',keep_default_na=False,error_bad_lines=False)

left_df.rename(columns={l_id:'id',l_name:'l_org_name',l_address1:'l_address1',l_city:'l_city',l_state:'l_state',l_zip:'l_postal_code',l_web:'l_web',l_phone:'l_phone',l_acronym:'l_acronym',l_alt_name:'l_alt_name'}, inplace=True)
right_df.rename(columns={r_id:'id',r_name:'r_org_name',r_address1:'r_address1',r_city:'r_city',r_state:'r_state',r_zip:'r_postal_code',r_web:'r_web'}, inplace=True)

left_frame_list = []
for colname in all_left_cols:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    left_frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T, columns = ['token', 'id_l']))

left_keyed = pd.concat(left_frame_list)
left_keyed.drop_duplicates(inplace=True)

left_token_ids = []
for token in left_keyed.token:
    left_token_ids.append(tokens_id_dct[token])

left_keyed['token_id'] = left_token_ids
left_keyed.sort_values(by='token_id',inplace=True)

In [50]:
all_left_cols = left_unique_token_columns + left_delta_token_columns

left_frame_list = []
for colname in all_left_cols:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    left_frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T, columns = ['token', 'id_l']))

left_keyed = pd.concat(left_frame_list)
left_keyed.drop_duplicates(inplace=True)

#append token_id to token as this will be more efficient to join with
left_token_ids = []
for token in left_keyed.token:
    left_token_ids.append(tokens_id_dct[token])

left_keyed['token_id'] = left_token_ids
left_keyed.sort_values(by='token_id',inplace=True)
left_keyed.set_index('token_id',inplace=True)
left_keyed.drop('token',axis=1,inplace=True)

all_right_cols = right_unique_token_columns + right_delta_token_columns

right_frame_list = []
for colname in all_right_cols:
    tokenmapping = cv.fit_transform(right_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    right_frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], right_df['id'].values[df_row]]).T, columns = ['token', 'id_r']))

right_keyed = pd.concat(right_frame_list)
right_keyed.drop_duplicates(inplace=True)

#append token_id to token as this will be more efficient to join with
right_token_ids = []
for token in right_keyed.token:
    right_token_ids.append(tokens_id_dct[token])

right_keyed['token_id'] = right_token_ids
right_keyed.sort_values(by='token_id',inplace=True)
right_keyed.set_index('token_id',inplace=True)
right_keyed.drop('token',axis=1,inplace=True)

In [178]:
all_right_cols = right_unique_token_columns + right_delta_token_columns

right_frame_list = []
for colname in all_right_cols:
    tokenmapping = cv.fit_transform(right_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    right_frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], right_df['id'].values[df_row]]).T, columns = ['token', 'id_r']))

right_keyed = pd.concat(right_frame_list)
right_keyed.drop_duplicates(inplace=True)

#append token_id to token as this will be more efficient to join with
right_token_ids = []
for token in right_keyed.token:
    right_token_ids.append(tokens_id_dct[token])

right_keyed['token_id'] = right_token_ids

In [203]:
right_keyed[right_keyed.token == 'neighborworks']

AttributeError: 'DataFrame' object has no attribute 'token'

In [205]:
right_df[right_df.id_r =='6165700'].r_org_name

225    neighborworks America
Name: r_org_name, dtype: object

In [210]:
joined[(joined.id_l == '90981') & (joined.id_r == '6165700')]

Unnamed: 0_level_0,id_l,id_r
token_id,Unnamed: 1_level_1,Unnamed: 2_level_1
36806,90981,6165700


In [111]:
left_keyed.loc[36806]

token    neighborworks
id_l             90981
Name: 36806, dtype: object

In [104]:
left_keyed[left_keyed.token=='neighborworks']

Unnamed: 0,token,id_l,token_id
68875,neighborworks,90981,36806


In [106]:
left_keyed.to_csv('left_keyed pre token sort.csv')

In [59]:
left_keyed[left_keyed.token=='neighborworks']

Unnamed: 0,token,id_l,token_id
68875,neighborworks,90981,36806


In [82]:
left_df[left_df.id ==90981]

Unnamed: 0,id,l_org_name,l_acronym,l_address1,address2,l_city,l_state,l_postal_code,l_alt_name,l_web,l_phone
31695,90981,NeighborWorks America,,999 N. Capitol St. NE,Suite 900,Washington,DC,20002,,www.neighborworks.org,202-760-4000


In [102]:
left_keyed[left_keyed.token=='neighborworks']

AttributeError: 'DataFrame' object has no attribute 'token'

In [167]:
left_keyed.head()

Unnamed: 0_level_0,id_l
token_id,Unnamed: 1_level_1
1,50181
7,16131
10,33262
11,49308
13,74073


In [172]:
#this is weird.  Why is this returning the ID-L
left_keyed.loc[45525]

id_l    90981
Name: 45525, dtype: object

In [175]:
right_keyed.loc[45525]

KeyError: u'the label [45525] is not in the [index]'

In [174]:
left_keyed[left_keyed.id_l == 90981]

Unnamed: 0_level_0,id_l
token_id,Unnamed: 1_level_1


In [93]:
tokens_dct['america']

0

In [98]:
all_tokens = pd.concat([unique_tokens_df, main_tokens_df])

In [101]:
all_tokens[all_tokens.token == 'neighborworks']

Unnamed: 0,token,count,flag
36806,neighborworks,3,1


In [94]:
tokens_dct

{'fawl': 0,
 'icparegions': 0,
 'wvtroopers': 0,
 'mdbc': 0,
 'scheuring': 0,
 'fjata': 0,
 'fawd': 0,
 'degussa': 1,
 'woods': 1,
 'gaf': 1,
 'woody': 1,
 'ohiocommunitycolleges': 0,
 'gae': 0,
 'ipfa': 0,
 'kidsfirst': 0,
 'ncuma': 0,
 'sprague': 0,
 'caney': 0,
 '5986': 0,
 'stonewatercontrols': 0,
 'epixpharma': 0,
 'rickman': 0,
 'tenino': 0,
 'bswhealth': 0,
 'osioutsourcing': 0,
 'naturopathic': 1,
 'pages': 1,
 '36601': 0,
 'trojan': 1,
 'pigment': 0,
 'gastars': 0,
 'tourister': 0,
 'segbw': 0,
 'ncbtmb': 0,
 'broward': 1,
 'bringing': 0,
 'lsservices': 0,
 'southernoffshorefishingassociation': 0,
 'rementor': 0,
 'wccfa': 0,
 'tichenorassociates': 0,
 'wooden': 1,
 'weele': 0,
 'intelsatgeneral': 0,
 'raoul': 0,
 'cyberpoint': 0,
 'crossrate': 1,
 'tcba': 0,
 'avenuecapital': 0,
 'schunk': 0,
 'afpminnesota': 0,
 'inkster': 0,
 'paragonsdc': 0,
 'tumeq': 0,
 '0058': 1,
 'stohr': 0,
 '0053': 0,
 '0052': 0,
 '0051': 1,
 '0050': 0,
 '0057': 1,
 '0056': 1,
 '0055': 1,
 '0054': 1,

In [87]:
left_keyed

Unnamed: 0_level_0,id_l
token_id,Unnamed: 1_level_1
6,737
6,78388
6,39222
6,67617
6,16131
7,7819
7,37771
7,101383
7,23611
7,41636


In [78]:
#but THIS isn't??
left_keyed[left_keyed.id_l == 90981]

Unnamed: 0_level_0,id_l
token_id,Unnamed: 1_level_1


In [75]:
right_keyed[right_keyed.token=='neighborworks']

Unnamed: 0,token,id_r
497,neighborworks,6165700


In [53]:
right_keyed[right_keyed.id_r ==6165700]

Unnamed: 0_level_0,id_r
token_id,Unnamed: 1_level_1


In [34]:
left_df[left_df.id == 90981]

Unnamed: 0,id,l_org_name,l_acronym,l_address1,address2,l_city,l_state,l_postal_code,l_alt_name,l_web,l_phone
31695,90981,NeighborWorks America,,999 N. Capitol St. NE,Suite 900,Washington,DC,20002,,www.neighborworks.org,202-760-4000


In [40]:
print filter(lambda x: '20002' in x, flat_list)

TypeError: 'in <string>' requires string as left operand, not int

In [170]:
tokens_id_dct['neighborworks']

45525

In [25]:
left_keyed[left_keyed.id_l==90981]

Unnamed: 0,token,id_l


In [171]:
tokens_dct['neighborworks']

1

In [129]:
type('')

str

In [167]:
full_conc.isnull().sum()

id_l             0
id_r             0
id_l count       0
l_org_name       0
l_city           0
l_state          0
l_postal_code    0
l_web            0
l_clean_phone    0
r_org_name       0
r_city           0
r_state          0
r_postal_code    0
r_web            0
r_clean_phone    0
dtype: int64

In [135]:
left_df.l_phone

0                #N/A
1        609-771-0101
2                #N/A
3                #N/A
4                #N/A
5                #N/A
6                #N/A
7                #N/A
8                #N/A
9        847-605-6000
10       800-987-3373
11       202-585-3100
12               #N/A
13               #N/A
14       978-692-4900
15       702-221-4780
16       202-298-8660
17       202-778-1800
18               #N/A
19               #N/A
20       972-671-8885
21               #N/A
22               #N/A
23               #N/A
24       407-644-6300
25       512-364-0656
26               #N/A
27               #N/A
28               #N/A
29               #N/A
             ...     
88487    212-922-1500
88488    443-391-7235
88489    504-522-4850
88490            #N/A
88491    408-536-6000
88492            #N/A
88493    407-581-1560
88494    862-261-7000
88495    319-337-1000
88496    405-737-2676
88497    240-450-0075
88498    410-347-7700
88499    408-400-1900
88500    484-653-3300
88501    9

In [138]:
left_reduced = left_df.head(1000).copy()

In [153]:
start_time = time.time()
print "CHECKING FOR PHONE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#clean up non numeric characters in phones
def sanitize_phone(phone):
    if isinstance(phone, basestring) is True:
        return ''.join(c for c in (phone or '') if c in '1234567890')
    if isinstance(phone, float) is False:
        return phone
    
for phone in left_reduced.l_phone:
    left_reduced.l_phone.replace(sanitize_phone(phone))
    
print("phones checked --- %s seconds ---" % (time.time() - start_time))
print ""

CHECKING FOR PHONE MATCHES...
phones checked --- 0.444000005722 seconds ---



In [160]:
start_time = time.time()
print "CHECKING FOR PHONE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#clean up non numeric characters in phones
for phone in left_df.l_phone:
    left_df.l_phone.replace(re.sub('[^0-9]','', phone))
    
print("phones checked --- %s seconds ---" % (time.time() - start_time))
print ""

CHECKING FOR PHONE MATCHES...
phones checked --- 0.408999919891 seconds ---



In [161]:
start_time = time.time()
print "CHECKING FOR PHONE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

clean_phones = []
for phone in left_df.l_phone:
    clean_phones.append(re.sub('[^0-9]','', phone))
    
left_df['new_phone'] = clean_phones

print("phones checked --- %s seconds ---" % (time.time() - start_time))
print ""

CHECKING FOR PHONE MATCHES...
phones checked --- 0.319999933243 seconds ---



In [162]:
left_df.head()

Unnamed: 0,id_l,l_org_name,l_acronym,l_address1,address2,l_city,l_state,l_postal_code,l_alt_name,l_web,l_phone,new_phone
0,3640,Zyvex Performance,,,,Columbus,oh,,,www.zyvexpro.com,,
1,95526,Zytron,,20 Lexington Ave.,,Trenton,nj,8618.0,,www.zytron.com,609-771-0101,6097710101.0
2,70481,"Zynga, Inc.",,699 Eighth St.,,San Francisco,ca,94103.0,,,,
3,70482,Zynga (a client of Bay Bridge Strategies),,1300 Connecticut Ave. NW,Suite 600,Washington,dc,20036.0,,,,
4,81106,Zynerba Pharmaceuticals Inc,,80 W. Lancaster Ave.,Suite 300,Devon,pa,19333.0,,,,


In [95]:
duplicate_candidates.shape

(402544, 3)

In [92]:
duplicate_candidates.drop_duplicates(subset=('id_l','id_r'),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [84]:
tokens_dct['corporation']

KeyError: 'corporation'

In [87]:
main_tokens_df

Unnamed: 0,token,count,flag


In [41]:
left_keyed.shape

(699341, 1)

In [42]:
right_keyed.shape

(355793, 1)

In [43]:
left_keyed.head()

Unnamed: 0_level_0,id_l
token_id,Unnamed: 1_level_1
0,64174
0,64174
1,90066
2,89826
2,89826


In [44]:
#define column names
l_id = 'organization_id'
l_name = 'org_name'
l_address1 = 'address1'
l_city = 'city'
l_state = 'state'
l_zip = 'postal_code'
l_phone = 'phone'
l_web = 'website'
l_acronym = 'acronym'
l_alt_name = 'alt_name'

r_id = 'rec_id'
r_name = 'org_name'
r_address1 = 'street1'
r_city = 'CITY'
r_state = 'STPROV'
r_zip = 'zip'
r_phone = 'PHONE'
r_web = 'url'
r_acronym = 'CBI_Acronym'

#set parameters
token_match_min = 2 # minimum number of matched tokens to be considered a match
token_limiter = .999 # percent of non-single tokens to tokenize, where rare tokens are at the bottom and common at the top
name_weight = .75 #note that this is really .75 * 4 because there are 4 org name simularity metrics
state_weight = 1
zip_weight = 1
phone_weight = 1
composite_score_min = 3.5 #minimum composite match score to be considered a match

start_time = time.time()
print "LOADING INITIAL DATAFRAMES..."

left_df = pd.read_csv('all cupola orgs w phone_utf.csv',keep_default_na=False)
right_df = pd.read_table('all_mint_orgs_2_19_2019_utf.txt',keep_default_na=False,error_bad_lines=False)

left_df.rename(columns={l_id:'id',l_name:'l_org_name',l_address1:'l_address1',l_city:'l_city',l_state:'l_state',l_zip:'l_postal_code',l_web:'l_web',l_phone:'l_phone',l_acronym:'l_acronym',l_alt_name:'l_alt_name'}, inplace=True)
right_df.rename(columns={r_id:'id',r_name:'r_org_name',r_address1:'r_address1',r_city:'r_city',r_state:'r_state',r_zip:'r_postal_code',r_web:'r_web',r_phone:'r_phone',r_acronym:'r_acronym'}, inplace=True)

print("Dataframes loaded --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "PRE-PROCESSING: NORMALIZE STATES..."
#normalize state codes
state_lkup = pd.read_csv('state_lkup.csv',keep_default_na=False)

from collections import defaultdict
state_dict = defaultdict(list)
for state, acronym in zip(state_lkup.state.values,state_lkup.acronym.values):
    state_dict[state].append(acronym)

left_df.l_state = left_df.l_state.str.lower()
left_df.l_state = left_df.l_state.replace(state_dict)
right_df.r_state = right_df.r_state.str.lower()
right_df.r_state = right_df.r_state.replace(state_dict)

print("states normalized --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "TOKENIZING, IDENTIFYING CANDIDATE MATCH PAIRS..."

left_unique_token_columns = [
    'l_acronym',
    'l_state', 
    'l_postal_code',
    'l_phone'
]

left_delta_token_columns = [
    'l_org_name',
    'l_alt_name',
    #'l_address1',
    #'l_address2',
    'l_city', 
    'l_web'
]

right_unique_token_columns = [
    'r_acronym',
    'r_state', 
    'r_postal_code',
    'r_phone'
]

right_delta_token_columns = [
    'r_org_name',
    #'r_alt_name',
    #'r_address1',
    #'r_address2',
    'r_city', 
    'r_web'
]

# lowercase the name and split on spaces, remove non-alphanumeric chars
def tokenize_name(name):
    if isinstance(name, basestring) is True:
        clean_name = ''.join(c if c.isalnum() else ' ' for c in name)
        return clean_name.lower().split()
    else:
        return name
    
unique_tokens = []    
for col in left_unique_token_columns:
    for word in left_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
for col in right_unique_token_columns:
    for word in right_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
unique_flat_list = [item for sublist in unique_tokens for item in sublist]

#instantiate counter and use to count word frequencies in flat list
u_cnt = Counter()
for token in unique_flat_list:
    u_cnt[token] += 1

u_cnt_dict = dict(u_cnt) #convert to dictionary

unique_tokens_df = pd.DataFrame(u_cnt_dict.items(), columns=['token', 'count'])
unique_tokens_df = unique_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

unique_token_flag = []
for index, value in enumerate(unique_tokens_df['count']):
    if value == 1:
        unique_token_flag.append(0)  #for any tokens occuring only once, we exclude
    else:
        unique_token_flag.append(1)

unique_tokens_df['flag'] = unique_token_flag        

all_other_words = []
for col in left_delta_token_columns:
    for word in left_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
for col in right_delta_token_columns:
    for word in right_df[col]:
        if isinstance(word, float) is False:
            unique_tokens.append(tokenize_name(str(word)))
            
flat_list = [item for sublist in all_other_words for item in sublist] #flatten list so it can be counted

#instantiate counter and use to count word frequencies in flat list
cnt = Counter()
for token in flat_list:
    cnt[token] += 1

cnt_dict = dict(cnt) #convert to dictionary

main_tokens_df = pd.DataFrame(cnt_dict.items(), columns=['token', 'count'])
main_tokens_df = main_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

main_token_flag = []
for index, value in enumerate(main_tokens_df['count']):
    if value == 1:
        main_token_flag.append(0)  #for any tokens occuring only once, we exclude
    elif index < int(main_tokens_df.shape[0] * token_limiter): #important line, we are cutting the top x% of frequently occuring tokens
        main_token_flag.append(1)
    else:
        main_token_flag.append(0)  #for the most common tokens, we exclude

main_tokens_df['flag'] = main_token_flag

all_tokens = pd.concat([unique_tokens_df, main_tokens_df])

all_tokens.drop('count',axis=1,inplace=True)
all_tokens['flag'] = all_tokens.flag.astype(int) #converting flags to int
tokens_dct = all_tokens.to_dict('split') #converting tokens_df to dictionary
tokens_dct=dict(tokens_dct['data']) #honestly can't remember why this works, something to do with conversion to dictionary

#preparing token_ids which will be used for joining left and right dfs
all_tokens.sort_values(by='flag',ascending=False,inplace=True)
all_tokens.drop_duplicates(subset='token',keep='first',inplace=True)
token_ids = all_tokens.index.get_level_values(0)
all_tokens['token_id'] = token_ids

all_tokens.drop('flag',axis=1,inplace=True)
all_tokens['token_id'] = all_tokens.token_id.astype(int)
token_id_dct = all_tokens.to_dict('split')
tokens_id_dct=dict(token_id_dct['data'])

vocabulary = np.array([w for w, c in tokens_dct.items() if c ==1]) #this works even without the ==1 and I don't know why
cv = CountVectorizer( vocabulary=vocabulary)

#now we are ready to tokenize left and right dataframes
all_left_cols = left_unique_token_columns + left_delta_token_columns

left_frame_list = []
for colname in all_left_cols:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    left_frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T, columns = ['token', 'id_l']))

left_keyed = pd.concat(left_frame_list)

#append token_id to token as this will be more efficient to join with
left_token_ids = []
for token in left_keyed.token:
    left_token_ids.append(tokens_id_dct[token])

left_keyed['token_id'] = left_token_ids

LOADING INITIAL DATAFRAMES...


Skipping line 944: expected 12 fields, saw 13
Skipping line 1901: expected 12 fields, saw 13
Skipping line 2385: expected 12 fields, saw 13
Skipping line 9323: expected 12 fields, saw 13
Skipping line 12962: expected 12 fields, saw 13
Skipping line 20622: expected 12 fields, saw 13
Skipping line 22166: expected 12 fields, saw 13
Skipping line 22459: expected 12 fields, saw 13
Skipping line 23323: expected 12 fields, saw 13
Skipping line 24264: expected 12 fields, saw 13
Skipping line 24413: expected 12 fields, saw 13
Skipping line 24984: expected 12 fields, saw 13
Skipping line 25965: expected 12 fields, saw 13
Skipping line 26570: expected 12 fields, saw 13
Skipping line 26617: expected 12 fields, saw 13
Skipping line 29375: expected 12 fields, saw 13
Skipping line 31384: expected 12 fields, saw 13
Skipping line 32471: expected 12 fields, saw 13
Skipping line 36022: expected 12 fields, saw 13
Skipping line 37449: expected 12 fields, saw 13
Skipping line 37651: expected 12 fields, saw 

Dataframes loaded --- 0.944999933243 seconds ---

PRE-PROCESSING: NORMALIZE STATES...
states normalized --- 1.28299999237 seconds ---

TOKENIZING, IDENTIFYING CANDIDATE MATCH PAIRS...
