In [1]:
import pandas as pd
import numpy as np
import math
from jellyfish import jaro_winkler
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from py_common_subseq import find_common_subsequences
import numbers
import time
from collections import Counter 
from fuzzywuzzy import fuzz

In [64]:
import io
import shutil

with io.open('recipients_reduced.csv', encoding='utf-8', errors='ignore') as source:
    with io.open('recipients_reduced_utf.csv', mode='w', encoding='utf-8') as target:
        shutil.copyfileobj(source,target)

In [2]:
start_time = time.time()
print "LOADING DATAFRAMES INTO MEMORY..."

df = pd.read_csv('RecipientTableUpdated_1.30.19_utf.csv',keep_default_na=False)

#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
#define column names
org_id = 'Recipient_ID'
org_name = 'RecipientName'
org_address1 = 'AddressLine1Txt'
org_city = 'CityNm'
org_state = 'StateAbbreviationCd'
org_zip = 'Zip'
org_web = 'WebsiteAddressTxt'

#set parameters
token_match_min = 2 # minimum number of matched tokens to be considered a match
token_limiter = .9996 # percent of non-single tokens to tokenize, where rare tokens are at the bottom and common at the top
name_weight = .75 #note that this is really .75 * 4 because there are 4 org name simularity metrics
state_weight = 1
zip_weight = 1
phone_weight = 1
composite_score_min = 3.5 #minimum composite match score to be considered a match
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

df.rename(columns={org_id:'id',org_name:'org_name',org_address1:'address1',org_city:'city',org_state:'state',org_zip:'postal_code',org_web:'web'}, inplace=True)

print("dataframes loaded --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "PRE-PROCESSING: NORMALIZE STATES..."
#normalize state codes
state_lkup = pd.read_csv('state_lkup.csv',keep_default_na=False)

from collections import defaultdict
state_dict = defaultdict(list)
for state, acronym in zip(state_lkup.state.values,state_lkup.acronym.values):
    state_dict[state].append(acronym)

df.state = df.state.str.lower()
df.state = df.state.replace(state_dict)

print("states normalized --- %s seconds ---" % (time.time() - start_time))
print ""

#when you finish pre-processing, this step to split into left and right dataframes:
left_df = df.copy()
left_df.rename(columns={'org_name':'l_org_name','address1':'l_address1','city':'l_city','state':'l_state','postal_code':'l_postal_code','web':'l_web'}, inplace=True)
right_df = df.copy()
right_df.rename(columns={'org_name':'r_org_name','address1':'r_address1','city':'r_city','state':'r_state','postal_code':'r_postal_code','web':'r_web'}, inplace=True)

start_time = time.time()
print "CREATING DICTIONARY OF ALL UNIQUE TOKENS W INCLUDE FLAG..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
   
# for the left dataset
left_tokenized_columns = [
    'l_org_name',
    #'l_acronym',
    #'l_alt_name',
    'l_address1',
    #'l_address2',
    'l_city', 
    'l_state', 
    'l_postal_code',
    'l_web' 
    #'l_phone'
]

# and right
right_tokenized_columns = [
    'r_org_name',
    #'r_acronym',
    #'r_alt_name',
    'r_address1',
    #'r_address2',
    'r_city', 
    'r_state', 
    'r_postal_code',
    'r_web' 
    #'r_phone'
]

# lowercase the name and split on spaces, remove non-alphanumeric chars
def tokenize_name(name):
    if isinstance(name, basestring) is True:
        clean_name = ''.join(c if c.isalnum() else ' ' for c in name)
        return clean_name.lower().split()
    else:
        return name

unique_tokens = [] #we treat state and zips differently because we want to include ALl state and zip tokens as these are unique

#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< add chosen unique columns here from each df
for word in left_df['l_state']:
    if isinstance(word, float) is False:
        unique_tokens.append(tokenize_name(str(word)))

for word in left_df['l_postal_code']:
    if isinstance(word, float) is False:
        unique_tokens.append(tokenize_name(str(word)))

#for word in left_df['l_acronym']:
#    if isinstance(word, float) is False:
#        unique_tokens.append(tokenize_name(str(word)))

#for word in left_df['l_phone']:
#    if isinstance(word, float) is False:
#        unique_tokens.append(tokenize_name(str(word)))
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        
unique_flat_list = [item for sublist in unique_tokens for item in sublist]

#instantiate counter and use to count word frequencies in flat list
u_cnt = Counter()
for token in unique_flat_list:
    u_cnt[token] += 1
    
u_cnt_dict = dict(u_cnt) #convert to dictionary

unique_tokens_df = pd.DataFrame(u_cnt_dict.items(), columns=['token', 'count'])
unique_tokens_df = unique_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

unique_token_flag = []
for index, value in enumerate(unique_tokens_df['count']):
    if value == 1:
        unique_token_flag.append(0)  #for any tokens occuring only once, we exclude
    else:
        unique_token_flag.append(1)

unique_tokens_df['flag'] = unique_token_flag        
        
all_other_words = [] #creating a list of all words used in just ONE of the dfs in selected columns, for counting to determine rarity

for word in left_df['l_org_name']:
    if isinstance(word, float) is False:
        all_other_words.append(tokenize_name(str(word)))

#for word in left_df['l_alt_name']:
#    if isinstance(word, float) is False:
#        all_other_words.append(tokenize_name(str(word)))

for word in left_df['l_address1']:
    if isinstance(word, float) is False:
        all_other_words.append(tokenize_name(str(word)))

for word in left_df['l_city']:
    if isinstance(word, float) is False:
        all_other_words.append(tokenize_name(str(word)))

for word in left_df['l_web']:
    if isinstance(word, float) is False:
        all_other_words.append(tokenize_name(str(word)))
    
flat_list = [item for sublist in all_other_words for item in sublist] #flatten list so it can be counted
#new_tokens = list(set(flat_list) - set(unique_flat_list)) #getting a list of tokens which are NOT included in the first flat_list

#instantiate counter and use to count word frequencies in flat list
cnt = Counter()
for token in flat_list:
    cnt[token] += 1
    
cnt_dict = dict(cnt) #convert to dictionary

main_tokens_df = pd.DataFrame(cnt_dict.items(), columns=['token', 'count'])
main_tokens_df = main_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

main_token_flag = []
for index, value in enumerate(main_tokens_df['count']):
    if value == 1:
        main_token_flag.append(0)  #for any tokens occuring only once, we exclude
    elif index < int(main_tokens_df.shape[0] * token_limiter): #important line, we are cutting the top x% of frequently occuring tokens
        main_token_flag.append(1)
    else:
        main_token_flag.append(0)  #for the most common tokens, we exclude

main_tokens_df['flag'] = main_token_flag

all_tokens = pd.concat([unique_tokens_df, main_tokens_df])

all_tokens.drop('count',axis=1,inplace=True)
all_tokens['flag'] = all_tokens.flag.astype(int) #converting flags to int
tokens_dct = all_tokens.to_dict('split') #converting tokens_df to dictionary
tokens_dct=dict(tokens_dct['data']) #honestly can't remember why this works, something to do with conversion to dictionary

#preparing token_ids which will be used for joining left and right dfs
all_tokens.sort_values(by='flag',ascending=False,inplace=True)
all_tokens.drop_duplicates(subset='token',keep='first',inplace=True)
token_ids = all_tokens.index.get_level_values(0)
all_tokens['token_id'] = token_ids

all_tokens.drop('flag',axis=1,inplace=True)
all_tokens['token_id'] = all_tokens.token_id.astype(int)
token_id_dct = all_tokens.to_dict('split')
tokens_id_dct=dict(token_id_dct['data'])

print("token dictionary created --- %s seconds ---" % (time.time() - start_time))
print ""
                           
start_time = time.time()
print "TOKENIZING LEFT DATAFRAME..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#NOTE: tokenizing the dataframes is the most computationally expensive part of this script.  

#tokenize left dataframe using CountVectorizer
vocabulary = np.array([w for w, b in tokens_dct.items() if b ==1]) #this works even without the ==1 and I don't know why
cv = CountVectorizer( vocabulary=vocabulary)

frame_list = []
for colname in left_tokenized_columns:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T, columns = ['token', 'id']))

left_keyed = pd.concat(frame_list)

#append token_id to token as this will be more efficient to join with
left_token_ids = []
for token in left_keyed.token:
    left_token_ids.append(tokens_id_dct[token])

left_keyed['token_id'] = left_token_ids
left_keyed.set_index('token_id',inplace=True)
left_keyed.drop('token',axis=1,inplace=True)
    
print("left dataframe tokenized --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "TOKENIZING RIGHT DATAFRAME..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#tokenize right dataframe
right_keyed = left_keyed.copy()
    
print("right dataframe tokenized --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "JOINING LEFT & RIGHT TOKEN KEYS..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#join left and right token keys
joined = left_keyed.join(right_keyed, how='inner',lsuffix='_l',rsuffix='_r')

print("left & right token keys joined --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "GROUPING BY UNIQUE LEFT & RIGHT IDS & GETTING COUNT OF MATCHED TOKENS..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#aggregate to get a count of unique id_l and id_r pairs based on joined tokens, which is used to assess match strength
aggregations = {
    'id_l': 'count'
}

keys_grouped = joined.groupby(by=['id_l', 'id_r']).agg(aggregations)
keys_grouped.rename(columns={'id_l':'id_l count'}, inplace=True)

print("keys grouped & counted --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "REDUCING DOWN TO SUFFICIENT MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#here we are reducing down to those match candidates which meet the minimum threshold of matched tokens
#and crucially, we are excluding any matches to SELF
matched_records = keys_grouped[keys_grouped['id_l count'] >= token_match_min]

#adding a column for the id_r values, which are initially excluded becuase we are merging using the id_l
right_ids = matched_records.index.get_level_values('id_r')
matched_records['id_r'] = right_ids

#adding left Ids because I think we need them, probably a way to check index but meh
left_ids = matched_records.index.get_level_values('id_l')
matched_records['id_l'] = left_ids

duplicate_candidates = matched_records[matched_records['id_l'] <> matched_records['id_r']]

print("matches reduced --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "CONCATENATING MATCH IDS WITH ORIGINAL DATA..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#renaming ids to be linked to source.  this happens here and not the beginning so as to play nice with the prepare join keys generator
left_df.rename(columns={'id':'id_l'}, inplace=True)
right_df.rename(columns={'id':'id_r'}, inplace=True)

#creating left/right dataframes which contain only the most relevant details for reviewing the match strengths
left_match_data = left_df[['id_l','l_org_name','l_city','l_state','l_postal_code','l_web']].copy()
right_match_data = right_df[['id_r','r_org_name','r_city','r_state','r_postal_code','r_web']].copy()

#making sure keys are str, results in blank df otherwise
left_match_data.id_l = left_match_data.id_l.astype('str')
right_match_data.id_r = right_match_data.id_r.astype('str')
duplicate_candidates.id_l = duplicate_candidates.id_l.astype('str')
duplicate_candidates.id_r = duplicate_candidates.id_r.astype('str')

#merging matched_records df with original record data for ease of review
l_conc = pd.merge(duplicate_candidates, left_match_data, on='id_l')
full_conc = pd.merge(l_conc, right_match_data, on='id_r')

print("original data concatenated with matches --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "SCORING ORG NAME SIMULARITY..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#scoring match candidates based on edit distance of org names
def jaro_simularity(left_record, right_record):
    if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
        return jaro_winkler(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '')
def fuzz_partial(left_record, right_record):
    if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
        return fuzz.partial_ratio(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '') / float(100)
def fuzz_sort(left_record, right_record):
    if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
        return fuzz.token_sort_ratio(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '') / float(100)
def fuzz_set(left_record, right_record):
    if isinstance(left_record, numbers.Integral) is False and isinstance(right_record, numbers.Integral) is False:
        return fuzz.token_set_ratio(unicode(left_record, 'utf-8') or '', unicode(right_record, 'utf-8') or '') / float(100)

full_conc['l_org_name'] = full_conc['l_org_name'].astype('str')
full_conc['r_org_name'] = full_conc['r_org_name'].astype('str')

full_conc['jaro_score'] = full_conc.apply(lambda x: jaro_simularity(x.l_org_name, x.r_org_name), axis=1)
full_conc['fuzz_partial_score'] = full_conc.apply(lambda x: fuzz_partial(x.l_org_name, x.r_org_name), axis=1)
full_conc['fuzz_sort_score'] = full_conc.apply(lambda x: fuzz_sort(x.l_org_name, x.r_org_name), axis=1)
full_conc['fuzz_set_score'] = full_conc.apply(lambda x: fuzz_set(x.l_org_name, x.r_org_name), axis=1)

print("name simularity scored --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "CHECKING FOR STATE CODE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

def sanitize_state(state):
    if isinstance(state,basestring) is True:
        return ''.join(c for c in (state or '') if c in 'abcdefghijklmnopqrstuvwxyz')
    else:
        return ''
    
def state_match(state_a, state_b):
    sanitized_state_a = str(sanitize_state(state_a))
    sanitized_state_b = str(sanitize_state(state_b))

    # if the value is too short, means it's fubar
    if len(sanitized_state_a) < 2 or len(sanitized_state_b) < 2:
        return 0
    if state_a == state_b:
        return 1
    else:
        return 0    

full_conc['state_match'] = full_conc.apply(lambda x: state_match(x.l_state, x.r_state), axis=1)

print("state codes checked --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "CHECKING FOR POSTAL CODE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#scoring match candidates based on matching postal code

def sanitize_postal(postal):
    if isinstance(postal, basestring) is True:
        return ''.join(c for c in (postal or '') if c in '1234567890')
    if isinstance(postal, float) is False:
        return postal

def postal_simularity(postal_a, postal_b):
    sanitized_postal_a = str(sanitize_postal(postal_a))
    sanitized_postal_b = str(sanitize_postal(postal_b))

    # if the number is too short, means it's fubar
    if len(sanitized_postal_a) < 5 or len(sanitized_postal_b) < 5:
        return 0
    if float(max(len(sub) for sub in find_common_subsequences(sanitized_postal_a, sanitized_postal_b))) / 5 >= 1:
        return 1
    else:
        return 0
    
full_conc['zip_match'] = full_conc.apply(lambda x: postal_simularity(x.l_postal_code, x.r_postal_code), axis=1)
    
print("postal codes checked --- %s seconds ---" % (time.time() - start_time))
print ""

#start_time = time.time()
#print "CHECKING FOR PHONE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#scoring match candidates based on matching postal code

#def sanitize_phone(phone):
#    if isinstance(phone, basestring) is True:
#        return ''.join(c for c in (phone or '') if c in '1234567890')
#    if isinstance(phone, float) is False:
#        return phone

#def postal_simularity(phone_a, phone_b):
#    sanitized_phone_a = str(sanitize_phone(phone_a))
#    sanitized_phone_b = str(sanitize_phone(phone_b))

    # if the number is too short, means it's fubar
#    if len(sanitized_phone_a) < 10 or len(sanitized_phone_b) < 10:
#        return 0
#    if float(max(len(sub) for sub in find_common_subsequences(sanitized_phone_a, sanitized_phone_b))) / 10 >= 1:
#        return 1
#    else:
#        return 0
    
#full_conc['phone_match'] = full_conc.apply(lambda x: phone_simularity(x.l_phone, x.r_phone), axis=1)
    
#print("phones checked --- %s seconds ---" % (time.time() - start_time))
#print ""

#test this.  may need to make more efficient but I think it should work
start_time = time.time()
print "DISTILLING STRONG, UNIQUE ORG DUPLICATES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#calculate composite match score based on component scores and weights
full_conc['composite_match_score'] = full_conc.jaro_score * name_weight \
+ full_conc.fuzz_partial_score * name_weight \
+ full_conc.fuzz_sort_score * name_weight \
+ full_conc.fuzz_set_score * name_weight \
+ full_conc.zip_match * zip_weight \
+ full_conc.state_match * state_weight \
#+ full_conc.phone_match * phone_weight

org_matches = full_conc[full_conc.composite_match_score >= composite_score_min]

match_tuples = list(zip(org_matches['id_l'], org_matches['id_r']))

unique_match_tuples = []
seen = set()
for tup in match_tuples:
    s = tuple(sorted(tup))
    if s not in seen:
        seen.add(s)
        unique_match_tuples.append(tup)

unique_flag = []
for tup in match_tuples:
    if tup in unique_match_tuples:
        unique_flag.append(1)
    else:
        unique_flag.append(0)
        
org_matches['unique_flag'] = unique_flag

unique_matches = org_matches[org_matches['unique_flag'] == 1]

print("final duplicates isolated --- %s seconds ---" % (time.time() - start_time))
print ""

#full_conc[full_conc.composite_match_score < 3].sort_values(by='composite_match_score', ascending=False)
unique_matches.sort_values(by='composite_match_score', ascending=False)

LOADING DATAFRAMES INTO MEMORY...


  interactivity=interactivity, compiler=compiler, result=result)


dataframes loaded --- 0.783999919891 seconds ---

PRE-PROCESSING: NORMALIZE STATES...
states normalized --- 1.10000014305 seconds ---

CREATING DICTIONARY OF ALL UNIQUE TOKENS W INCLUDE FLAG...
token dictionary created --- 12.7979998589 seconds ---

TOKENIZING LEFT DATAFRAME...
left dataframe tokenized --- 11.6840000153 seconds ---

TOKENIZING RIGHT DATAFRAME...
right dataframe tokenized --- 0.0130000114441 seconds ---

JOINING LEFT & RIGHT TOKEN KEYS...


MemoryError: 

In [3]:
left_keyed.shape

(1637244, 1)

In [74]:
full_conc.shape

(345622, 20)

In [64]:
left_keyed = left_keyed.reset_index(inplace=True)
left_keyed

In [120]:
right_keyed

Unnamed: 0_level_0,token,id
token_id,Unnamed: 1_level_1,Unnamed: 2_level_1
30381,501,1
81204,commons,1
130163,leaf,2
150776,womans,3
161424,place,3
86367,aarp,4
86367,aarp,5
116892,hospital,6
152281,abbot,6
154354,northwestern,6


In [62]:
joined[joined.id_l <> joined.id_r]

Unnamed: 0,token_l,id_l,token_r,id_r
0,501,1,turnanewleaf,2
0,12th,1,turnanewleaf,2
0,seattle,1,turnanewleaf,2
0,wa,1,turnanewleaf,2
0,98144,1,turnanewleaf,2
0,turnanewleaf,2,501,1
0,turnanewleaf,2,12th,1
0,turnanewleaf,2,seattle,1
0,turnanewleaf,2,wa,1
0,turnanewleaf,2,98144,1


In [56]:
full_conc[full_conc.id_l == '1']

Unnamed: 0,id_l count,id_r,id_l,l_org_name,l_city,l_state,l_postal_code,l_web,r_org_name,r_city,r_state,r_postal_code,r_web,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,state_match,zip_match,composite_match_score
0,10,2,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,A New Leaf,Tulsa,ok,74153,WWW.TURNANEWLEAF.ORG,0.39697,0.21,0.19,0.19,0,0,0.740227
6,4,3,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,A WOMANS PLACE,DOYLESTOWN,pa,18901,WWW.AWOMANSPLACE.ORG,0.603896,0.45,0.48,0.48,0,0,1.510422
13,3,4,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,AARP,NEW YORK,ny,10001,www.aarp.org,0.0,0.0,0.0,0.0,0,0,0.0


In [57]:
left_keyed[left_keyed.id == '1']

Unnamed: 0,token,id
0,501,1
1,commons,1
0,12th,1
1,1200,1
2,1101,1
0,seattle,1
0,wa,1
0,98144,1


In [59]:
right_keyed[right_keyed.id == '2']

Unnamed: 0,token,id
2,leaf,2
3,9810,2
4,42nd,2
1,tulsa,2
1,ok,2
0,turnanewleaf,2


In [90]:
full_conc[full_conc.id_l == '5']

Unnamed: 0,id_l count,id_r,id_l,l_org_name,l_city,l_state,l_postal_code,l_web,r_org_name,r_city,r_state,r_postal_code,r_web,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,state_match,zip_match,composite_match_score
3,3,2,5,AARP FOUNDATIION,WASHINGTON,dc,20049,www.aarp.org/foundation,A New Leaf,Tulsa,ok,74153,WWW.TURNANEWLEAF.ORG,0.495833,0.3,0.31,0.31,0,0,1.061875
9,4,3,5,AARP FOUNDATIION,WASHINGTON,dc,20049,www.aarp.org/foundation,A WOMANS PLACE,DOYLESTOWN,pa,18901,WWW.AWOMANSPLACE.ORG,0.550595,0.36,0.4,0.4,0,0,1.282946
17,3,4,5,AARP FOUNDATIION,WASHINGTON,dc,20049,www.aarp.org/foundation,AARP,NEW YORK,ny,10001,www.aarp.org,0.85,1.0,0.4,1.0,0,0,2.4375
95,6,6,5,AARP FOUNDATIION,WASHINGTON,dc,20049,www.aarp.org/foundation,ABBOT NORTHWESTERN HOSPITAL,MINNEAPOLIS,mn,55407,SEE STATEMENT FOR WEBSITE ADDRESS,0.520833,0.25,0.28,0.28,0,0,0.998125
108,2,8,5,AARP FOUNDATIION,WASHINGTON,dc,20049,www.aarp.org/foundation,ABILENE CHRISTIAN UNIVERSITY,ABILENE,tx,79699,www.acu.edu,0.527381,0.38,0.32,0.32,0,0,1.160536
33963,4,7,5,AARP FOUNDATIION,WASHINGTON,dc,20049,www.aarp.org/foundation,Abbott House,Irvington,ny,10533,WWW.ABBOTTHOUSE.ORG,0.430556,0.17,0.29,0.29,0,0,0.885417
59172,2,13,5,AARP FOUNDATIION,WASHINGTON,dc,20049,www.aarp.org/foundation,ACADEMY OF ARTS AND SCIENCES,PHILADELPHIA,pa,19103,WWW.AMACAD.ORG,0.642857,0.44,0.41,0.41,0,0,1.427143
132668,2,11,5,AARP FOUNDATIION,WASHINGTON,dc,20049,www.aarp.org/foundation,AC PORTLAND,PORTLAND,or,97217,ACPORTLAND.ORG,0.645412,0.45,0.37,0.37,0,0,1.376559


In [105]:
valid_tokens = []
for token, value in tokens_dct.items():
    if value == 1:
        valid_tokens.append(token)
        
len(valid_tokens)

74579

In [None]:
[len(x) for x in tokens_dct.values()]
sum([len(x) for x in tokens_dct.values()])

In [95]:
tokens_dct
length_key = len(tokens_dct[1])

KeyError: 1

In [52]:
vocabulary = np.array([w for w, b in tokens_dct.items() if b])
len(vocabulary)

74579

In [91]:
left_keyed[left_keyed.id == '1']

Unnamed: 0,token,id
0,501,1
1,commons,1
0,12th,1
1,1200,1
2,1101,1
0,seattle,1
0,wa,1
0,98144,1


In [19]:
duplicate_candidates.shape

(3938, 3)

In [24]:
full_conc.head()

Unnamed: 0,id_l count,id_r,id_l,l_org_name,l_city,l_state,l_postal_code,l_web,r_org_name,r_city,r_state,r_postal_code,r_web,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,state_match,zip_match,composite_match_score
0,3,10,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,ABILITY COUNTS INC,CORONA,ca,92879,,0.577441,0.45,0.34,0.34,0,0,1.280581
1,2,10,8,ABILENE CHRISTIAN UNIVERSITY,ABILENE,tx,79699,www.acu.edu,ABILITY COUNTS INC,CORONA,ca,92879,,0.640212,0.44,0.3,0.3,0,0,1.260159
2,4,2,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,A New Leaf,Tulsa,ok,74153,WWW.TURNANEWLEAF.ORG,0.39697,0.21,0.19,0.19,0,0,0.740227
3,2,2,4,AARP,NEW YORK,ny,10001,www.aarp.org,A New Leaf,Tulsa,ok,74153,WWW.TURNANEWLEAF.ORG,0.45,0.25,0.29,0.29,0,0,0.96
4,2,22,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,ACT,SAN DIEGO,ca,92121,WWW.ACT.ORG,0.474747,0.33,0.14,0.14,0,0,0.813561


In [25]:
full_conc[full_conc.id_l =='1']

Unnamed: 0,id_l count,id_r,id_l,l_org_name,l_city,l_state,l_postal_code,l_web,r_org_name,r_city,r_state,r_postal_code,r_web,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,state_match,zip_match,composite_match_score
0,3,10,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,ABILITY COUNTS INC,CORONA,ca,92879,,0.577441,0.45,0.34,0.34,0,0,1.280581
2,4,2,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,A New Leaf,Tulsa,ok,74153,WWW.TURNANEWLEAF.ORG,0.39697,0.21,0.19,0.19,0,0,0.740227
4,2,22,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,ACT,SAN DIEGO,ca,92121,WWW.ACT.ORG,0.474747,0.33,0.14,0.14,0,0,0.813561
11,2,3,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,A WOMANS PLACE,DOYLESTOWN,pa,18901,WWW.AWOMANSPLACE.ORG,0.603896,0.45,0.48,0.48,0,0,1.510422
13,4,4,1,501 COMMONS,SEATTLE,wa,98144,HTTP://WWW.501COMMONS.ORG,AARP,NEW YORK,ny,10001,www.aarp.org,0.0,0.0,0.0,0.0,0,0,0.0


In [6]:
#join left and right token keys
joined = left_keyed.join(right_keyed.set_index('token'), on='token', how='inner',lsuffix='_l',rsuffix='_r')

MemoryError: 

In [107]:
left_match_data.dtypes

id_l              int64
l_org_name       object
l_city           object
l_state          object
l_postal_code    object
l_web            object
dtype: object

In [None]:
l_conc = pd.merge(duplicate_candidates, left_match_data, on='id_l')

In [101]:
left_match_data[left_match_data.id_l == 101201]

Unnamed: 0,id_l,l_org_name,l_city,l_state,l_postal_code,l_web
464,101201,YUMA CONSERVATION DISTRICT,YUMA,co,80759,0


In [100]:
dups.head()

Unnamed: 0,id_l,id_r,id_l count
0,101201,130422,2
1,101201,26720,2
2,101201,73321,2
3,101201,85468,2
4,103563,221817,2


In [None]:
left_match_data.id_l = left_match_data.id_l.astype('str')
dups.id_l = duplicate_candidates.id_l.astype('str')

In [108]:
left_match_data.id_l = left_match_data.id_l.astype('str')
dups.id_l = duplicate_candidates.id_l.astype('str')

l_conc = pd.merge(dups,left_match_data,on='id_l')
l_conc.shape

(12348, 8)

In [96]:
dups

Unnamed: 0,id_l,id_r,id_l count
0,101201,130422,2
1,101201,26720,2
2,101201,73321,2
3,101201,85468,2
4,103563,221817,2
5,103563,59174,2
6,10518,14330,2
7,10518,67891,2
8,10518,67892,2
9,10518,77063,2


In [86]:
dups = duplicate_candidates[['id_l count']]
dups.reset_index(inplace=True)
dups

Unnamed: 0,id_l,id_r,id_l count
0,101201,130422,2
1,101201,26720,2
2,101201,73321,2
3,101201,85468,2
4,103563,221817,2
5,103563,59174,2
6,10518,14330,2
7,10518,67891,2
8,10518,67892,2
9,10518,77063,2


In [68]:
left_match_data.to_csv('left_match_data test.csv')

In [11]:
unique_matches.id_l.value_counts()

556       3
94532     2
99002     2
1370      2
46939     2
94860     2
91681     1
94523     1
16184     1
94517     1
90440     1
103138    1
19502     1
100013    1
82218     1
81722     1
90271     1
99230     1
46876     1
94269     1
99354     1
89208     1
76437     1
46606     1
88011     1
95116     1
99338     1
66735     1
89992     1
2055      1
         ..
94535     1
66627     1
94965     1
83192     1
94967     1
93557     1
18931     1
41457     1
28272     1
239       1
91500     1
91625     1
2022      1
82892     1
43459     1
80482     1
94870     1
2016      1
96990     1
27864     1
87381     1
85198     1
1741      1
100044    1
91211     1
48970     1
94536     1
37319     1
6086      1
103046    1
Name: id_l, Length: 61, dtype: int64

In [6]:
unique_matches[unique_matches.id_l == 556]

Unnamed: 0,id_l count,id_r,id_l,l_org_name,l_city,l_state,l_postal_code,l_web,r_org_name,r_city,r_state,r_postal_code,r_web,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,zip_match,composite_match_score,unique_flag
34225,8,80445,556,University of Florida,Gainesville,FL,32611,www.uf.edu,University of Florida Student Government Assoc...,Gainesville,FL,32611,,0.880769,1.0,0.58,1.0,1,3.595577,1
34278,3,94528,556,University of Florida,Gainesville,FL,32611,www.uf.edu,University of Florida TREEO Center,Gainesville,FL,32611,www.treeo.ufl.edu,0.923529,1.0,0.76,1.0,1,3.762647,1
34294,3,101781,556,University of Florida,Gainesville,FL,32611,www.uf.edu,The University of Florida Board of Trustees,Gainesville,FL,32611,,0.718346,1.0,0.66,1.0,1,3.53376,1


In [139]:
unique_matches.to_csv('university_dups.csv')

In [144]:
jaro_simularity('emma is happy','saleh is emma')

0.6185897435897436

In [145]:
fuzz_partial('emma is happy','saleh is emma')

0.48

In [146]:
fuzz_set('emma is happy','saleh is emma')

0.7

In [147]:
fuzz_sort('emma is happy','saleh is emma')

0.54

In [117]:
unique_matches.to_csv('SF org duplicate pairs.csv')

In [75]:
matched_records_test = matched_records.reset_index()

match_tuples = []
for tup in matched_records_test[['id_l','id_r']]:
    match_tuples.append(tup)

matched_records_test[['id_l','id_r']]

Unnamed: 0,id_l,id_r
0,0012K00001XDfP3QAL,0012K00001XDfP3QAL
1,0012K00001XDfP3QAL,001A00000134Um0IAE
2,0012K00001XDfP4QAL,0012K00001XDfP4QAL
3,0012K00001XDfP4QAL,001A00000134hHYIAY
4,0012K00001XDfP4QAL,001A00000134hHlIAI
5,0012K00001XDfP4QAL,001A00000134hg6IAA
6,0012K00001XDfP4QAL,001A00000134hghIAA
7,0012K00001XDfP4QAL,001A0000017s1huIAA
8,0012K00001XDfP4QAL,001A000001UoXO8IAN
9,0012K00001XDfP4QAL,001A000001UoqL4IAJ


In [82]:
duplicate_candidates.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,id_l count,id_r,id_l
id_l,id_r,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0012K00001XDfP3QAL,001A00000134Um0IAE,3,001A00000134Um0IAE,0012K00001XDfP3QAL
0012K00001XDfP4QAL,001A00000134hHYIAY,2,001A00000134hHYIAY,0012K00001XDfP4QAL
0012K00001XDfP4QAL,001A00000134hHlIAI,2,001A00000134hHlIAI,0012K00001XDfP4QAL
0012K00001XDfP4QAL,001A00000134hg6IAA,4,001A00000134hg6IAA,0012K00001XDfP4QAL
0012K00001XDfP4QAL,001A00000134hghIAA,2,001A00000134hghIAA,0012K00001XDfP4QAL


In [81]:
duplicate_candidates.shape

(1201994, 3)

In [112]:
#test this.  may need to make more efficient but I think it should work
start_time = time.time()
print "dealing with tuples..."

match_tuples = list(zip(org_matches['id_l'], org_matches['id_r']))

unique_match_tuples = []
seen = set()
for tup in match_tuples:
    s = tuple(sorted(tup))
    if s not in seen:
        seen.add(s)
        unique_match_tuples.append(tup)

unique_flag = []
for tup in match_tuples:
    if tup in unique_match_tuples:
        unique_flag.append(1)
    else:
        unique_flag.append(0)
        
org_matches['unique_flag'] = unique_flag

unique_matches = org_matches[org_matches['unique_flag'] == 1]

print("tuples loaded --- %s seconds ---" % (time.time() - start_time))
print ""
unique_matches.shape

dealing with tuples...
tuples loaded --- 18.8469998837 seconds ---



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(13288, 19)

In [110]:
org_matches['unique_flag'] = unique_flag
org_matches.unique_flag.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


1    13288
0    13284
Name: unique_flag, dtype: int64

In [111]:
org_matches.to_csv('SF org matches with unique flag for test.csv')

In [104]:
match_tuples

[('0012K00001XDfP3QAL', '001A00000134Um0IAE'),
 ('0012K00001XDfP4QAL', '001A00000134hHYIAY'),
 ('0012K00001XDfP4QAL', '001A00000134hHlIAI'),
 ('0012K00001XDfP4QAL', '001A00000134hg6IAA'),
 ('0012K00001XDfP4QAL', '001A00000134hghIAA'),
 ('0012K00001XDfP4QAL', '001A0000017s1huIAA'),
 ('0012K00001XDfP4QAL', '001A000001UoXO8IAN'),
 ('0012K00001XDfP4QAL', '001A000001UoqL4IAJ'),
 ('0012K00001XDfP6QAL', '001A00000134TyKIAU'),
 ('0012K00001XDfP6QAL', '001A00000134Us2IAE'),
 ('0012K00001XDfP6QAL', '001A00000134Y88IAE'),
 ('0012K00001XDfP6QAL', '001A00000134hjbIAA'),
 ('0012K00001XDfP6QAL', '001A00000134lURIAY'),
 ('0012K00001XDfP6QAL', '001A00000134mMzIAI'),
 ('0012K00001XDfP6QAL', '001A00000134mNWIAY'),
 ('0012K00001XDfP6QAL', '001A00000134qExIAI'),
 ('0012K00001XDfP6QAL', '001A00000134rUOIAY'),
 ('0012K00001XDfP6QAL', '001A00000134tNWIAY'),
 ('0012K00001XDfP6QAL', '001A00000134utvIAA'),
 ('0012K00001XDfP6QAL', '001A00000134uu1IAA'),
 ('0012K00001XDfP6QAL', '001A00000134uu2IAA'),
 ('0012K00001

In [103]:
duplicate_candidates

Unnamed: 0_level_0,Unnamed: 1_level_0,id_l count,id_r,id_l,unique_flag
id_l,id_r,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0012K00001XDfP3QAL,001A00000134Um0IAE,3,001A00000134Um0IAE,0012K00001XDfP3QAL,1
0012K00001XDfP4QAL,001A00000134hHYIAY,2,001A00000134hHYIAY,0012K00001XDfP4QAL,1
0012K00001XDfP4QAL,001A00000134hHlIAI,2,001A00000134hHlIAI,0012K00001XDfP4QAL,1
0012K00001XDfP4QAL,001A00000134hg6IAA,4,001A00000134hg6IAA,0012K00001XDfP4QAL,1
0012K00001XDfP4QAL,001A00000134hghIAA,2,001A00000134hghIAA,0012K00001XDfP4QAL,1
0012K00001XDfP4QAL,001A0000017s1huIAA,15,001A0000017s1huIAA,0012K00001XDfP4QAL,1
0012K00001XDfP4QAL,001A000001UoXO8IAN,4,001A000001UoXO8IAN,0012K00001XDfP4QAL,1
0012K00001XDfP4QAL,001A000001UoqL4IAJ,6,001A000001UoqL4IAJ,0012K00001XDfP4QAL,1
0012K00001XDfP6QAL,001A00000134TyKIAU,4,001A00000134TyKIAU,0012K00001XDfP6QAL,1
0012K00001XDfP6QAL,001A00000134Us2IAE,2,001A00000134Us2IAE,0012K00001XDfP6QAL,1


In [102]:
duplicate_candidates.unique_flag.value_counts()

1    600997
0    600997
Name: unique_flag, dtype: int64

In [89]:
match_tuples

[('0012K00001XDfP3QAL', '001A00000134Um0IAE'),
 ('0012K00001XDfP4QAL', '001A00000134hHYIAY'),
 ('0012K00001XDfP4QAL', '001A00000134hHlIAI'),
 ('0012K00001XDfP4QAL', '001A00000134hg6IAA'),
 ('0012K00001XDfP4QAL', '001A00000134hghIAA'),
 ('0012K00001XDfP4QAL', '001A0000017s1huIAA'),
 ('0012K00001XDfP4QAL', '001A000001UoXO8IAN'),
 ('0012K00001XDfP4QAL', '001A000001UoqL4IAJ'),
 ('0012K00001XDfP6QAL', '001A00000134TyKIAU'),
 ('0012K00001XDfP6QAL', '001A00000134Us2IAE'),
 ('0012K00001XDfP6QAL', '001A00000134Y88IAE'),
 ('0012K00001XDfP6QAL', '001A00000134hjbIAA'),
 ('0012K00001XDfP6QAL', '001A00000134lURIAY'),
 ('0012K00001XDfP6QAL', '001A00000134mMzIAI'),
 ('0012K00001XDfP6QAL', '001A00000134mNWIAY'),
 ('0012K00001XDfP6QAL', '001A00000134qExIAI'),
 ('0012K00001XDfP6QAL', '001A00000134rUOIAY'),
 ('0012K00001XDfP6QAL', '001A00000134tNWIAY'),
 ('0012K00001XDfP6QAL', '001A00000134utvIAA'),
 ('0012K00001XDfP6QAL', '001A00000134uu1IAA'),
 ('0012K00001XDfP6QAL', '001A00000134uu2IAA'),
 ('0012K00001

In [90]:
unique_match_tuples

[('0012K00001XDfP3QAL', '001A00000134Um0IAE'),
 ('0012K00001XDfP4QAL', '001A00000134hHYIAY'),
 ('0012K00001XDfP4QAL', '001A00000134hHlIAI'),
 ('0012K00001XDfP4QAL', '001A00000134hg6IAA'),
 ('0012K00001XDfP4QAL', '001A00000134hghIAA'),
 ('0012K00001XDfP4QAL', '001A0000017s1huIAA'),
 ('0012K00001XDfP4QAL', '001A000001UoXO8IAN'),
 ('0012K00001XDfP4QAL', '001A000001UoqL4IAJ'),
 ('0012K00001XDfP6QAL', '001A00000134TyKIAU'),
 ('0012K00001XDfP6QAL', '001A00000134Us2IAE'),
 ('0012K00001XDfP6QAL', '001A00000134Y88IAE'),
 ('0012K00001XDfP6QAL', '001A00000134hjbIAA'),
 ('0012K00001XDfP6QAL', '001A00000134lURIAY'),
 ('0012K00001XDfP6QAL', '001A00000134mMzIAI'),
 ('0012K00001XDfP6QAL', '001A00000134mNWIAY'),
 ('0012K00001XDfP6QAL', '001A00000134qExIAI'),
 ('0012K00001XDfP6QAL', '001A00000134rUOIAY'),
 ('0012K00001XDfP6QAL', '001A00000134tNWIAY'),
 ('0012K00001XDfP6QAL', '001A00000134utvIAA'),
 ('0012K00001XDfP6QAL', '001A00000134uu1IAA'),
 ('0012K00001XDfP6QAL', '001A00000134uu2IAA'),
 ('0012K00001

In [69]:
duplicate_candidates[['id_l','id_r']]

Unnamed: 0_level_0,Unnamed: 1_level_0,id_l,id_r
id_l,id_r,Unnamed: 2_level_1,Unnamed: 3_level_1
0012K00001XDfP3QAL,001A00000134Um0IAE,0012K00001XDfP3QAL,001A00000134Um0IAE
0012K00001XDfP4QAL,001A00000134hHYIAY,0012K00001XDfP4QAL,001A00000134hHYIAY
0012K00001XDfP4QAL,001A00000134hHlIAI,0012K00001XDfP4QAL,001A00000134hHlIAI
0012K00001XDfP4QAL,001A00000134hg6IAA,0012K00001XDfP4QAL,001A00000134hg6IAA
0012K00001XDfP4QAL,001A00000134hghIAA,0012K00001XDfP4QAL,001A00000134hghIAA
0012K00001XDfP4QAL,001A0000017s1huIAA,0012K00001XDfP4QAL,001A0000017s1huIAA
0012K00001XDfP4QAL,001A000001UoXO8IAN,0012K00001XDfP4QAL,001A000001UoXO8IAN
0012K00001XDfP4QAL,001A000001UoqL4IAJ,0012K00001XDfP4QAL,001A000001UoqL4IAJ
0012K00001XDfP6QAL,001A00000134TyKIAU,0012K00001XDfP6QAL,001A00000134TyKIAU
0012K00001XDfP6QAL,001A00000134Us2IAE,0012K00001XDfP6QAL,001A00000134Us2IAE


In [38]:
#calculate composite match score based on component scores and weights
full_conc['composite_match_score'] = full_conc.jaro_score * name_weight \
+ full_conc.fuzz_partial_score * name_weight \
+ full_conc.fuzz_sort_score * name_weight \
+ full_conc.fuzz_set_score * name_weight \

In [44]:
full_conc.sort_values(by='composite_match_score',ascending=False).head(15)

Unnamed: 0,id_l count,id_r,id_l,l_org_name,l_city,l_state,l_postal_code,l_web,r_org_name,r_city,r_state,r_postal_code,r_web,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,composite_match_score
1011610,7,001A000001UpgsDIAR,001A000001VTl5IIAT,AMERICAN SOCIETY FOR DERMATOLOGIC SURGERY,"ROLLING MEADOWS, ROLLING MEADOWS, ROLLING MEAD...","IL, IL, IL, IL, IL, IL, IL","60008, 60008, 60008, 60008, 60008, 60008, 60008","tcolin@asds.net, rlegoo@asds.net, jkremer@asds...",AMERICAN SOCIETY FOR DERMATOLOGIC SURGER,ROLLING MEADOWS,IL,60008,akuhn@asds.net,0.995122,1.0,0.99,0.99,2.981341
1011637,7,001A000001VTl5IIAT,001A000001UpgsDIAR,AMERICAN SOCIETY FOR DERMATOLOGIC SURGER,ROLLING MEADOWS,IL,60008,akuhn@asds.net,AMERICAN SOCIETY FOR DERMATOLOGIC SURGERY,"ROLLING MEADOWS, ROLLING MEADOWS, ROLLING MEAD...","IL, IL, IL, IL, IL, IL, IL","60008, 60008, 60008, 60008, 60008, 60008, 60008","tcolin@asds.net, rlegoo@asds.net, jkremer@asds...",0.995122,1.0,0.99,0.99,2.981341
934351,2,001A000001P6kmxIAB,001A000001P62JDIAZ,NAVY-MARINE CORPS RELIEF SOCIETY-SURVEY1,ARLINGTON,VA,22203,hr@nmcrs.org,NAVY-MARINE CORPS RELIEF SOCIETY-SURVEY,ARLINGTON,VA,22203,ann.carpenter@nmcrs.org,0.995,1.0,0.99,0.99,2.98125
922804,2,001A000001P62JDIAZ,001A000001P6kmxIAB,NAVY-MARINE CORPS RELIEF SOCIETY-SURVEY,ARLINGTON,VA,22203,ann.carpenter@nmcrs.org,NAVY-MARINE CORPS RELIEF SOCIETY-SURVEY1,ARLINGTON,VA,22203,hr@nmcrs.org,0.995,1.0,0.99,0.99,2.98125
1067533,2,001A000001P5e9DIAR,001A000001P5e9CIAR,FLORIDA SOCIETY OF ASSN EXECS-SURVEY,TALLAHASSEE,FL,32308,judy@fsae.org,FLORIDA SOCIETY OF ASSN EXECS-SURVEY1,TALLAHASSEE,FL,32308,deanna@fsae.org,0.994595,1.0,0.99,0.99,2.980946
1067520,2,001A000001P5e9CIAR,001A000001P5e9DIAR,FLORIDA SOCIETY OF ASSN EXECS-SURVEY1,TALLAHASSEE,FL,32308,deanna@fsae.org,FLORIDA SOCIETY OF ASSN EXECS-SURVEY,TALLAHASSEE,FL,32308,judy@fsae.org,0.994595,1.0,0.99,0.99,2.980946
1172720,2,001A000001P62E9IAJ,001A000001P6ku2IAB,SCHOOL NUTRITION ASSOCIATION-SURVEY,NATIONAL HARBOR,MD,20745,pmontague@schoolnutrition.org,SCHOOL NUTRITION ASSOCIATION-SURVEY1,NATIONAL HARBOR,MD,20745,pmontague@schoolnutrition.org,0.994444,1.0,0.99,0.99,2.980833
1172739,2,001A000001P6ku2IAB,001A000001P62E9IAJ,SCHOOL NUTRITION ASSOCIATION-SURVEY1,NATIONAL HARBOR,MD,20745,pmontague@schoolnutrition.org,SCHOOL NUTRITION ASSOCIATION-SURVEY,NATIONAL HARBOR,MD,20745,pmontague@schoolnutrition.org,0.994444,1.0,0.99,0.99,2.980833
925347,3,001A000001P6FivIAF,001A000001P628oIAB,ALLIANCE OF COMMUNITY HEALTH PLANS,WASHINGTON,DC,20006,cmoreschi@achp.org,ALLIANCE OF COMMUNITY HEALTH PLAN,WASHINGTON,DC,20006,mwilson@achp.org,0.994118,1.0,0.99,0.99,2.980588
920884,3,001A000001P628oIAB,001A000001P6FivIAF,ALLIANCE OF COMMUNITY HEALTH PLAN,WASHINGTON,DC,20006,mwilson@achp.org,ALLIANCE OF COMMUNITY HEALTH PLANS,WASHINGTON,DC,20006,cmoreschi@achp.org,0.994118,1.0,0.99,0.99,2.980588


In [105]:
org_matches = full_conc[full_conc.composite_match_score > 2.5]

org_matches.shape

(26572, 18)

In [47]:
unique_conc = pd.unique(full_conc[['id_l', 'id_r']].values.ravel('K'))

In [49]:
unique_conc

array(['0012K00001XDfP3QAL', '0012K00001XDfPnQAL', '0012K00001XDfQSQA1',
       ..., '001A000001WXPDNIA5', '001A000001WXPDOIA5',
       '001A000001WXPIXIA5'], dtype=object)

In [50]:
l = [(2,2),(2,3),(1,4),(2,2),(3,2)]
list(set(l))

[(3, 2), (2, 3), (1, 4), (2, 2)]

In [93]:
l

[(2, 2), (2, 3), (1, 4), (2, 2), (3, 2)]

In [95]:
b

[(2, 2), (2, 3), (1, 4)]

In [94]:
unique_flag = []
for tup in l:
    if tup in b:
        unique_flag.append(1)
    else:
        unique_flag.append(0)

unique_flag

[1, 1, 1, 1, 0]

In [108]:
b = []
seen = set()
for t in l:
    s = tuple(sorted(t))
    if s not in seen:
        seen.add(s)
        b.append(t)

b

[(2, 2), (2, 3), (1, 4)]

In [19]:
left_df.head()

Unnamed: 0,0012K00001XE80lQAD,"Catholic Charities, Diocese of St. Petersburg, Inc.",Association/Nonprofit,"jwayne@ccdosp.org, cmartinez@ccdosp.org","1213 16TH ST N, 1213 16TH ST N","SAINT PETERSBURG, SAINT PETERSBURG","FL, FL","33705-1032, 33705-1032","7278931313, 7278931313"
0,0012K00001XE80mQAD,"Carter Agency, Inc",Association/Nonprofit,dennisc@carteragcy.com,208 N MAPLE ST,CRESTON,IA,50801-2361,641-782-8516
1,0012K00001XE80qQAD,"CASCADE COUNTY, MONTANA",Local Govt,bfogerty@cascadecountymt.gov,325 2ND AVE N RM 111,GREAT FALLS,MT,59401-2517,4064546810
2,0012K00001XE80rQAD,Castro & Company LLC,Accounting Firm,rvellocido@castroco.com,1711 KING ST STE C,ALEXANDRIA,VA,22314-2740,7032294440
3,0012K00001XE80sQAD,COLUMBIA BASIN HEALTH ASSOCIATION,Association/Nonprofit,"alvat@cbha.org, lbarbour@cbhc.org, fcornelia@c...","140 E MAIN ST, 1410 GRANT ST. STE A301, GEORGE...","OTHELLO, DENVER, DENVER","WA, CO, CO","99344-1040, 80203, 80203","5094885256, 3038327594, 3038327594"
4,0012K00001XE80tQAD,CAYO LLC,Construction Company,wcs@cayo.us,1400 EVERMAN PKWY STE 127,FT WORTH,TX,76140-5036,8175686828


In [55]:
s = pd.DataFrame(['DC', 'MD', 'A'], index=["a", "b", 'c'])
t = pd.DataFrame([20005, 'MD'], index=["a", "b"])

In [56]:
joined = s.join(t,how='inner',lsuffix='_l',rsuffix='_r')

In [57]:
joined.head()

Unnamed: 0,0_l,0_r
a,DC,20005
b,MD,MD


In [58]:
def record_match(left_value,right_value):
    if len(left_value) < 2 or len(right_value) < 2:
        return 0
    if left_value == right_value:
        return 1
    else:
        return 0
    
joined['match'] = joined.apply(lambda x: record_match(x['0_l'], x['0_r']), axis=1)

TypeError: ("object of type 'int' has no len()", u'occurred at index a')

In [13]:
full_conc.to_csv('cupola_org_dup_output.csv')

In [15]:
full_conc.sort_values(by='id_l')

Unnamed: 0,id_l count,id_r,id_l,l_org_name,l_city,l_state,l_postal_code,l_web,l_acronym,l_alt_name,...,r_postal_code,r_web,r_acronym,r_alt_name,jaro_score,fuzz_partial_score,fuzz_sort_score,fuzz_set_score,zip_match,composite_match_score
0,2,49648,101,Averett University,Danville,VA,24541,,,,...,24541,americanhumorstudiesassociation.wordpress.com,AHSA,,0.589869,0.39,0.35,0.35,1,2.259902
1,2,25788,239,California State University (Fullerton),Fullerton,CA,92834-9480,www.fullerton.edu,,,...,92834,www.fullerton.edu,,,0.989744,0.97,1.00,1.00,1,3.969808
2,2,50156,284,Catholic University of America (DC),Washington,DC,20064,catholic.edu,CUA,,...,20064,cba.cua.edu,CBA,,0.624076,0.55,0.57,0.73,1,2.855557
4,2,85803,284,Catholic University of America (DC),Washington,DC,20064,catholic.edu,CUA,,...,H9S 5J9,www.cua.org,CUA,,0.605530,0.35,0.34,0.34,0,1.226647
1353,2,54331,321,University of Chicago (IL),Washington,DC,20003,www.uchicago.edu,,,...,20003,nrcma.org,NRCMA,,0.448210,0.35,0.30,0.30,1,2.048658
9,2,25905,321,University of Chicago (IL),Washington,DC,20003,www.uchicago.edu,,,...,20003,www.bcanda.com,,,0.511580,0.31,0.34,0.34,1,2.126185
8,2,25905,321,University of Chicago (IL),Washington,DC,20003,www.uchicago.edu,,,...,20003,www.bcanda.com,,,0.511580,0.31,0.34,0.34,1,2.126185
1352,2,54331,321,University of Chicago (IL),Washington,DC,20003,www.uchicago.edu,,,...,20003-1867,nrcma.org,NRCMA,,0.448210,0.35,0.30,0.30,1,2.048658
2698,3,94526,446,University of Dayton (OH),Dayton,OH,45469,udayton.edu,UD,,...,45469,udayton.edu/business/academics/centers/davisce...,DCPM UD,,0.840414,0.84,0.54,0.93,1,3.362810
2696,2,94525,446,University of Dayton (OH),Dayton,OH,45469,udayton.edu,UD,,...,45469-2316,www.udayton.edu/artssciences/academics/mathema...,,,0.859304,0.84,0.64,0.93,1,3.451978


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd

start_time = time.time()
print "CHECKING FOR STATE CODE MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#example df
d = {'id': [3,6], 'Org_Name': ['Acme Co Inc.', 'Buy Cats Here Inc'], 'Address': ['123 Hammond Lane, Washington, DC', 'Washington, DC 20456']}
left_df = pd.DataFrame(data=d)

#example token dictionary
tokens_dct = {
    'acme':1,
    'co':1,
    'inc':0,
    'buy':1,
    'cats':1,
    'here':1,
    '123':1,
    'hammond':1,
    'lane':0,
    'washington':1,
    'dc':1,
    '20456':1
}

# since you have a predefined vocabulary, you can fix it here
vocabulary = np.array([w for w, b in tokens_dct.items() if b])
cv = CountVectorizer( vocabulary=vocabulary)

frame_list = []
for colname in ['Org_Name', 'Address']:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T, columns = ['token', 'id']))

left_keyed = pd.concat(frame_list)

left_keyed

print("name simularity scored --- %s seconds ---" % (time.time() - start_time))
print ""

CHECKING FOR STATE CODE MATCHES...
name simularity scored --- 0.00300002098083 seconds ---



In [31]:
vocabulary = np.array([w for w, b in tokens_dct.items() if b ==1])
cv = CountVectorizer( vocabulary=vocabulary)

frame_list = []
for colname in left_tokenized_columns:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T, columns = ['token', 'id']))


KeyError: 'id'

In [48]:
main_tokens_df['count'].value_counts()

1    168306
Name: count, dtype: int64

In [43]:
unique_tokens_df.flag.value_counts()

1    19254
0     8859
Name: flag, dtype: int64

In [78]:
start_time = time.time()
print "LOADING DATAFRAMES INTO MEMORY..."

df = pd.read_csv('RecipientTableUpdated_1.30.19_utf.csv',keep_default_na=False)

#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
#define column names
org_id = 'Recipient_ID'
org_name = 'RecipientName'
org_address1 = 'AddressLine1Txt'
org_city = 'CityNm'
org_state = 'StateAbbreviationCd'
org_zip = 'Zip'
org_web = 'WebsiteAddressTxt'

#set parameters
token_match_min = 2 # minimum number of matched tokens to be considered a match
token_limiter = .9996 # percent of non-single tokens to tokenize, where rare tokens are at the bottom and common at the top
name_weight = .75 #note that this is really .75 * 4 because there are 4 org name simularity metrics
state_weight = 1
zip_weight = 1
phone_weight = 1
composite_score_min = 3.5 #minimum composite match score to be considered a match
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

df.rename(columns={org_id:'id',org_name:'org_name',org_address1:'address1',org_city:'city',org_state:'state',org_zip:'postal_code',org_web:'web'}, inplace=True)

print("dataframes loaded --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "PRE-PROCESSING: NORMALIZE STATES..."
#normalize state codes
state_lkup = pd.read_csv('state_lkup.csv',keep_default_na=False)

from collections import defaultdict
state_dict = defaultdict(list)
for state, acronym in zip(state_lkup.state.values,state_lkup.acronym.values):
    state_dict[state].append(acronym)

df.state = df.state.str.lower()
df.state = df.state.replace(state_dict)

print("states normalized --- %s seconds ---" % (time.time() - start_time))
print ""

#when you finish pre-processing, this step to split into left and right dataframes:
left_df = df.copy()
left_df.rename(columns={'org_name':'l_org_name','address1':'l_address1','city':'l_city','state':'l_state','postal_code':'l_postal_code','web':'l_web'}, inplace=True)
right_df = df.copy()
right_df.rename(columns={'org_name':'r_org_name','address1':'r_address1','city':'r_city','state':'r_state','postal_code':'r_postal_code','web':'r_web'}, inplace=True)

start_time = time.time()
print "CREATING DICTIONARY OF ALL UNIQUE TOKENS W INCLUDE FLAG..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
   
# for the left dataset
left_tokenized_columns = [
    'l_org_name',
    #'l_acronym',
    #'l_alt_name',
    'l_address1',
    #'l_address2',
    'l_city', 
    'l_state', 
    'l_postal_code',
    'l_web' 
    #'l_phone'
]

# and right
right_tokenized_columns = [
    'r_org_name',
    #'r_acronym',
    #'r_alt_name',
    'r_address1',
    #'r_address2',
    'r_city', 
    'r_state', 
    'r_postal_code',
    'r_web' 
    #'r_phone'
]

# lowercase the name and split on spaces, remove non-alphanumeric chars
def tokenize_name(name):
    if isinstance(name, basestring) is True:
        clean_name = ''.join(c if c.isalnum() else ' ' for c in name)
        return clean_name.lower().split()
    else:
        return name

unique_tokens = [] #we treat state and zips differently because we want to include ALl state and zip tokens as these are unique

#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< add chosen unique columns here from each df
for word in left_df['l_state']:
    if isinstance(word, float) is False:
        unique_tokens.append(tokenize_name(str(word)))

for word in left_df['l_postal_code']:
    if isinstance(word, float) is False:
        unique_tokens.append(tokenize_name(str(word)))

#for word in left_df['l_acronym']:
#    if isinstance(word, float) is False:
#        unique_tokens.append(tokenize_name(str(word)))

#for word in left_df['l_phone']:
#    if isinstance(word, float) is False:
#        unique_tokens.append(tokenize_name(str(word)))
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        
unique_flat_list = [item for sublist in unique_tokens for item in sublist]

#instantiate counter and use to count word frequencies in flat list
u_cnt = Counter()
for token in unique_flat_list:
    u_cnt[token] += 1
    
u_cnt_dict = dict(u_cnt) #convert to dictionary

unique_tokens_df = pd.DataFrame(u_cnt_dict.items(), columns=['token', 'count'])
unique_tokens_df = unique_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

unique_token_flag = []
for index, value in enumerate(unique_tokens_df['count']):
    if value == 1:
        unique_token_flag.append(0)  #for any tokens occuring only once, we exclude
    else:
        unique_token_flag.append(1)

unique_tokens_df['flag'] = unique_token_flag        
        
all_other_words = [] #creating a list of all words used in just ONE of the dfs in selected columns, for counting to determine rarity

for word in left_df['l_org_name']:
    if isinstance(word, float) is False:
        all_other_words.append(tokenize_name(str(word)))

#for word in left_df['l_alt_name']:
#    if isinstance(word, float) is False:
#        all_other_words.append(tokenize_name(str(word)))

for word in left_df['l_address1']:
    if isinstance(word, float) is False:
        all_other_words.append(tokenize_name(str(word)))

for word in left_df['l_city']:
    if isinstance(word, float) is False:
        all_other_words.append(tokenize_name(str(word)))

for word in left_df['l_web']:
    if isinstance(word, float) is False:
        all_other_words.append(tokenize_name(str(word)))
    
flat_list = [item for sublist in all_other_words for item in sublist] #flatten list so it can be counted
#new_tokens = list(set(flat_list) - set(unique_flat_list)) #getting a list of tokens which are NOT included in the first flat_list

#instantiate counter and use to count word frequencies in flat list
cnt = Counter()
for token in flat_list:
    cnt[token] += 1
    
cnt_dict = dict(cnt) #convert to dictionary

main_tokens_df = pd.DataFrame(cnt_dict.items(), columns=['token', 'count'])
main_tokens_df = main_tokens_df.sort_values(by='count')  #sorting by count so that we can take the first x% of tokens by rare frequency

main_token_flag = []
for index, value in enumerate(main_tokens_df['count']):
    if value == 1:
        main_token_flag.append(0)  #for any tokens occuring only once, we exclude
    elif index < int(main_tokens_df.shape[0] * token_limiter): #important line, we are cutting the top x% of frequently occuring tokens
        main_token_flag.append(1)
    else:
        main_token_flag.append(0)  #for the most common tokens, we exclude

main_tokens_df['flag'] = main_token_flag

all_tokens = pd.concat([unique_tokens_df, main_tokens_df])

all_tokens.drop('count',axis=1,inplace=True)
all_tokens['flag'] = all_tokens.flag.astype(int) #converting flags to int
tokens_dct = all_tokens.to_dict('split') #converting tokens_df to dictionary
tokens_dct=dict(tokens_dct['data']) #honestly can't remember why this works, something to do with conversion to dictionary

print("token dictionary created --- %s seconds ---" % (time.time() - start_time))
print ""
                           
start_time = time.time()
print "TOKENIZING LEFT DATAFRAME..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#NOTE: tokenizing the dataframes is the most computationally expensive part of this script.  

#tokenize left dataframe
vocabulary = np.array([w for w, b in tokens_dct.items() if b ==1]) #this works even without the ==1 and I don't know why
cv = CountVectorizer( vocabulary=vocabulary)

frame_list = []
for colname in left_tokenized_columns:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()

    frame_list.append(pd.DataFrame(np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T, columns = ['token', 'id']))

left_keyed = pd.concat(frame_list)
    
print("left dataframe tokenized --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "TOKENIZING RIGHT DATAFRAME..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#tokenize right dataframe
right_keyed = left_keyed.copy()
    
print("right dataframe tokenized --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "JOINING LEFT & RIGHT TOKEN KEYS..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#join left and right token keys
left_keyed.set_index('token',inplace=True)
right_keyed.set_index('token',inplace=True)
joined = left_keyed.join(right_keyed, how='inner',lsuffix='_l',rsuffix='_r')

print("left & right token keys joined --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "GROUPING BY UNIQUE LEFT & RIGHT IDS & GETTING COUNT OF MATCHED TOKENS..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#aggregate to get a count of unique id_l and id_r pairs based on joined tokens, which is used to assess match strength
aggregations = {
    'id_l': 'count'
}

keys_grouped = joined.groupby(by=['id_l', 'id_r']).agg(aggregations)
keys_grouped.rename(columns={'id_l':'id_l count'}, inplace=True)

print("keys grouped & counted --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "REDUCING DOWN TO SUFFICIENT MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#here we are reducing down to those match candidates which meet the minimum threshold of matched tokens
#and crucially, we are excluding any matches to SELF
matched_records = keys_grouped[keys_grouped['id_l count'] >= token_match_min]

#adding a column for the id_r values, which are initially excluded becuase we are merging using the id_l
right_ids = matched_records.index.get_level_values('id_r')
matched_records['id_r'] = right_ids

#adding left Ids because I think we need them, probably a way to check index but meh
left_ids = matched_records.index.get_level_values('id_l')
matched_records['id_l'] = left_ids

duplicate_candidates = matched_records[matched_records['id_l'] <> matched_records['id_r']]

print("matches reduced --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "CONCATENATING MATCH IDS WITH ORIGINAL DATA..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#renaming ids to be linked to source.  this happens here and not the beginning so as to play nice with the prepare join keys generator
left_df.rename(columns={'id':'id_l'}, inplace=True)
right_df.rename(columns={'id':'id_r'}, inplace=True)

#creating left/right dataframes which contain only the most relevant details for reviewing the match strengths
left_match_data = left_df[['id_l','l_org_name','l_city','l_state','l_postal_code','l_web']].copy()
right_match_data = right_df[['id_r','r_org_name','r_city','r_state','r_postal_code','r_web']].copy()

#making sure keys are str, results in blank df otherwise
left_match_data.id_l = left_match_data.id_l.astype('str')
right_match_data.id_r = right_match_data.id_r.astype('str')
duplicate_candidates.id_l = duplicate_candidates.id_l.astype('str')
duplicate_candidates.id_r = duplicate_candidates.id_r.astype('str')

#merging matched_records df with original record data for ease of review
l_conc = pd.merge(duplicate_candidates, left_match_data, on='id_l')
full_conc = pd.merge(l_conc, right_match_data, on='id_r')

LOADING DATAFRAMES INTO MEMORY...
dataframes loaded --- 0.72200012207 seconds ---

PRE-PROCESSING: NORMALIZE STATES...
states normalized --- 2.17199993134 seconds ---

CREATING DICTIONARY OF ALL UNIQUE TOKENS W INCLUDE FLAG...
token dictionary created --- 8.1890001297 seconds ---

TOKENIZING LEFT DATAFRAME...
left dataframe tokenized --- 8.12400007248 seconds ---

TOKENIZING RIGHT DATAFRAME...
right dataframe tokenized --- 0.0279998779297 seconds ---

JOINING LEFT & RIGHT TOKEN KEYS...


MemoryError: 

In [77]:
left_keyed.set_index('token',inplace=True)
left_keyed[left_keyed.id =='1']

KeyError: 'token'

In [113]:
joined = left_keyed.join(right_keyed, how='inner',lsuffix='_l',rsuffix='_r')

print("left & right token keys joined --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "GROUPING BY UNIQUE LEFT & RIGHT IDS & GETTING COUNT OF MATCHED TOKENS..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#aggregate to get a count of unique id_l and id_r pairs based on joined tokens, which is used to assess match strength
aggregations = {
    'id_l': 'count'
}

keys_grouped = joined.groupby(by=['id_l', 'id_r']).agg(aggregations)
keys_grouped.rename(columns={'id_l':'id_l count'}, inplace=True)

print("keys grouped & counted --- %s seconds ---" % (time.time() - start_time))
print ""

start_time = time.time()
print "REDUCING DOWN TO SUFFICIENT MATCHES..." #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#here we are reducing down to those match candidates which meet the minimum threshold of matched tokens
#and crucially, we are excluding any matches to SELF
matched_records = keys_grouped[keys_grouped['id_l count'] >= token_match_min]

(1637244, 2)

In [117]:
joined[joined.token_l == 'cingular']

Unnamed: 0,token_l,id_l,token_id_l,token_r,id_r,token_id_r
6224,cingular,2733,47627,cingular,2733,47627
6224,cingular,2733,47627,us,2921,23234
6224,cingular,2733,47627,harrisburg,5656,55512
6224,cingular,2733,47627,ia,6417,16433
6224,cingular,2733,47627,99501,7211,2273
6224,cingular,2733,47627,archeworks,28487,136556
6226,cingular,2734,47627,cingular,2734,47627
6226,cingular,2734,47627,marquette,2922,153399
6226,cingular,2734,47627,dallas,5658,83591
6226,cingular,2734,47627,sd,6419,109385


In [118]:
left_keyed

Unnamed: 0,token,id,token_id
0,501,1,30381
1,commons,1,81204
2,leaf,2,130163
3,womans,3,150776
4,place,3,161424
5,aarp,4,86367
6,aarp,5,86367
7,hospital,6,116892
8,abbot,6,152281
9,northwestern,6,154354
