In [None]:
import pandas as pd
import numpy as np
import math
from jellyfish import jaro_winkler
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from py_common_subseq import find_common_subsequences
import numbers
import time
from collections import Counter 
from fuzzywuzzy import fuzz
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# NOTES
# - Removing duplicate token-ID keys in place caused a strange ID issue, so duplicates are no longer removed. This could be improved.
# - unique_token_freq_max should really be derived dynamically from the number of tokens (needs some math).
# - The zip matcher does not handle foreign postal codes, e.g. "ZVR OTS".
# - Do we need code to remove duplicate ID pairs (A-B and B-A are the same)? Probably not; that is only needed for de-duplication.
# - keep_default_na=True is, I think, necessary because we need to purge NULL values.

In [None]:
import io
import shutil

# Re-write the 990 extract as clean UTF-8: bytes that cannot be decoded are
# dropped on read (errors='ignore') and the remaining text is streamed to a new file.
with io.open('990 match data.csv', encoding='utf-8', errors='ignore') as raw_file, \
        io.open('990 match data_utf.csv', mode='w', encoding='utf-8') as utf8_file:
    shutil.copyfileobj(raw_file, utf8_file)

In [None]:
overall_time = time.time()  # start of the whole run; reported as TOTAL COMPUTE TIME at the end

# --- source column names: left dataset ---
l_id = 'organization_id'
l_name = 'org_name'
l_address1 = 'address1'
l_city = 'city'
l_state = 'state'
l_zip = 'postal_code'
l_phone = 'phone'
l_web = 'website'
l_acronym = 'acronym'
l_alt_name = 'alt_name'

# --- source column names: right dataset ---
r_id = 'ein'
r_name = 'organizationname'
r_address1 = 'FA_AddressLine1Txt'
r_city = 'FA_CityNm'
r_state = 'FA_StateAbbreviationCd'
r_zip = 'FA_ZIPCd'
r_phone = 'F_PhoneNum'
r_web = 'WebsiteAddressTxt'
#r_acronym = 'CBI_Acronym'

# --- tunable parameters ---
token_match_min = 2          # minimum number of shared tokens for a pair to become a candidate
token_limiter = 0.995        # fraction of non-single tokens kept, ordered from rarest to most common
unique_token_freq_max = 5    # tokens occurring <= this often count as "unique"; links on them count double
name_weight = 0.75           # effectively 0.75 * 4, since there are 4 org-name similarity metrics
name_uniqueness_weight = 1.5
state_weight = 1
zip_weight = 1
phone_weight = 2
domain_weight = 2
name_score_min = 3
composite_score_min = 4      # minimum composite match score to be considered a match

start_time = time.time()
print("LOADING INITIAL DATAFRAMES...")

# Read both datasets; malformed rows in the right-hand file are skipped.
left_df = pd.read_csv('all cupola orgs w phone_db_utf.csv', keep_default_na=True)
right_df = pd.read_csv('990 match data_utf.csv', keep_default_na=True, error_bad_lines=False)

# Map the source-specific column names onto the l_*/r_* names used by the rest of the notebook.
left_column_map = {
    l_id: 'id',
    l_name: 'l_org_name',
    l_address1: 'l_address1',
    l_city: 'l_city',
    l_state: 'l_state',
    l_zip: 'l_postal_code',
    l_web: 'l_web',
    l_phone: 'l_phone',
    l_acronym: 'l_acronym',
    l_alt_name: 'l_alt_name',
}
right_column_map = {
    r_id: 'id',
    r_name: 'r_org_name',
    r_address1: 'r_address1',
    r_city: 'r_city',
    r_state: 'r_state',
    r_zip: 'r_postal_code',
    r_web: 'r_web',
    r_phone: 'r_phone',
}
left_df = left_df.rename(columns=left_column_map)
right_df = right_df.rename(columns=right_column_map)

# Missing values become empty strings.
left_df = left_df.replace(np.nan, '', regex=True)
right_df = right_df.replace(np.nan, '', regex=True)

# Force the right-hand text columns to str.
for _col in ('r_org_name', 'r_address1', 'r_city', 'r_state', 'r_postal_code', 'r_phone', 'r_web'):
    right_df[_col] = right_df[_col].astype('str')

print("Dataframes loaded --- %s seconds ---" % (time.time() - start_time))
print("")

start_time = time.time()
print("PRE-PROCESSING...")
# Normalize state codes using the lookup file (columns: state, acronym).
state_lkup = pd.read_csv('state_lkup.csv', keep_default_na=False)

from collections import defaultdict
# state value -> list of acronyms found for it in the lookup file
state_dict = defaultdict(list)
for state_name, state_code in zip(state_lkup['state'].values, state_lkup['acronym'].values):
    state_dict[state_name].append(state_code)

# Blank out NaN, lowercase, then substitute values through the lookup.
left_df['l_state'] = left_df['l_state'].replace(np.nan, '', regex=True).str.lower()
left_df['l_state'] = left_df['l_state'].replace(state_dict)
right_df['r_state'] = right_df['r_state'].replace(np.nan, '', regex=True).str.lower()
right_df['r_state'] = right_df['r_state'].replace(state_dict)

# Strip every non-digit character from the phone columns.
left_df['l_phone'] = left_df['l_phone'].astype('str')
left_df['l_phone'] = left_df['l_phone'].replace(np.nan, '', regex=True)
l_clean_phones = [re.sub('[^0-9]', '', raw_phone) for raw_phone in left_df['l_phone']]
left_df['l_clean_phone'] = l_clean_phones

right_df['r_phone'] = right_df['r_phone'].astype('str')
right_df['r_phone'] = right_df['r_phone'].replace(np.nan, '', regex=True)
r_clean_phones = [re.sub('[^0-9]', '', raw_phone) for raw_phone in right_df['r_phone']]
right_df['r_clean_phone'] = r_clean_phones

# Isolate domains from web URLs.
def _extract_domain(url):
    """Return the host part of ``url``: scheme, path and a leading 'www.' removed.

    str.strip('www.') is NOT a prefix removal: it strips any run of the
    characters 'w' and '.' from both ends, so 'www.worldwildlife.org' became
    'orldwildlife.org'. Only an exact leading 'www.' is removed here.
    """
    host = url.split('//')[-1].split('/')[0]
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host


left_df.l_web = left_df.l_web.replace(np.nan, '', regex=True).str.lower()
l_domains = [_extract_domain(web) for web in left_df.l_web]
left_df['l_domain'] = l_domains

right_df.r_web = right_df.r_web.replace(np.nan, '', regex=True).str.lower()
r_domains = [_extract_domain(web) for web in right_df.r_web]
right_df['r_domain'] = r_domains

print("states, phones, domains normalized --- %s seconds ---" % (time.time() - start_time))
print("")

start_time = time.time()
print("TOKENIZING, IDENTIFYING CANDIDATE MATCH PAIRS...")

# Tokens from the "unique" columns circumvent token_limiter, which cuts off the
# most commonly occurring tokens.
left_unique_token_columns = ['l_state', 'l_postal_code', 'l_clean_phone']

# 'l_address1' and 'l_address2' are deliberately left out for now.
left_delta_token_columns = ['l_acronym', 'l_org_name', 'l_alt_name', 'l_city', 'l_domain']

right_unique_token_columns = ['r_state', 'r_postal_code', 'r_clean_phone']

# 'r_acronym', 'r_alt_name', 'r_address1' and 'r_address2' are deliberately left out for now.
right_delta_token_columns = ['r_org_name', 'r_city', 'r_domain']

# lowercase the name and split on spaces, remove non-alphanumeric chars
def tokenize_name(name):
    """Split a string into lowercase alphanumeric tokens.

    Every non-alphanumeric character is treated as a separator. Values that
    are not strings are returned unchanged.
    """
    if not isinstance(name, basestring):
        return name
    spaced = ''.join([ch if ch.isalnum() else ' ' for ch in name])
    return spaced.lower().split()
    
# --- tokens from the "unique" columns -------------------------------------
unique_tokens = []
for col in left_unique_token_columns:
    unique_tokens.extend(
        tokenize_name(str(word)) for word in left_df[col] if not isinstance(word, float)
    )
for col in right_unique_token_columns:
    unique_tokens.extend(
        tokenize_name(str(word)) for word in right_df[col] if not isinstance(word, float)
    )

unique_flat_list = [token for token_list in unique_tokens for token in token_list]

# token -> number of occurrences
u_cnt = Counter(unique_flat_list)
u_cnt_dict = dict(u_cnt)

# sorted by count so the rarest tokens come first
unique_tokens_df = pd.DataFrame(u_cnt_dict.items(), columns=['token', 'count']).sort_values(by='count')

# flag 0 = excluded (token occurs only once), flag 1 = kept
unique_token_flag = [0 if occurrences == 1 else 1 for occurrences in unique_tokens_df['count']]
unique_tokens_df['flag'] = unique_token_flag

# --- tokens from all other columns ----------------------------------------
all_other_words = []
for col in left_delta_token_columns:
    all_other_words.extend(
        tokenize_name(str(word)) for word in left_df[col] if not isinstance(word, float)
    )
for col in right_delta_token_columns:
    all_other_words.extend(
        tokenize_name(str(word)) for word in right_df[col] if not isinstance(word, float)
    )

flat_list = [token for token_list in all_other_words for token in token_list]

cnt = Counter(flat_list)
cnt_dict = dict(cnt)

# sorted by count so that the first x% of rows are the rarest tokens
main_tokens_df = pd.DataFrame(cnt_dict.items(), columns=['token', 'count']).sort_values(by='count')

# Keep a token only if it occurs more than once AND its position in the
# rarest-first ordering is below the token_limiter cut-off; the most common
# tokens (positions at or past the cut-off) are excluded.
rank_cutoff = int(main_tokens_df.shape[0] * token_limiter)
main_token_flag = [
    1 if (occurrences != 1 and rank < rank_cutoff) else 0
    for rank, occurrences in enumerate(main_tokens_df['count'])
]
main_tokens_df['flag'] = main_token_flag

# Stack both token tables. ignore_index=True is essential: each input frame
# carries its own 0..n index, and the index is used as the token_id below.
# Without re-indexing, a token from unique_tokens_df and an unrelated token from
# main_tokens_df end up with the same token_id and join to each other.
all_tokens = pd.concat([unique_tokens_df, main_tokens_df], ignore_index=True)

all_tokens.drop('count', axis=1, inplace=True)
all_tokens['flag'] = all_tokens.flag.astype(int)  # converting flags to int

# token -> flag lookup. to_dict('split') returns the rows under 'data' as
# [token, flag] pairs, and dict() turns a list of pairs into a mapping. When a
# token occurs in both tables the later row (from main_tokens_df) wins.
tokens_dct = all_tokens.to_dict('split')
tokens_dct = dict(tokens_dct['data'])

# Prepare token_ids, which are used for joining the left and right dataframes.
# A single multi-key sort puts the flag=1 row first within each token, so
# drop_duplicates(keep='first') picks it deterministically.
all_tokens.sort_values(by=['token', 'flag'], ascending=[True, False], inplace=True)
all_tokens.drop_duplicates(subset='token', keep='first', inplace=True)
token_ids = all_tokens.index.get_level_values(0)
all_tokens['token_id'] = token_ids

all_tokens.drop('flag', axis=1, inplace=True)
all_tokens['token_id'] = all_tokens.token_id.astype(int)
# token -> token_id lookup, built the same way as tokens_dct above
token_id_dct = all_tokens.to_dict('split')
tokens_id_dct = dict(token_id_dct['data'])

# Only tokens flagged 1 go into the vocabulary. Flags are 0/1, so a bare
# truthiness test would select the same tokens as the explicit == 1.
vocabulary = np.array([w for w, c in tokens_dct.items() if c == 1])
cv = CountVectorizer(vocabulary=vocabulary)

# Tokenize the left and right dataframes: one (token, record id) row for every
# vocabulary token found in any of the token columns.
all_left_cols = left_unique_token_columns + left_delta_token_columns

left_frame_list = []
for colname in all_left_cols:
    tokenmapping = cv.fit_transform(left_df[colname])
    df_row, token_id = tokenmapping.nonzero()
    token_record_pairs = np.vstack([vocabulary[token_id], left_df['id'].values[df_row]]).T
    left_frame_list.append(pd.DataFrame(token_record_pairs, columns=['token', 'id_l']))

left_keyed = pd.concat(left_frame_list)
# Duplicate (token, id) rows are intentionally kept: removing them in place
# caused a small percentage of token_ids to go missing from the left_keyed index.

# Join on the integer token_id rather than on the token text (more efficient).
left_token_ids = [tokens_id_dct[token] for token in left_keyed.token]
left_keyed['token_id'] = left_token_ids
left_keyed = (
    left_keyed
    .sort_values(by='token_id')
    .set_index('token_id')
    .drop('token', axis=1)
    .sort_values(by='id_l')
)

all_right_cols = right_unique_token_columns + right_delta_token_columns

right_frame_list = []
for colname in all_right_cols:
    tokenmapping = cv.fit_transform(right_df[colname])
    df_row, token_id = tokenmapping.nonzero()
    token_record_pairs = np.vstack([vocabulary[token_id], right_df['id'].values[df_row]]).T
    right_frame_list.append(pd.DataFrame(token_record_pairs, columns=['token', 'id_r']))

right_keyed = pd.concat(right_frame_list)
# Duplicates are kept here as well, mirroring the left side.

right_token_ids = [tokens_id_dct[token] for token in right_keyed.token]
right_keyed['token_id'] = right_token_ids
right_keyed = (
    right_keyed
    .sort_values(by='token_id')
    .set_index('token_id')
    .drop('token', axis=1)
)

aggregations = {
    'id_l': 'count'
}

# left_keyed is written to disk and read back in chunks so the join below
# processes a bounded number of rows at a time.
left_keyed.to_csv('left_keyed.csv')

# token_ids of tokens occurring more than once but at most unique_token_freq_max
# times; matches on these tokens are counted twice.
bonus_point_tokens = [
    tokens_id_dct[token]
    for token in main_tokens_df[
        (main_tokens_df['count'] > 1) & (main_tokens_df['count'] <= unique_token_freq_max)
    ].token
]

match_dfs = []
# NOTE(review): chunks are cut by row count, so the rows of one id_l can straddle
# a chunk boundary and have their shared-token count split across two chunks
# -- confirm whether that matters.
for df in pd.read_csv('left_keyed.csv', keep_default_na=False, chunksize=50000, index_col='token_id'):

    joined = df.join(right_keyed, how='inner', lsuffix='_l', rsuffix='_r')
    joined['id_l'] = joined.id_l.astype('str')

    # Double-count matches on bonus tokens by appending those rows a second time.
    # A boolean mask always yields a DataFrame (possibly empty), whereas
    # joined.loc[token_id] returns a Series for a single-row hit and
    # pd.concat([]) raises when a chunk contains no bonus tokens at all.
    is_bonus_row = joined.index.isin(bonus_point_tokens)
    bonus_joins_cols = joined.loc[is_bonus_row, ['id_l', 'id_r']].dropna()
    joined_bonus = pd.concat([joined, bonus_joins_cols])

    # Number of shared tokens per (left id, right id) pair.
    keys_grouped = joined_bonus.groupby(by=['id_l', 'id_r']).agg(aggregations)
    keys_grouped.rename(columns={'id_l': 'id_l count'}, inplace=True)
    matched_records = keys_grouped[keys_grouped['id_l count'] >= token_match_min].reset_index()
    match_dfs.append(matched_records)

# drop=True discards the per-chunk row numbers instead of leaving them behind
# as a stray 'index' column (the former drop() call discarded its own result).
all_match_candidates = pd.concat(match_dfs).reset_index(drop=True)
# astype() returns a new Series, so the result has to be assigned back.
all_match_candidates['id_l'] = all_match_candidates.id_l.astype('str')

print("match candidates identified --- %s seconds ---" % (time.time() - start_time))

start_time = time.time()
left_df.rename(columns={'id': 'id_l'}, inplace=True)
right_df.rename(columns={'id': 'id_r'}, inplace=True)

# Only the columns needed for scoring and review.
left_match_data = left_df[['id_l', 'l_org_name', 'l_city', 'l_state', 'l_postal_code', 'l_domain', 'l_clean_phone']].copy()
right_match_data = right_df[['id_r', 'r_org_name', 'r_city', 'r_state', 'r_postal_code', 'r_domain', 'r_clean_phone']].copy()

# Keys must be str on every frame taking part in the merges, otherwise the
# result is a blank df. The cast is applied to all_match_candidates -- the frame
# that is actually merged -- rather than to matched_records, which only holds
# the last chunk of candidates.
left_match_data['id_l'] = left_match_data.id_l.astype('str')
right_match_data['id_r'] = right_match_data.id_r.astype('str')
all_match_candidates['id_l'] = all_match_candidates.id_l.astype('str')
all_match_candidates['id_r'] = all_match_candidates.id_r.astype('str')

# Merge candidate pairs with the original record data for ease of review.
l_conc = pd.merge(all_match_candidates, left_match_data, on='id_l')
full_conc = pd.merge(l_conc, right_match_data, on='id_r')

print("original data concatenated with matches --- %s seconds ---" % (time.time() - start_time))
print("")

start_time = time.time()
print("SCORING ORG NAME SIMULARITY...")

#scoring match candidates based on edit distance of org names
def _as_text(value):
    """Decode UTF-8 byte strings; return already-decoded text unchanged.

    Decoding text that is already unicode raises TypeError, so only byte
    strings are decoded.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def _scorable(*records):
    """Return True when every record is a non-empty, non-numeric value.

    The type check runs before len(): len() of a number raises TypeError, so
    checking the length first makes the numeric guard unreachable.
    """
    for record in records:
        if record is None or isinstance(record, numbers.Number):
            return False
        if len(record) == 0:
            return False
    return True


def jaro_simularity(left_record, right_record):
    """Jaro-Winkler score of the two names, or None if either cannot be scored."""
    if not _scorable(left_record, right_record):
        return None
    return jaro_winkler(_as_text(left_record), _as_text(right_record))


def fuzz_partial(left_record, right_record):
    """fuzz.partial_ratio scaled to 0..1, or None if either name cannot be scored."""
    if not _scorable(left_record, right_record):
        return None
    return fuzz.partial_ratio(_as_text(left_record), _as_text(right_record)) / float(100)


def fuzz_sort(left_record, right_record):
    """fuzz.token_sort_ratio scaled to 0..1, or None if either name cannot be scored."""
    if not _scorable(left_record, right_record):
        return None
    return fuzz.token_sort_ratio(_as_text(left_record), _as_text(right_record)) / float(100)


def fuzz_set(left_record, right_record):
    """fuzz.token_set_ratio scaled to 0..1, or None if either name cannot be scored."""
    if not _scorable(left_record, right_record):
        return None
    return fuzz.token_set_ratio(_as_text(left_record), _as_text(right_record)) / float(100)

# Intended to swap blank org names for the placeholder 'none' before scoring.
for name_col in ('l_org_name', 'r_org_name'):
    full_conc[name_col] = full_conc[name_col].replace('', 'none', regex=True).astype('str')


def _score_names(scorer):
    """Apply ``scorer(left_name, right_name)`` to every candidate row of full_conc."""
    return full_conc.apply(lambda pair: scorer(pair.l_org_name, pair.r_org_name), axis=1)


jaro_time = time.time()
full_conc['jaro_score'] = _score_names(jaro_simularity)
print("jaro scores done --- %s seconds ---" % (time.time() - jaro_time))
#jaro_reduced = full_conc[full_conc.jaro_score > .25] #I don't feel comfortable reducing candidates on a single metric

partial_time = time.time()
full_conc['fuzz_partial_score'] = _score_names(fuzz_partial)
print("fuzz partial scores done --- %s seconds ---" % (time.time() - partial_time))

sort_time = time.time()
full_conc['fuzz_sort_score'] = _score_names(fuzz_sort)
print("fuzz sort scores done --- %s seconds ---" % (time.time() - sort_time))

set_time = time.time()
full_conc['fuzz_set_score'] = _score_names(fuzz_set)
print("fuzz set scores done --- %s seconds ---" % (time.time() - set_time))
print("")

print("name simularity scored --- %s seconds ---" % (time.time() - start_time))
print("")

start_time = time.time()
print("SCORING NAME SEQUENCE UNIQUENESS...")

# Token lists for every org name on both sides (float values are skipped).
org_tokens = [
    tokenize_name(str(word)) for word in left_df['l_org_name'] if not isinstance(word, float)
]
org_tokens.extend(
    tokenize_name(str(word)) for word in right_df['r_org_name'] if not isinstance(word, float)
)

org_flat_list = [token for token_list in org_tokens for token in token_list]

# token -> number of occurrences across all org names
org_cnt = Counter(org_flat_list)
org_cnt_dict = dict(org_cnt)

def sequence_uniqueness(seq, counts=None):
    """Sum of 1/sqrt(frequency) over the tokens in ``seq``.

    seq    -- iterable of string tokens
    counts -- mapping of lowercase token -> occurrence count; defaults to the
              module-level org_cnt_dict

    A token missing from ``counts`` (for example the 'none' placeholder used
    for blank names) is treated as occurring once instead of raising KeyError.
    """
    if counts is None:
        counts = org_cnt_dict
    # t.lower() works for byte and unicode strings alike; str.lower(t) rejects
    # unicode tokens on Python 2.
    return sum(1 / counts.get(t.lower(), 1) ** 0.5 for t in seq)

def name_similarity(a, b):
    """Uniqueness-weighted token overlap between two names.

    Computes the uniqueness of the shared tokens divided by the geometric mean
    of the two names' own uniqueness. Returns 0 when either argument is not a
    string or contains no alphanumeric tokens.
    """
    if not (isinstance(a, basestring) and isinstance(b, basestring)):
        return 0

    a_tokens = set(tokenize_name(a))
    b_tokens = set(tokenize_name(b))
    # An empty or punctuation-only name yields no tokens; its uniqueness would
    # be 0 and the division below would raise ZeroDivisionError.
    if not a_tokens or not b_tokens:
        return 0

    a_uniq = sequence_uniqueness(a_tokens)
    b_uniq = sequence_uniqueness(b_tokens)
    return sequence_uniqueness(a_tokens.intersection(b_tokens)) / (a_uniq * b_uniq) ** 0.5

def _uniq_for_pair(pair):
    """Name-uniqueness score for one candidate row."""
    return name_similarity(pair.l_org_name, pair.r_org_name)


full_conc['uniq'] = full_conc.apply(_uniq_for_pair, axis=1)

print("name uniqueness scored --- %s seconds ---" % (time.time() - start_time))
print("")

start_time = time.time()
print("CHECKING FOR STATE CODE MATCHES...")

def sanitize_state(state):
    """Keep only the lowercase ASCII letters of ``state``; non-strings give ''."""
    if not isinstance(state, basestring):
        return ''
    return ''.join(c for c in state if c in 'abcdefghijklmnopqrstuvwxyz')


def state_match(state_a, state_b):
    """Return 1 when both states sanitize to the same value of 2+ letters, else 0.

    The comparison is made on the sanitized values. Sanitizing only for the
    length check and then comparing the raw inputs lets stray whitespace or
    punctuation ('ca ' vs 'ca') defeat an otherwise valid match.
    """
    sanitized_state_a = str(sanitize_state(state_a))
    sanitized_state_b = str(sanitize_state(state_b))

    # a value shorter than two letters is not a usable state code
    if len(sanitized_state_a) < 2 or len(sanitized_state_b) < 2:
        return 0
    return 1 if sanitized_state_a == sanitized_state_b else 0

def _state_match_for_pair(pair):
    """State-match flag for one candidate row."""
    return state_match(pair.l_state, pair.r_state)


full_conc['state_match'] = full_conc.apply(_state_match_for_pair, axis=1)

print("state codes checked --- %s seconds ---" % (time.time() - start_time))
print("")

start_time = time.time()
print("CHECKING FOR POSTAL CODE MATCHES...")

#scoring match candidates based on matching postal code

def sanitize_postal(postal):
    """Normalize a postal code value.

    Strings are reduced to their digits, floats (e.g. NaN) give None, and any
    other value is returned unchanged.
    """
    if isinstance(postal, basestring):
        return ''.join(c for c in postal if c in '1234567890')
    if isinstance(postal, float):
        return None
    return postal


def postal_simularity(postal_a, postal_b):
    """Return 1 when both postal codes share the same first five digits, else 0.

    Codes with fewer than five digits are treated as unusable and never match.
    The five-digit prefix is compared directly: a longest-common-subsequence
    test of length >= 5 also accepts digits scattered anywhere in the two
    codes, so unrelated nine-digit ZIP+4 values could be reported as matches.
    """
    sanitized_postal_a = str(sanitize_postal(postal_a))
    sanitized_postal_b = str(sanitize_postal(postal_b))

    # fewer than five digits means the value is not a usable ZIP code
    if len(sanitized_postal_a) < 5 or len(sanitized_postal_b) < 5:
        return 0
    return 1 if sanitized_postal_a[:5] == sanitized_postal_b[:5] else 0
    
def _zip_match_for_pair(pair):
    """Postal-code match flag for one candidate row."""
    return postal_simularity(pair.l_postal_code, pair.r_postal_code)


full_conc['zip_match'] = full_conc.apply(_zip_match_for_pair, axis=1)

print("postal codes checked --- %s seconds ---" % (time.time() - start_time))
print("")

start_time = time.time()
print("CHECKING FOR WEB DOMAIN MATCHES...")

def domain_match(domain_a, domain_b):
    """Return 1 when both domains are non-empty strings and equal, else 0."""
    both_are_text = isinstance(domain_a, basestring) and isinstance(domain_b, basestring)
    if not both_are_text:
        return 0
    if len(domain_a) == 0 or len(domain_b) == 0:
        return 0
    return 1 if domain_a == domain_b else 0


def _domain_match_for_pair(pair):
    """Web-domain match flag for one candidate row."""
    return domain_match(pair.l_domain, pair.r_domain)


full_conc['domain_match'] = full_conc.apply(_domain_match_for_pair, axis=1)

print("web domains checked --- %s seconds ---" % (time.time() - start_time))
print("")

start_time = time.time()
print("CHECKING FOR PHONE MATCHES...")

#scoring match candidates based on matching phone
def phone_simularity(phone_a, phone_b):
    """Return 1 when the two digit strings share a common subsequence of 10+ digits.

    Numbers with fewer than 10 digits never match.
    """
    if len(phone_a) < 10 or len(phone_b) < 10:
        return 0
    longest_shared = max(len(sub) for sub in find_common_subsequences(phone_a, phone_b))
    if float(longest_shared) / 10 >= 1:
        return 1
    return 0
    
def _phone_match_for_pair(pair):
    """Phone match flag for one candidate row."""
    return phone_simularity(pair.l_clean_phone, pair.r_clean_phone)


full_conc['phone_match'] = full_conc.apply(_phone_match_for_pair, axis=1)

print("phones checked --- %s seconds ---" % (time.time() - start_time))
print("")

start_time = time.time()
print("COMPOSITE SCORING, PREDICTING MATCHES...")

#full_conc['overall_name_score'] = full_conc.jaro_score * name_weight \
#+ full_conc.fuzz_partial_score * name_weight \
#+ full_conc.fuzz_sort_score * name_weight \
#+ full_conc.fuzz_set_score * name_weight \
#+ full_conc.uniq * name_uniqueness_weight

#calculate composite match score based on component scores and weights
#full_conc['composite_match_score'] = full_conc.overall_name_score \
#+ full_conc.zip_match * zip_weight \
#+ full_conc.state_match * state_weight \
#+ full_conc.domain_match * domain_weight \
#+ full_conc.phone_match * phone_weight

# Train a logistic regression on hand-labeled pairs and use it to classify
# every candidate pair.
training_data = pd.read_table('training_data_utf.txt')

# 'domain_match' and 'phone_match' are currently not used as features.
feature_cols = [
    'jaro_score',
    'fuzz_partial_score',
    'fuzz_sort_score',
    'fuzz_set_score',
    'uniq',
    'state_match',
    'zip_match',
]

# features and labels for training
X = training_data[feature_cols]
y = training_data['is_match']

# fit, then predict both the class and the match probability for each candidate
log = LogisticRegression()
log.fit(X, y)
candidate_features = full_conc[feature_cols]
y_pred_class = log.predict(candidate_features)
y_pred_proba = log.predict_proba(candidate_features)[:, 1]

full_conc['match_pred'] = y_pred_class
full_conc['pred_proba'] = y_pred_proba

# Predicted matches, plus near misses (predicted non-match with probability > .4) for manual review.
#org_matches = full_conc[(full_conc.overall_name_score >= name_score_min) | (full_conc.composite_match_score >= composite_score_min)]
is_predicted_match = full_conc.match_pred == 1
is_near_miss = (full_conc.match_pred == 0) & (full_conc.pred_proba > .4)
org_matches = full_conc[is_predicted_match]
failed_matches_for_review = full_conc[is_near_miss]

print("final matches isolated --- %s seconds ---" % (time.time() - start_time))
print("")

print("TOTAL COMPUTE TIME --- %s seconds ---" % (time.time() - overall_time))

org_matches.sort_values(by='pred_proba', ascending=True)

In [None]:
# Use k-fold cross validation to estimate how well the model performs on the
# labeled training data.
# sklearn.cross_validation is deprecated; KFold lives in sklearn.model_selection,
# which this notebook already imports from.
from sklearn.model_selection import KFold

# NOTE(review): this cell reads 'training_data3.txt' while the model used for the
# predictions is trained on 'training_data_utf.txt' -- confirm which file is intended.
training_data = pd.read_table('training_data3.txt')
X = training_data[feature_cols]
y = training_data['is_match']

cv_seed = 42  # fixed seed so the shuffled folds, and therefore the scores, are reproducible
kf = KFold(n_splits=10, shuffle=True, random_state=cv_seed)

recall_scores = []
precision_scores = []
f1_scores = []
n = 0
print("~~~~ CROSS VALIDATION each fold ~~~~")
for train_index, test_index in kf.split(X):
    lm = LogisticRegression().fit(X.iloc[train_index], y.iloc[train_index])
    # predict once per fold and reuse the result for all three metrics
    y_fold_true = y.iloc[test_index]
    y_fold_pred = lm.predict(X.iloc[test_index])
    recall_scores.append(metrics.recall_score(y_fold_true, y_fold_pred))
    precision_scores.append(metrics.precision_score(y_fold_true, y_fold_pred))
    f1_scores.append(metrics.f1_score(y_fold_true, y_fold_pred))
    n += 1
    # %-formatting prints plain text on both Python 2 and 3; print('Model', n)
    # prints a tuple on Python 2.
    print('Model %d' % n)
    print('Recall: %s' % recall_scores[-1])
    print('Precision: %s' % precision_scores[-1])
    print('F1-Score: %s' % f1_scores[-1])

print("~~~~ SUMMARY OF CROSS VALIDATION ~~~~")
print('Mean recall scores for all folds: %s' % np.mean(recall_scores))
print('Mean precision scores for all folds: %s' % np.mean(precision_scores))
print('Mean F1 scores for all folds: %s' % np.mean(f1_scores))