In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)
from fuzzywuzzy import fuzz, process
import string
import itertools
import numpy as np
import difflib

import matplotlib.pyplot as plt
%matplotlib inline



In [4]:
#df_orig = pd.read_csv('data/csv_example_input_with_true_ids.csv')
df_orig = pd.read_csv('data/csv_example_output.csv')

In [5]:
df_orig.shape

(3337, 66)

In [6]:
df_orig.head(2)

Unnamed: 0,Cluster ID,confidence_score,Id,Source,Site name,Address,Zip,Phone,Fax,Program Name,...,canonical_Center Director,canonical_Funded Enrollment,canonical_Column2,canonical_Number per Site EHS,canonical_Neighborhood,canonical_Progmod,canonical_IDHS Provider ID,canonical_Number per Site HS,canonical_Executive Director,canonical_Id
0,0,0.993148,0,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Child Care,...,,salvation army,child care,,near west side,no,,,,1.0
1,0,0.993148,1,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Child Care,...,,salvation army,child care,,near west side,no,,,,1.0


In [7]:
# Get entity blocks containing more than two records
record_cnt = df_orig.groupby('Cluster ID').size()
entity_ids = record_cnt[record_cnt > 2].index
# df_1 = df_orig.loc[df_orig['Cluster ID'].isin(entity_ids)].drop('confidence_score', axis=1)
df_1 = df_orig[df_orig.groupby('Cluster ID')['Cluster ID'].transform('size')>2]

### Records are merged into entity groups identified by Cluster ID

In [8]:
df_1.sort_values('Cluster ID').head(15)

Unnamed: 0,Cluster ID,confidence_score,Id,Source,Site name,Address,Zip,Phone,Fax,Program Name,...,canonical_Center Director,canonical_Funded Enrollment,canonical_Column2,canonical_Number per Site EHS,canonical_Neighborhood,canonical_Progmod,canonical_IDHS Provider ID,canonical_Number per Site HS,canonical_Executive Director,canonical_Id
0,0,0.993148,0,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Child Care,...,,salvation army,child care,,near west side,no,,,,1.0
1226,0,0.993148,1226,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,State Pre-Kindergarten,...,,salvation army,child care,,near west side,no,,,,1.0
1225,0,0.993148,1225,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,State Pre-Kindergarten,...,,salvation army,child care,,near west side,no,,,,1.0
510,0,0.993148,510,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Head Start,...,,salvation army,child care,,near west side,no,,,,1.0
509,0,0.993148,509,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Head Start,...,,salvation army,child care,,near west side,no,,,,1.0
3255,0,0.969717,3255,purple_binder_early_childhood.csv,Salvation Army - Chicago Temple Corps Community Center,1 N Ogden Avenue,60607.0,2262649.0,,,...,,salvation army,child care,,near west side,no,,,,1.0
215,0,0.987749,215,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army Temple,1 N. Ogden,,2262649.0,,Community Partnerships,...,,salvation army,child care,,near west side,no,,,,1.0
2758,0,0.985435,2758,ECE Chicago Find a School scrape.csv,Salvation Army Temple,1 N. Ogden,60607.0,2262649.0,,,...,,salvation army,child care,,near west side,no,,,,1.0
1879,0,0.999547,1879,chapin_dfss_providers_2011_070212.csv,SALVATION ARMY TEMPLE,1 N OGDEN,60640.0,2262649.0,,,...,,salvation army,child care,,near west side,no,,,,1.0
1,0,0.993148,1,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Child Care,...,,salvation army,child care,,near west side,no,,,,1.0


### Customizable scoring fields

In [9]:
unique_comb = ['Site name','Address','Program Name']

In [10]:
def check_nan(df, cols):
    #print 'Dataframe total row count: ', df.shape[0]
    result = []
    for c in cols:
        if sum(df[c].isnull())>0:
            #print c , ' has ' , sum(df[c].isnull()) , 'null/NAN values'
            #nan_cnt += 1
            result.append(c)
    return result

In [11]:
def clean(my_string):
    # Remove punctuations
    # Convert to lower cases
    return my_string.lower()#.translate(None, string.punctuation)

### Hefty scoing work is done in this function
### -- Unique value
###     -------- horizontal/vertical
### -- Pairwise comparison
###     -------- horizontal/vertical

In [53]:
# Apply to each entity group
def confidence(entity_group, solution, cols, weights=[0.4,0.2,0.4], axis=0):
    '''
    This function takes in a resolved/merged entity group, and generate
    a confidence score to indicate the uniformness of records in the group
    Note:
    Can't mix the score generating mechanism
    unique_val is indicating a rough proportion of unique values
    pair is indicating the fuzzy match score
    
    INPUT:
    entity_group: one entity group of merged records, type: pandas dataframe
    solution: score generating mechanism, type: string
    cols: selected record fields for score generating, type: string list
    weights: only used when axis=1, reflecting how important a column is, type: list
    axis: indicating row-wise or column-wise comparison, type: int
    
    OUTPUT:
    score: the confidence score, ranging from 0 to 1
    
    Example:
    confidence(entity_group, 'unique_vals', unique_comb)
    confidence(entity_group, 'unique_vals', unique_comb, axis=1)
    confidence(entity_group, 'pair', unique_comb
    '''
    total_records = entity_group.shape[0]
    # check nan cols
    non_cols = check_nan(entity_group, cols)
    #fill nan cols
    for nc in non_cols:
        entity_group.loc[:,nc]=entity_group[nc].fillna('VIA_filled')
    
    # Use unique value counts to generate confidence score
    if solution == 'unique_vals':
        result = []
        # read row by row
        if axis == 0:
            for idx, row in entity_group.iterrows():
                comb_str = []
                # call normalize function on each column
                # get a final combined string for the row
                for c in cols:
                    comb_str.append(clean(row[c]))
                # append the string to result set
                #print ''.join(comb_str)
                result.append(''.join(comb_str))
            # Here we use 1-score to unify the score range with pairwise comparison
            # so that the higher the score is, the more confident we are
            #score = 1 - len(set(result))*1./entity_group.shape[0]
            score = 1 if len(set(result))==1 else 1 - len(set(result))*1./entity_group.shape[0]
            
        elif axis == 1:
            # this score is very sensitive because it takes the exact column value
            # when counting unique counts for EACH column
            score = 0
            uniques = []
            for i, c in enumerate(cols):
                # unique count vs total row count in one column
                # weights can be customized
                #print entity_group[c].values
                uniques.append(len(set(entity_group[c].apply(lambda x: clean(x)).values)))
                score += uniques[-1]*1./entity_group.shape[0]*weights[i]
            score = 1 if sum(uniques)==len(cols) else 1 - score
    # Pairwise Levenshtein distance score via fuzzywuzzy
    elif solution == 'pair':
        # Same as unique_val, combining all fields into one string
        # to be the comparison unit
        scores = []
        if axis == 0:
            result = []
            for idx, row in entity_group.iterrows():
                comb_str = []
                for c in cols:
                    comb_str.append(clean(row[c]))
                result.append(''.join(comb_str))
            # Get all possible record pairs
            for i, j in itertools.combinations(result, 2):
                # ratio = (total_length - edit_distance)/total_length
                score = difflib.SequenceMatcher(None, i, j).ratio()
                scores.append(score)
                
            #score = (sum(scores)/(1.*len(scores)))/100
        # Generate vector of per column comparison
        elif axis == 1:
            # initialize result vector
            final_vector = np.zeros(len(cols))
            # Get all possible record pairs
            pair_cnt = 0
            for i, j in itertools.combinations(entity_group[cols].values, 2):
                pair_cnt += 1
                col_comp_vector = []
                # pairwise compare each col
                for idx, c in enumerate(cols):
                    score = difflib.SequenceMatcher(None, i[idx],j[idx]).ratio()
                    col_comp_vector.append(score)
                scores.append(sum([x*y for x,y in zip(col_comp_vector, weights)]))
                #final_vector += col_comp_vector
            #score = sum((final_vector/100/pair_cnt)*weights)
        score = min(scores)
            
    elif solution == 'semi-pair':
        pass
    
    #return round(score, 2)
    return score

In [54]:
def score_assign(entity_group, new_col, func, args):
    entity_group[new_col] = func(entity_group, *args)
    return entity_group

In [55]:
# Can't mix the score generating mechanism
# unique val is indicating a rough proportion of unique values, the lower the better
# pair is indicating the fuzzy match score, the higher the better
print 'Confidence score by unique row: \n', confidence(df_1[df_1['Cluster ID']==17], 'unique_vals', unique_comb)
print '\n'
print 'Confidence score by unique columns: \n', confidence(df_1[df_1['Cluster ID']==17], 'unique_vals', unique_comb, axis=1)
print '\n'
print 'Confidence score by pairwise comp: ', confidence(df_1[df_1['Cluster ID']==17], 'pair', unique_comb)
print '\n'
print 'Confidence score col-by-col by pairwise comp: ', confidence(df_1[df_1['Cluster ID']==17], 'pair', unique_comb, axis=1)


Confidence score by unique row: 
0.0


Confidence score by unique columns: 
0.4


Confidence score by pairwise comp:  0.868131868132


Confidence score col-by-col by pairwise comp:  0.7


In [60]:
result_1 = df_1.groupby('Cluster ID').apply(score_assign, ('score1'), confidence, ['unique_vals', unique_comb])

In [61]:
result_2 = result_1.groupby('Cluster ID').apply(score_assign, ('score2'), confidence, ['unique_vals', unique_comb, [0.2, 0.4, 0.4],1])

In [62]:
result_3 = result_2.groupby('Cluster ID').apply(score_assign, ('score3'), confidence, ['pair', unique_comb, [0.2, 0.4, 0.4],1])

In [66]:
result_3.sort_values('Cluster ID')

Unnamed: 0,Cluster ID,confidence_score,Id,Source,Site name,Address,Zip,Phone,Fax,Program Name,...,canonical_Number per Site EHS,canonical_Neighborhood,canonical_Progmod,canonical_IDHS Provider ID,canonical_Number per Site HS,canonical_Executive Director,canonical_Id,score1,score2,score3
0,0,0.993148,0,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Child Care,...,,near west side,no,,,,1.0,0.300000,0.560000,0.282258
1226,0,0.993148,1226,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,State Pre-Kindergarten,...,,near west side,no,,,,1.0,0.300000,0.560000,0.282258
1225,0,0.993148,1225,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,State Pre-Kindergarten,...,,near west side,no,,,,1.0,0.300000,0.560000,0.282258
510,0,0.993148,510,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Head Start,...,,near west side,no,,,,1.0,0.300000,0.560000,0.282258
509,0,0.993148,509,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Head Start,...,,near west side,no,,,,1.0,0.300000,0.560000,0.282258
3255,0,0.969717,3255,purple_binder_early_childhood.csv,Salvation Army - Chicago Temple Corps Community Center,1 N Ogden Avenue,60607.0,2262649.0,,VIA_filled,...,,near west side,no,,,,1.0,0.300000,0.560000,0.282258
215,0,0.987749,215,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army Temple,1 N. Ogden,,2262649.0,,Community Partnerships,...,,near west side,no,,,,1.0,0.300000,0.560000,0.282258
2758,0,0.985435,2758,ECE Chicago Find a School scrape.csv,Salvation Army Temple,1 N. Ogden,60607.0,2262649.0,,VIA_filled,...,,near west side,no,,,,1.0,0.300000,0.560000,0.282258
1879,0,0.999547,1879,chapin_dfss_providers_2011_070212.csv,SALVATION ARMY TEMPLE,1 N OGDEN,60640.0,2262649.0,,VIA_filled,...,,near west side,no,,,,1.0,0.300000,0.560000,0.282258
1,0,0.993148,1,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Child Care,...,,near west side,no,,,,1.0,0.300000,0.560000,0.282258
