#### To-do patches
- Shorten candidate selection during deduplication by not matching against the whole right side (since it would be redundant because that pair has already been checked)
- Matching should be able to consider dynamic columns, even if one column is not present on the other side - COMPLETE
- Add capability to consider multiple columns of the same type, for example home + employer address - COMPLETE
- Handle “future” dates for queening - COMPLETE
- Add matching on gender, perhaps derived by prefix if gender not available
- Add matching on position title
- Include/enable joining matches file to grouped on pair key - for duplicate identification - COMPLETE
- Include/enable joining matches file to grouped on pair key - for matching
- Suppress non-matching middle initials?  perhaps only when len is <=2 for both.
- Standardize country names; I don't think the code does this at present (because we rely on the transformer step)

# Prologue
### Setting things up

Here we have configurations.

First, we import all the libraries.

Second, we load and prepare any and all static lookup tables (nicknames, state codes, and personal domains). These lookup tables are set up so that any updates to the underlying CSV files will automatically incorporate the change the next time those chunks are run.

Third, we set up all the running parameters.

In [1]:
import pandas as pd
import numpy as np

import gc, math, numbers, operator, os, re, time
from collections import defaultdict, Counter
from statistics import stdev

from fuzzywuzzy import fuzz
from jellyfish import jaro_winkler, metaphone
from py_common_subseq import find_common_subsequences

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import networkx as nx

  from numpy.core.umath_tests import inner1d


In [2]:
"""
Build the nickname lookup.

`nicks[name]` is the set of nickname-group ids that `name` belongs to.

The nicknames file is read one line per record (sep='\n'), not as a
comma-separated table: each line is a comma-separated group of names that
are treated as equivalent. To extend the lookup, add lines or names to the
file and re-run this cell.
"""
df_nicks = pd.read_csv('data/nicknames.csv', sep='\n', header=None, names=["names"])
df_nicks["names"] = df_nicks["names"].str.lower().str.split(',')

# Unknown names fall back to an empty set of groups
nicks = defaultdict(set)
for group_id, group_names in df_nicks["names"].items():
    # The row label doubles as the id of the nickname group
    for nickname in group_names:
        nicks[nickname].add(group_id)
nicks;

In [3]:
"""
Build the place-name -> acronym lookup.

State names and country names are read from two CSV files and folded into
one dictionary, so a new row in either file is picked up the next time this
cell runs (and it makes no difference which file the row lives in).
"""
# State codes, lowercased throughout
df_states = pd.read_csv('data/state_lkup.csv', keep_default_na=False).applymap(str.lower)
us_states = df_states.loc[df_states["country"] == 'us']
df_states = df_states.set_index("state")

# Country codes, lowercased throughout
df_countries = (
    pd.read_csv('data/country codes.csv', keep_default_na=False)
    .applymap(str.lower)
    .set_index("COUNTRY")
)

# Master lookup: states first, then countries, so a country entry wins when
# the same key appears in both. Unknown places map to the empty string.
place_acronyms = defaultdict(str)
place_acronyms.update(df_states["acronym"].to_dict())
place_acronyms.update(df_countries["ISO2"].to_dict())
place_acronyms;

In [4]:
"""
Load the personal email domains.

Only the 'domain' column of the CSV is kept.
"""
personal_domains = pd.read_csv('data/personal email domains.csv').loc[:, 'domain']
personal_domains;

In [5]:
def dictate_columns(*, index=None, updated_date=None, fname=None, lname=None, mname=None, prefix=None, person_suffix=None,
                    person_utility1=None, person_utility2=None, person_utility3=None,
                    position1=None, position2=None, position_metadata1=None, position_metadata2=None,
                    gender=None,
                    org_name=None, org_name2=None, org_name3=None, org_name4=None, org_name5=None, org_name6=None,
                    address1_a=None, address2_a=None, address3_a=None, city_a=None, state_a=None, zip_a=None, zip4_a=None, country_a=None,
                    address_a_utility1=None, address_a_utility2=None, address_a_utility3=None, address_a_utility4=None,
                    address_a_utility5=None, address_a_utility6=None, address_a_utility7=None, address_a_utility8=None,
                    address1_b=None, address2_b=None, address3_b=None, city_b=None, state_b=None, zip_b=None, zip4_b=None, country_b=None,
                    address_b_utility1=None, address_b_utility2=None, address_b_utility3=None, address_b_utility4=None,
                    address_b_utility5=None, address_b_utility6=None, address_b_utility7=None, address_b_utility8=None,
                    address1_c=None, address2_c=None, address3_c=None, city_c=None, state_c=None, zip_c=None, zip4_c=None, country_c=None,
                    address_c_utility1=None, address_c_utility2=None, address_c_utility3=None, address_c_utility4=None,
                    address_c_utility5=None, address_c_utility6=None, address_c_utility7=None, address_c_utility8=None,
                    address1_d=None, address2_d=None, address3_d=None, city_d=None, state_d=None, zip_d=None, zip4_d=None, country_d=None,
                    address_d_utility1=None, address_d_utility2=None, address_d_utility3=None, address_d_utility4=None,
                    address_d_utility5=None, address_d_utility6=None, address_d_utility7=None, address_d_utility8=None,
                    email_a=None, email_b=None, email_c=None, email_d=None,
                    website_a=None, website_b=None, website_c=None, website_d=None,
                    phone_a=None, areacode_a=None, phone_number_a=None, extension_a=None,
                    phone_a_utility1=None, phone_a_utility2=None, phone_a_utility3=None,
                    phone_b=None, areacode_b=None, phone_number_b=None, extension_b=None,
                    phone_b_utility1=None, phone_b_utility2=None, phone_b_utility3=None,
                    phone_c=None, areacode_c=None, phone_number_c=None, extension_c=None,
                    phone_c_utility1=None, phone_c_utility2=None, phone_c_utility3=None,
                    phone_d=None, areacode_d=None, phone_number_d=None, extension_d=None,
                    phone_d_utility1=None, phone_d_utility2=None, phone_d_utility3=None,
                    fax_a=None, fax_areacode_a=None, fax_number_a=None,
                    fax_a_utility1=None, fax_a_utility2=None, fax_a_utility3=None,
                    utility1=None, utility2=None):
    """
    Map standard column names to the dataset-specific column names.

    Every argument is keyword-only and names one standard column; its value is
    the name of the matching column in the dataset at hand (or None when the
    dataset has no such column). Because the accepted names are spelled out in
    the signature, Python raises a TypeError on a misspelled or unknown
    standard column instead of silently accepting it.

    Returns a dict keyed by standard column name. Notes on the keys:

    - `index` is returned under the key 'id'.
    - 'nicks_groups' is always present with the value None. These keys are
      used to decide which columns matter for scoring, so leaving it out
      would drop nicks_groups from the scoring dataframes.
    - The zip4_* arguments and the phone_d family (phone_d, areacode_d,
      phone_number_d, extension_d, phone_d_utility1-3) are accepted but are
      not part of the returned mapping.
    """
    return {
        'id': index,
        'updated_date': updated_date,
        'prefix': prefix,
        'fname': fname,
        'mname': mname,
        'lname': lname,
        'person_suffix': person_suffix,
        'person_utility1': person_utility1,
        'person_utility2': person_utility2,
        'person_utility3': person_utility3,
        'gender': gender,
        'position1': position1,
        'position2': position2,
        'position_metadata1': position_metadata1,
        'position_metadata2': position_metadata2,
        'org_name': org_name,
        'org_name2': org_name2,
        'org_name3': org_name3,
        'org_name4': org_name4,
        'org_name5': org_name5,
        'org_name6': org_name6,
        'address1_a': address1_a,
        'address2_a': address2_a,
        'address3_a': address3_a,
        'city_a': city_a,
        'state_a': state_a,
        'zip_a': zip_a,
        'country_a': country_a,
        'address_a_utility1': address_a_utility1,
        'address_a_utility2': address_a_utility2,
        'address_a_utility3': address_a_utility3,
        'address_a_utility4': address_a_utility4,
        'address_a_utility5': address_a_utility5,
        'address_a_utility6': address_a_utility6,
        'address_a_utility7': address_a_utility7,
        'address_a_utility8': address_a_utility8,
        'address1_b': address1_b,
        'address2_b': address2_b,
        'address3_b': address3_b,
        'city_b': city_b,
        'state_b': state_b,
        'zip_b': zip_b,
        'country_b': country_b,
        'address_b_utility1': address_b_utility1,
        'address_b_utility2': address_b_utility2,
        'address_b_utility3': address_b_utility3,
        'address_b_utility4': address_b_utility4,
        'address_b_utility5': address_b_utility5,
        'address_b_utility6': address_b_utility6,
        'address_b_utility7': address_b_utility7,
        'address_b_utility8': address_b_utility8,
        'address1_c': address1_c,
        'address2_c': address2_c,
        'address3_c': address3_c,
        'city_c': city_c,
        'state_c': state_c,
        'zip_c': zip_c,
        'country_c': country_c,
        'address_c_utility1': address_c_utility1,
        'address_c_utility2': address_c_utility2,
        'address_c_utility3': address_c_utility3,
        'address_c_utility4': address_c_utility4,
        'address_c_utility5': address_c_utility5,
        'address_c_utility6': address_c_utility6,
        'address_c_utility7': address_c_utility7,
        'address_c_utility8': address_c_utility8,
        'address1_d': address1_d,
        'address2_d': address2_d,
        'address3_d': address3_d,
        'city_d': city_d,
        'state_d': state_d,
        'zip_d': zip_d,
        'country_d': country_d,
        'address_d_utility1': address_d_utility1,
        'address_d_utility2': address_d_utility2,
        'address_d_utility3': address_d_utility3,
        'address_d_utility4': address_d_utility4,
        'address_d_utility5': address_d_utility5,
        'address_d_utility6': address_d_utility6,
        'address_d_utility7': address_d_utility7,
        'address_d_utility8': address_d_utility8,
        'email_a': email_a,
        'email_b': email_b,
        'email_c': email_c,
        'email_d': email_d,
        'website_a': website_a,
        'website_b': website_b,
        'website_c': website_c,
        'website_d': website_d,
        'phone_a': phone_a, 'areacode_a': areacode_a, 'phone_number_a': phone_number_a, 'extension_a': extension_a,
        'phone_a_utility1': phone_a_utility1, 'phone_a_utility2': phone_a_utility2, 'phone_a_utility3': phone_a_utility3,
        'phone_b': phone_b, 'areacode_b': areacode_b, 'phone_number_b': phone_number_b, 'extension_b': extension_b,
        'phone_b_utility1': phone_b_utility1, 'phone_b_utility2': phone_b_utility2, 'phone_b_utility3': phone_b_utility3,
        'phone_c': phone_c, 'areacode_c': areacode_c, 'phone_number_c': phone_number_c, 'extension_c': extension_c,
        'phone_c_utility1': phone_c_utility1, 'phone_c_utility2': phone_c_utility2, 'phone_c_utility3': phone_c_utility3,
        # The fax keys take the fax arguments; they previously picked up the
        # phone "a" area code and number by mistake.
        'fax_a': fax_a, 'fax_areacode_a': fax_areacode_a, 'fax_number_a': fax_number_a,
        'fax_a_utility1': fax_a_utility1, 'fax_a_utility2': fax_a_utility2, 'fax_a_utility3': fax_a_utility3,
        'utility1': utility1,
        'utility2': utility2,
        # Placeholder so the derived nickname-group column survives into scoring
        'nicks_groups': None,
    }

# --- Memory -----------------------------------------------------------------
chunk_size = 1_000_000        # rows that fit in memory at a time
small_chunk_size = 100_000    # smaller chunk for cells that need memory help

# --- Candidate discovery ----------------------------------------------------
token_match_min = 2           # min number of matched tokens to be a match candidate
token_limiter = 0.995         # keep the top x% scarcest non-single tokens (drops the most common (1-x)%)
unique_token_freq_max = 5     # max occurrences for a token to count as "unique" and be double-counted

# --- Scoring and grouping ---------------------------------------------------
# Line below which unlikely matches are culled from further scoring. Do not
# reduce to 0 because 0 is used to suppress; use '.01' to be maximally inclusive.
reduce_threshold = 0.6
match_threshold = 0.5         # composite score at which a match counts as a match
group_threshold = 0.5         # threshold for a valid grouping

# Discourages two groups from merging; 1 means "ignore". Useful when there is
# a reason to keep groups separate. Setting this higher than 1 may cause an
# error during dedup score calculation at the end - still to be resolved.
group_merge_limiter = 1

# --- Run mode ---------------------------------------------------------------
intermediary_dir = 'build'    # where intermediary files are held
organization_only = False     # ignore contact info; only use organization columns and features
compress = False              # True forces compression
entity_ID = False             # True resolves duplicates with multiple passes of grouping

# Values that mean male / female when the dataset has a gender column
male_value = 'Male'
female_value = 'Female'

# --- Token columns ----------------------------------------------------------
# Columns that, if present, are used to collect tokens.
# NOTE: whether a token type sits in person_token_columns or org_token_columns
# currently has no impact on candidate discovery. It DOES decide which model
# (person or org) sees the features at the prediction step.
_slots = ('a', 'b', 'c', 'd')
_address_token_fields = ('address1', 'city', 'state', 'zip')  # 'country' is left out on purpose

person_token_columns = (
    ['fname', 'meta_fname', 'nicks_groups', 'lname', 'meta_lname']
    + ['before_domain_' + slot for slot in _slots]  # derived from email
    + ['mname']
)

org_token_columns = (
    ['org_name', 'org_name2', 'org_name3']
    + [field + '_' + slot for slot in _slots for field in _address_token_fields]
    + ['clean_phone_a', 'clean_phone_b', 'clean_phone_c', 'clean_fax_a']
    + ['email_domain_' + slot for slot in _slots]
    + ['web_domain_' + slot for slot in _slots]
    + ['utility1', 'utility2']
)

full_token_columns = person_token_columns + org_token_columns

# In organization-only mode the person columns are treated as if they did not
# exist at all, so the list is blanked here (after full_token_columns is built).
if organization_only:
    person_token_columns = []
    #org_token_columns.append('clean_phone')

### Dataset Details

Here, the parameters for every dataset that goes into this program are set up. The `data_details` variable is a dictionary of options that gets fed to `pd.read_csv`, and `cols` is a dictionary generated by `dictate_columns` that holds the names of all the columns.

A couple are defined, one after the other. The last one overwrites the others, so only the last one is in effect. The other ones I leave as a record.

#### Golden record creation notes

Because reconciliation of duplicate records for the creation of golden records entails more detail than what is strictly required for duplicate identification, we should flesh out this step to the maximum degree.  The golden record code inherits a file containing all column mappings (to avoid having to do this step a second time).  This is why there are many "_utility" columns.  You should map ALL columns that should be considered together when merging duplicate records.  

For example, if there is an address_id column, we'd like to bring that along with its corresponding address details.  It should be mapped as an address_X_utility column.  Similarly, if there are any ancillary fields associated with basic contact data (e.g. nicknames, alternative organization names, category codes associated with an email column, etc.) these should be mapped.  Note also: Job titles should be mapped even though they are not considered for data matching.  

But you do NOT need to map any columns that were created during the transformer or infogroup (data axle) processor steps.  This is because the golden record code can identify these columns based on the known naming convention.

In [6]:
"""
Define the metadata for dataset
"""
# Options for reading the primary dataset (the surrounding notes say these are
# fed to pd.read_csv). 'nrows' caps how many rows are read.
data_details = {
    "filepath_or_buffer": "data/A3/bb_clean_w_infogroup w queening.csv",
    "sep": ",",
    'nrows':5_000
}
# Map each standard column name (keyword) to this dataset's column name (value).
# Commented-out entries are kept as a template: uncomment and fill in the ones
# that exist in the dataset being processed.
cols = dictate_columns(
    index='u_id', #don't recommend using an index column containing leading zeroes, ex. '00134'
    #updated_date='(Do Not Modify) Modified On',
    #
    org_name='PrimaryOrganizationName',
    #org_name2='Account Number (Parent Customer) (Account)', #note: include fields which can be generally prioritized based on count of characters
    #org_name3='X',
    #org_name4='X',
    #org_name5='X',
    #org_name6='X',
    #
    prefix='NamePrefix',
    fname='FirstName',
    mname='MiddleName',
    lname='LastName',
    #person_suffix='Suffix',
    person_utility1='InformalName', #include columns that should be married to corresponding name, like "nickname"
    #person_utility2='Birthday',
    #person_utility3='X',
    #
    #gender='Gender',
    #
    position1='PrimaryOrganizationTitle',
    #position2='X',
    #
    address1_a='AddressLines',
    address2_a='CarrierRoute',
    #address3_a='MailingState',
    city_a='CityName',
    state_a='CountrySubEntityCode',
    zip_a='PostalCode',
    country_a='CountryName',
    #address_a_utility1='Address 1 Country Code',
    #address_a_utility2='Address 1: Post Office Box',
    #address_a_utility3='PREFERRED_MAIL_directory',
    #address_a_utility4='PREFERRED_BILL_directory',
    #address_a_utility5='PREFERRED_SHIP_directory',
    #address_a_utility6='TIME_STAMP_directory',
    #address_a_utility7='ADDRESS_3_directory',
    #address_a_utility8='ADDRESS_3_directory',
    #
    #address1_b='ADDRESS_1_secondary',
    #address2_b='ADDRESS_2_secondary',
    #address3_b='ADDRESS_3_secondary',
    #city_b='CITY_secondary',
    #state_b='STATE_PROVINCE_secondary',
    #zip_b='ZIP_secondary',
    #country_b='COUNTRY_secondary',
    #address_b_utility1='ADDRESS_NUM_secondary',
    #address_b_utility2='BAD_ADDRESS_secondary',
    #address_b_utility3='PREFERRED_MAIL_secondary',
    #address_b_utility4='PREFERRED_BILL_secondary',
    #address_b_utility5='PREFERRED_SHIP_secondary',
    #address_b_utility6='TIME_STAMP_secondary',
    #address_b_utility7='ADDRESS_3_directory',
    #address_b_utility8='ADDRESS_3_directory',
    #
    #address1_c='ADDRESS_1_personal',
    #address2_c='ADDRESS_2_personal',
    #address3_c='ADDRESS_3_personal',
    #city_c='CITY_personal',
    #state_c='STATE_PROVINCE_personal',
    #zip_c='ZIP_personal',
    #country_c='COUNTRY_personal',
    #address_c_utility1='ADDRESS_NUM_personal',
    #address_c_utility2='BAD_ADDRESS_personal',
    #address_c_utility3='PREFERRED_MAIL_personal',
    #address_c_utility4='PREFERRED_BILL_personal',
    #address_c_utility5='PREFERRED_SHIP_personal',
    #address_c_utility6='TIME_STAMP_personal',
    #address_c_utility7='ADDRESS_3_directory',
    #address_c_utility8='ADDRESS_3_directory',
    #
    phone_a='Phone',
    #phone_a_utility1 = 'x',
    #phone_a_utility2 = 'x',
    #phone_a_utility3 = 'x',
    phone_b='Phones_Number',
   # phone_b_utility1 = 'Phone 1 Country',
    #phone_b_utility2 = 'Phone 1 Country Code',
    #phone_b_utility3 = 'x',
    phone_c='Phones_Number2',
    #phone_c_utility1 = 'x',
    #phone_c_utility2 = 'x',
    #phone_c_utility3 = 'x',
    #fax_a='fax',
    #fax_a_utility1 = 'Fax Country',
    #fax_a_utility2 = 'Fax Country Code',
    #fax_a_utility3 = 'x',
    #
    website_a='WebsiteUrl',
    #website_b='Personal Website',
    #
    email_a='Email',
    email_b='EmailsAddress',
    email_c='EmailsAddress2',
    #
    #utility1='CO_ID'
)


Finally, if we're looking for matches, we want to define the dataset details for the second dataset. If these are not defined, the program will act as a deduplicator. If they are defined, it will find matches between the two datasets.

Essentially, the program will check for the existence of the `data_alt_details` and `cols_alt` variables.

In [7]:
#note: this code is for reference in MATCHING projects only.  Not relevant to deduplication.
#to MATCH between two datasets, include alt data details below
#and make sure to #comment out the two del lines at bottom of this cell
#script checks for existence of data_alt_details & cols_alt in order to determine whether this is a matching exercise
# NOTE(review): the docstring below names the engy_prospects dataset, but the
# file path in data_alt_details points at a cupola orgs file - confirm which
# dataset is intended.
'''
Define the metadata for the engy_prospects dataset

This is defined as a find matches dataset. So in effect
it's still just deduplication, but this will allow us to
test out the match finding capabilities, and doublecheck
them against the deduplication function.
'''

# Read options for the second (alt) dataset; same shape as data_details.
data_alt_details = {
    "filepath_or_buffer": "data/SF & Cupola data match/all cupola orgs.csv",
    "sep": ",",
    #"nrows": 1_000,
}
# Column mapping for the alt dataset. Only organization, address "a", phone and
# website columns are mapped here; everything else is left commented out.
cols_alt = dictate_columns(
    index='organization_id', #don't recommend using an index column containing leading zeroes, ex. '00134'
    #updated_date='(Do Not Modify) Modified On',
    #
    org_name='name',
    org_name2='acronym', #note: include fields which can be generally prioritized based on count of characters
    org_name3='Alt_Names',
    #org_name4='X',
    #org_name5='X',
    #org_name6='X',
    #
    #prefix='PREFIX',
    #fname='FIRST_NAME',
    #mname='MIDDLE_NAME',
    #lname='LAST_NAME',
    #person_suffix='SUFFIX',
    #person_utility1='Nickname', #include columns that should be married to corresponding name, like "nickname"
    #person_utility2=' Full Name',
    #person_utility3='X',
    #
    #gender='X',
    #
    #position1='Job Title',
    #position2='X',
    #
    address1_a='address1',
    address2_a='address2',
    #address3_a='ADDRESS_3_directory',
    city_a='city',
    state_a='state',
    zip_a='postal_code',
    #country_a='COUNTRY_directory',
    #address_a_utility1='ADDRESS_NUM_directory',
    #address_a_utility2='BAD_ADDRESS_directory',
    #address_a_utility3='PREFERRED_MAIL_directory',
    #address_a_utility4='PREFERRED_BILL_directory',
    #address_a_utility5='PREFERRED_SHIP_directory',
    #address_a_utility6='TIME_STAMP_directory',
    #address_a_utility7='ADDRESS_3_directory',
    #address_a_utility8='ADDRESS_3_directory',
    #
    #address1_b='ADDRESS_1_secondary',
    #address2_b='ADDRESS_2_secondary',
    #address3_b='ADDRESS_3_secondary',
    #city_b='CITY_secondary',
    #state_b='STATE_PROVINCE_secondary',
    #zip_b='ZIP_secondary',
    #country_b='COUNTRY_secondary',
    #address_b_utility1='ADDRESS_NUM_secondary',
    #address_b_utility2='BAD_ADDRESS_secondary',
    #address_b_utility3='PREFERRED_MAIL_secondary',
    #address_b_utility4='PREFERRED_BILL_secondary',
    #address_b_utility5='PREFERRED_SHIP_secondary',
    #address_b_utility6='TIME_STAMP_secondary',
    #address_b_utility7='ADDRESS_3_directory',
    #address_b_utility8='ADDRESS_3_directory',
    #
    #address1_c='ADDRESS_1_personal',
    #address2_c='ADDRESS_2_personal',
    #address3_c='ADDRESS_3_personal',
    #city_c='CITY_personal',
    #state_c='STATE_PROVINCE_personal',
    #zip_c='ZIP_personal',
    #country_c='COUNTRY_personal',
    #address_c_utility1='ADDRESS_NUM_personal',
    #address_c_utility2='BAD_ADDRESS_personal',
    #address_c_utility3='PREFERRED_MAIL_personal',
    #address_c_utility4='PREFERRED_BILL_personal',
    #address_c_utility5='PREFERRED_SHIP_personal',
    #address_c_utility6='TIME_STAMP_personal',
    #address_c_utility7='ADDRESS_3_directory',
    #address_c_utility8='ADDRESS_3_directory',
    #
    phone_a='Phone',
    #phone_b='PHONE_personal',
    #phone_c='Home Phone',
    #fax_a='Fax',
    #
    website_a='Website',
    #website_b='Personal Website',
    #
    #email_a='EMAIL_directory',
    #email_b='EMAIL_personal',
    #
    #utility1='Grantor_ID_for token'
)

##### Heads up running this code:
#
# I fixed a bug where even during matching mode
# the candidate selection would filter out identical IDs
# but if we're matching we don't want to assume that they're
# the same. So that code is now under a `if not matching:`
# block. This means: Running this code will match every
# row to every row and take forever to run so uh don't run it.
#
# I did it while it was still broken and it worked because
# it was a roundabout deduplication so I could check that it
# works. Now though only use it for actual matching not dedups.

# Right now I want to run dedup but I don't want to delete
# the code, so I'll just erase the variables instead.

##################################################################
# Removing both names puts the run in deduplication mode; comment these two
# lines out to run a match against the alt dataset instead.
del data_alt_details
del cols_alt
##################################################################

In [8]:
"""
Decide whether this run is a matching run or a deduplication run.
"""

# Matching mode is signalled purely by both alt-dataset variables being
# defined; if either one is missing, the run is a deduplication.
matching = all(name in globals() for name in ("data_alt_details", "cols_alt"))

In [9]:
# Create string converters where appropriate.
#
# The string-column listing is looked up next to the data file. When it exists,
# every column named in its 'column' field gets `str` as its converter; when it
# is missing or unreadable, 'converters' is set to None.
#
# os.path is used to find the sibling file so that a data path without a
# directory part resolves to the working directory, not the filesystem root.
# `except Exception` keeps the lookup best-effort without also swallowing
# KeyboardInterrupt / SystemExit.
_STRING_COLUMNS_FILENAME = 'infogroup_processor_string_columns_df.csv'

try:
    string_cols_df_filepath = os.path.join(
        os.path.dirname(data_details['filepath_or_buffer']), _STRING_COLUMNS_FILENAME)
    string_columns_df = pd.read_csv(string_cols_df_filepath)
    string_columns = list(string_columns_df.column)
    converters = {col: str for col in string_columns}
    data_details.update({'converters': converters})
except Exception as err:
    print("No string converters for primary dataset:", err)
    data_details.update({'converters': None})
if matching:
    try:
        string_cols_alt_df_filepath = os.path.join(
            os.path.dirname(data_alt_details['filepath_or_buffer']), _STRING_COLUMNS_FILENAME)
        string_columns_alt = list(pd.read_csv(string_cols_alt_df_filepath).column)
        converters_alt = {col: str for col in string_columns_alt}
        data_alt_details.update({'converters': converters_alt})
    except Exception as err:
        print("No string converters for alt dataset:", err)
        data_alt_details.update({'converters': None})
    

In [10]:
# Reference only: the code below sits inside a string literal, so none of it
# is executed.
'''#don't think we need but this code is handy
#creating an inverse dictionary to return to original column names later
inv_col_dict = {d: c for c, d in cols.items()}
if matching:
    inv_alt_col_dict = {d: c for c, d in cols_alt.items()} #do I need to handle _l and _r suffixes?
'''

"#don't think we need but this code is handy\n#creating an inverse dictionary to return to original column names later\ninv_col_dict = {d: c for c, d in cols.items()}\nif matching:\n    inv_alt_col_dict = {d: c for c, d in cols_alt.items()} #do I need to handle _l and _r suffixes?\n"

### Building intermediary directory

This chunk checks the existence of the directory to hold all the intermediary files. It then creates a subdirectory within that directory to hold all the files for *this* dataset on *this* run.

Finally, it creates the `ns` function, that takes a filename and returns a filename for that file within the proper directory. The `ns` stands for `namespace`.

In [11]:
"""
Report whether a sub-directory override is in effect.

When resuming after a break or a Jupyter restart, `folder` can be set by
hand to an existing subfolder (e.g. "dataset.csv" or "orgfile.tsv3") so that
the run reuses it instead of creating a new one.
"""

if "folder" in globals():
    print("Overriding subdirectory selection: Using subdirectory", folder)
else:
    print("No overriding; will create new subdirectory")

No overriding; will create new subdirectory


In [12]:
"""
Create the intermediary directory tree and define `ns`.

`ns(file)` returns the path of `file` inside this run's sub-directory.
"""

# Top-level directory for all intermediary files
try:
    os.mkdir(intermediary_dir)
except FileExistsError:
    print("Directory", intermediary_dir, "found")
else:
    print("Directory", intermediary_dir, "created") 


if "folder" in globals():
    # A manual override was supplied: reuse that sub-directory as is
    print("Using existing sub-directory", folder)

    def ns(file):
        return os.path.join(intermediary_dir, folder, file)
else:
    # The sub-directory is named after the data file; for a matching task
    # it becomes "datasetA.csv_datasetB.csv"
    namespace = data_details['filepath_or_buffer'].rsplit("/", 1)[-1]
    if matching:
        namespace += "_" + data_alt_details['filepath_or_buffer'].rsplit("/", 1)[-1]

    # If the name is taken, tack on a counter: name -> name1 -> name2 -> ...
    unique = ""
    _subdir_created = False
    while not _subdir_created:
        try:
            os.mkdir(os.path.join(intermediary_dir, namespace + str(unique)))
        except FileExistsError:
            unique = 1 if unique == "" else unique + 1
        else:
            print("Sub-directory", namespace + str(unique), "created")
            _subdir_created = True

    def ns(file):
        return os.path.join(intermediary_dir, namespace + str(unique), file)

Directory build found
Sub-directory bb_clean_w_infogroup w queening.csv created


In [13]:
def process_chunks(chunks, chunk_func, outfile=None, silent=False, **kwargs):
    """
    Apply `chunk_func` to every chunk and optionally write the results out.

    `chunks` is an iterable of dataframes (typically what
    pd.read_csv(chunksize=X) yields). Each chunk has its nulls replaced with
    '' and is then handed to `chunk_func`. When `outfile` is given, every
    result is written to it: the first write creates the file with a header,
    later writes append without one.

    Any extra keyword arguments are forwarded to `to_csv` (e.g. `index=False`)
    and take precedence over the mode/header/compression defaults. Unless
    `silent` is set, per-chunk and total timings are printed.
    """
    run_started = time.time()
    total_rows = 0  # running row tally (not reported anywhere yet)
    first_write = True

    for count, chunk in enumerate(chunks):
        chunk_started = time.time()

        # Null handling that every freshly read chunk needs
        chunk = chunk.fillna('')
        processed = chunk_func(chunk)

        if outfile:
            # First write: overwrite the file and emit the header.
            # Later writes: append to the file and omit the header.
            write_args = {
                'mode': 'w' if first_write else 'a',
                'header': first_write,
                'compression': compression,
            }
            # Caller-supplied arguments win over the defaults above
            write_args.update(kwargs)
            processed.to_csv(outfile, **write_args)
            first_write = False

        total_rows += len(chunk)

        if not silent:
            print("Processed chunk", count,
                  round(time.time() - chunk_started, 2), "seconds")

    if not silent:
        print("Finished in", round(time.time() - run_started, 2), "seconds")

In [14]:
# Save the postal type dict forward for the next step (golden record creation).
#
# The dict is looked up next to the data file. os.path is used so that a data
# path without a directory part resolves to the working directory rather than
# the filesystem root.
postal_type_dict_filepath = os.path.join(
    os.path.dirname(data_details['filepath_or_buffer']), 'postal_type_dict.csv')
try:
    postal_type_dict = pd.read_csv(postal_type_dict_filepath) 
    postal_type_dict.to_csv(ns('postal_type_dict.csv'),index=False)
except Exception as err:
    # Still best-effort (the run carries on without the file), but the failure
    # is now reported instead of being swallowed silently.
    print("Postal type dict not carried forward from", postal_type_dict_filepath, "-", err)

# Dataset Preparation

### Do dataset-wide standardization
Standardize the dataset by globally lowercasing and filling in nulls. Then, for applicable columns, preprocess them to be fit for tokenizing and for scoring.

In [15]:
#deal with utf-8 encoding
import io
import os
import shutil


def _copy_as_utf8(source_path, target_path):
    """
    Write a UTF-8 copy of `source_path` to `target_path`.

    Bytes that are not valid UTF-8 are dropped (errors='ignore'). If source
    and target are the same file, nothing is done: this happens when the cell
    is re-run after the details dict was repointed at the copy, and opening
    the target for writing would then truncate the only copy of the data.
    """
    if os.path.abspath(source_path) == os.path.abspath(target_path):
        return
    with io.open(source_path, encoding='utf-8', errors='ignore') as source, \
            io.open(target_path, mode='w', encoding='utf-8') as target:
        shutil.copyfileobj(source, target)


_copy_as_utf8(data_details['filepath_or_buffer'], 'data/data1_utf.csv')
# From here on the primary dataset is read from the UTF-8 copy
data_details.update({'filepath_or_buffer':'data/data1_utf.csv'})

if matching:
    _copy_as_utf8(data_alt_details['filepath_or_buffer'], 'data/data2_utf.csv')
    # Likewise for the alt dataset
    data_alt_details.update({'filepath_or_buffer':'data/data2_utf.csv'})

FileNotFoundError: [Errno 2] No such file or directory: 'data/A3/bb_clean_w_infogroup w queening.csv'

In [16]:
# Display the input path currently configured for the primary dataset.
data_details['filepath_or_buffer']

'data/A3/bb_clean_w_infogroup w queening.csv'

In [None]:
# Define the compression variable based on input data size.
GZIP_ROW_THRESHOLD = 150_000


def _has_at_least(details, row_limit):
    """Return True when the CSV described by ``details`` has >= ``row_limit`` rows.

    The file is streamed in chunks and reading stops as soon as the limit is
    reached, so the whole file is never held in memory just to be counted.
    """
    reader = pd.read_csv(details['filepath_or_buffer'], sep=details['sep'],
                         chunksize=row_limit)
    seen = 0
    try:
        for part in reader:
            seen += len(part)
            if seen >= row_limit:
                return True
    finally:
        reader.close()
    return False


if compress is True:
    compression = 'gzip'
elif 'nrows' in data_details:
    # A row cap is configured for the read, so the inputs are not measured.
    compression = 'infer'
else:
    _inputs = [data_details, data_alt_details] if matching else [data_details]
    if any(_has_at_least(details, GZIP_ROW_THRESHOLD) for details in _inputs):
        compression = 'gzip'
    else:
        compression = 'infer'

compression

In [None]:
# Display the column mapping (standard name -> source column name) for a visual check.
cols

In [None]:
def init_data(data_details, cols, out_file="data.csv"):
    """Read a raw CSV in chunks, standardize every chunk and write it to ``ns(out_file)``.

    Parameters
    ----------
    data_details : dict
        Keyword arguments forwarded to ``pd.read_csv``.
    cols : dict
        Mapping of standard column name -> column name used in the raw file.
    out_file : str
        Name passed to ``ns`` to build the output location.
    """
    began = time.time()
    print ("LOADING INITIAL DATAFRAMES...")

    def _init_data(df):
        # cols is keyed by the standard name, so flip it to rename raw -> standard.
        raw_to_standard = {}
        for standard_name, column_name in cols.items():
            raw_to_standard[column_name] = standard_name

        cleaned = (df.rename(raw_to_standard, axis="columns")
                     .dropna(how='all')      # discard completely empty rows
                     .set_index('id'))       # the ID becomes the index
        # IDs are assumed unique: keep only the first row seen for each one.
        cleaned = cleaned[~cleaned.index.duplicated(keep='first')]
        # Replace every NA with an empty string, then lowercase and trim each field.
        cleaned = cleaned.fillna('')
        return cleaned.applymap(lambda field: str.strip(str.lower(field)))

    process_chunks(pd.read_csv(**data_details, chunksize=chunk_size,encoding='utf-8',
                               error_bad_lines=False, dtype=object),
                   _init_data,
                   ns(out_file))

    print("Dataframe loaded --- %s seconds ---" % (time.time() - began))

# When matching two datasets, standardize each one into its own file using its
# own column mapping; otherwise standardize the single dataset into the
# default "data.csv".
if matching:
    init_data(data_details, cols, "data_l.csv")
    init_data(data_alt_details, cols_alt, "data_r.csv")
else:
    init_data(data_details, cols)

In [None]:
a = 0  # marker read inside preprocess(): set to 1 while the alt (right-side) data is processed

def preprocess(in_file="data.csv", out_file="processed.csv", match_items=cols):
    """Compute queen scores and normalized matching columns, chunk by chunk.

    ``ns(in_file)`` is read in chunks; every chunk is run through
    ``_preprocess`` and written to ``ns(out_file)`` by ``process_chunks``.

    Parameters
    ----------
    in_file, out_file : str
        Names passed to ``ns`` to build the input and output locations.
    match_items : dict
        Mapping of standard column name -> source column name. It is updated
        in place: every derived column created here (full_name, bb_gender,
        clean_*, before_domain_*, email_domain_*, web_domain_*) is registered
        in it. While the global marker ``a`` equals 1 the same entries are
        also registered in the global ``cols_alt``.

    Notes
    -----
    All statistics (z-scores, maxima) are computed per chunk, because
    ``_preprocess`` only ever sees one chunk at a time.
    """
    start_time = time.time()
    print ("PRE-PROCESSING...")

    def _z_scores(values, population, missing, offset, invert=False):
        """Standardize ``values`` against ``population`` and add ``offset``.

        Entries equal to ``missing`` score 0. With ``invert=True`` values below
        the population mean score higher. When the population has fewer than
        two members, or no spread, no z-score exists; the remaining entries
        then receive just ``offset`` instead of crashing the whole chunk.
        """
        population = list(population)
        spread = stdev(population) if len(population) > 1 else 0
        mean = sum(population) / len(population) if population else 0
        scores = []
        for value in values:
            if value == missing:
                scores.append(0)
            elif not spread:
                scores.append(offset)
            else:
                deviation = (mean - value) if invert else (value - mean)
                scores.append((deviation / spread) + offset)
        return scores

    def _row_totals(df, columns, score_cell):
        """Return, for every row of ``df``, the sum of ``score_cell`` over ``columns``."""
        return [sum(score_cell(value) for value in row)
                for row in df[columns].values.tolist()]

    def _lookup_scorer(table):
        """Build a cell scorer that maps the cell's leading digit through ``table``."""
        def score(value):
            try:
                return table[int(value[0])]
            except (IndexError, KeyError, TypeError, ValueError):
                return 0  # empty, non-numeric or unknown codes contribute nothing
        return score

    def _safe_len(value):
        """Return ``len(value)``, or 0 for values that have no length."""
        try:
            return len(value)
        except TypeError:
            return 0

    def _register(derived, source, suffix):
        """Record the derived column ``derived`` in the column mapping(s)."""
        if derived not in match_items:
            match_items[derived] = match_items[source] + suffix
        if a == 1 and derived not in cols_alt:
            cols_alt[derived] = cols_alt[source] + suffix

    def _preprocess(df):
        """Add queen-score columns and normalized matching columns to one chunk."""
        ################QUEEN SCORE CALCULATIONS###############
        cbis_valid_cols = [col for col in df.columns if str(col)[-6:] == 'Valid?']
        email_val_cols = [col for col in df.columns if str(col)[-10:] == 'Risk Score']
        postal_val_cols = [col for col in df.columns if 'Mailability_Score' in col]
        coa_cols = [col for col in df.columns if 'Change of Address?' in col]

        if 'recent_activity_date' in df:
            import dateutil.parser
            from datetime import date
            today = date.today()
            today_days = (50 * 365) + (today.month * 30) + today.day #surely 50 years is enough

            queen_date_cols = [col for col in df.columns if 'recent_activity_date' in col]
            for n, col in enumerate(queen_date_cols, start=1):
                # Days between today and the most recent activity, as a
                # proportion of total days.
                delta_days = []
                for raw_date in df[col]:
                    try:
                        delta_days.append((today - dateutil.parser.parse(raw_date).date()).days / today_days)
                    except Exception:
                        # Best effort: anything dateutil cannot parse counts as
                        # missing. High values are penalized as this means it
                        # is an OLD transaction date.
                        delta_days.append(1)
                df['delta_days' + str(n)] = delta_days
                # Earlier date columns get the larger bonus (5 - n).
                df['z_score_delta_days' + str(n)] = _z_scores(
                    delta_days, [v for v in delta_days if v < 1],
                    missing=1, offset=5 - n, invert=True)
        else:
            df['z_score_delta_days1'] = 0

        if 'z_score_delta_days2' not in df: #add this in case it was not just created
            df['z_score_delta_days2'] = 0

        #handle date columns which are "future" dates, for example 'Membership_expire_date'
        if 'future_date' in df:
            import dateutil.parser
            from datetime import date
            today = date.today() #use to calculate number of days difference from today vs future date

            future_date_cols = [col for col in df.columns if 'future_date' in col]
            for n, col in enumerate(future_date_cols, start=1):
                delta_future_days = []
                for raw_date in df[col]:
                    try:
                        delta_future_days.append(int((dateutil.parser.parse(raw_date).date() - today).days))
                    except Exception:
                        # Best effort: unparseable dates count as missing.
                        delta_future_days.append(0)
                df['delta_future_days' + str(n)] = delta_future_days
                df['z_score_delta_future_days' + str(n)] = _z_scores(
                    delta_future_days, [v for v in delta_future_days if v != 0],
                    missing=0, offset=3 - n)
        else:
            df['z_score_delta_future_days1'] = 0

        if 'z_score_delta_future_days2' not in df:
            df['z_score_delta_future_days2'] = 0

        if 'transaction_count' in df:
            df['transaction_count'] = df['transaction_count'].fillna(0).replace('',0).astype(int)
            max_transactions = df.transaction_count.max(skipna=True)
            if max_transactions:
                transactions = [t / max_transactions for t in df.transaction_count]
            else:
                # No transactions at all in this chunk: nothing to scale by.
                transactions = [0] * len(df)
            df['transaction_score'] = transactions
            df['z_score_transactions'] = _z_scores(
                transactions, [v for v in transactions if v > 0],
                missing=0, offset=5)
        else:
            df['z_score_transactions'] = 0

        #summing any validity penalties from invalid data (postal addresses, bad domains, etc.)
        df['valid_penalties'] = _row_totals(
            df, cbis_valid_cols, lambda value: -1 if value == 'invalid' else 0)

        #summing validity scores for emails
        e_lkup = {1:1,2:.5,3:0,4:-2} #I think I don't want a 2 and a 3 together to beat a 1 by itself
        df['email_scores'] = _row_totals(df, email_val_cols, _lookup_scorer(e_lkup))

        #summing validity scores for postal addresses
        p_lkup = {1:1,2:.7,3:.25,4:-.5,5:-2} #I think I don't want a 2 and a 3 together to beat a 1 by itself
        df['postal_scores'] = _row_totals(df, postal_val_cols, _lookup_scorer(p_lkup))

        #change of address penalties
        df['coa_scores'] = _row_totals(
            df, coa_cols, lambda value: -.5 if value == 'y' else 0)

        #use whatever columns we are considering for matching as queen columns - columns we use to assess completeness
        #or, should we be using ALL columns?
        queen_cols = [key for key, source in match_items.items()
                      if source is not None and key != 'id']

        # Completeness: total characters (scaled) and share of non-empty fields.
        cell_lengths = [[_safe_len(value) for value in row]
                        for row in df[queen_cols].values.tolist()]
        field_total = len(queen_cols) or 1  # avoid dividing by zero when nothing is configured
        df['chars'] = [sum(lengths) * .001 for lengths in cell_lengths]
        df['fields'] = [sum(1 for length in lengths if length > 0) / field_total
                        for lengths in cell_lengths]

        #use index to virtually ensure that no two queen scores will be equal
        row_count = df.shape[0]
        df['index_queen_score'] = [(row_count - i) / (row_count * 10) for i in range(row_count)]

        #ad hoc queening criteria can be manually calculated and added here
        if 'priority_flag' in df:
            df['priority_flag'] = [100 if item == '1' else 0 for item in df['priority_flag']]
        else:
            df['priority_flag'] = 0

        # priority1 .. priority12: numeric when present (unparseable -> 0), else 0.
        priority_cols = ['priority' + str(i) for i in range(1, 13)]
        for col in priority_cols:
            if col not in df:
                df[col] = 0
            else:
                df[col] = pd.to_numeric(df[col], errors='coerce')
                df[col] = df[col].fillna(0).astype(float)

        #calculating composite queen score
        score_parts = ['z_score_delta_days1', 'z_score_delta_days2', 'z_score_transactions',
                       'z_score_delta_future_days1', 'z_score_delta_future_days2',
                       'valid_penalties', 'email_scores', 'postal_scores', 'coa_scores',
                       'chars', 'fields', 'priority_flag'] + priority_cols + ['index_queen_score']
        try:
            queen_score = df[score_parts[0]]
            for part in score_parts[1:]:
                queen_score = queen_score + df[part]
            df['queen_score'] = queen_score
        except Exception:
            # Fall back to the completeness-only score rather than losing the chunk.
            df['queen_score'] = df['chars'] + df['fields'] + df['index_queen_score']
            print('WARNING: Queen Score Failure - check calculation error')

        df['queen_score'] = df['queen_score'].astype(float)

        #I need to account for matching, probably need to re-think how queen cols are determined.  We would need to take
        #only coluns that are in both datasets.  or arguably, and more simply, just base on completeness factors.

        ##############END QUEEN SCORE CALCULATIONS##############

        # Standard data read prep
        df = df.fillna('')

        #creating col lists
        state_cols = [col for col in ['state_a','state_b', 'state_c','state_d'] if col in df]
        phone_cols = [col for col in ['phone_a','phone_b','phone_c','fax_a'] if col in df]
        email_cols = [col for col in ['email_a','email_b','email_c','email_d'] if col in df]
        website_cols = [col for col in ['website_a','website_b','website_c','website_d'] if col in df]
        org_cols = [col for col in ['org_name','org_name2','org_name3'] if col in df]

        # Normalize state codes
        for col in state_cols:
            df[col] = df[col].replace(regex=place_acronyms)
        for col in org_cols:
            df[col] = df[col].replace(regex=place_acronyms)

        # Identify nickname groups
        if 'fname' in df and not organization_only:
            df["nicks_groups"] = df["fname"].apply(lambda n: " ".join(map(lambda grp: "nick" + str(grp), nicks[n])))
            # Include phoneticization
            df['meta_fname'] = df['fname'].apply(metaphone)

        if 'lname' in df and not organization_only:
            df['meta_lname'] = df['lname'].apply(metaphone)

        #create full_name col to later use to assess name uniqueness
        if 'fname' in df and 'lname' in df and not organization_only:
            df['full_name'] = df['fname'] + ' ' + df['lname']
            if 'full_name' not in match_items:
                match_items['full_name'] = 'full_name'
            if a == 1 and 'full_name' not in cols_alt:
                cols_alt['full_name'] = 'full_name'

        #predict gender
        if 'fname' in df and not organization_only:
            gender_cols = [col for col in df.columns if col in ['gender','prefix','fname','mname']]
            import gender_guesser.detector as gender
            gender_pred = gender.Detector(case_sensitive=False)
            if 'gender' in gender_cols:
                gender_dict = {
                    male_value:-5,
                    female_value:5
                }
            gender_name_dict = {
                'male':-2,
                'mostly_male':-1,
                'andy':0,
                'unknown':0,
                'mostly_female':1,
                'female':2
            }
            female_prefixes = {'ms', 'mrs', 'miss'}
            bb_gender = []
            for index, row in df[gender_cols].iterrows():
                # Negative evidence points to male, positive to female.
                g = 0
                for col in gender_cols:
                    if (col == 'gender') and (row[col] in [male_value,female_value]): #check any pre-existing gender values
                        g = g + gender_dict[row[col]]
                    if (col == 'prefix') and len(str(row[col])) > 0:
                        prefix_values = row[col].replace('.','').replace(',','').lower().split()
                        if 'mr' in prefix_values:
                            g = g-5
                        elif female_prefixes.intersection(prefix_values):
                            g = g+5
                    if col in ['fname','mname']:
                        for name in row[col].split():
                            g = g + gender_name_dict[gender_pred.get_gender(name)]
                if g<0:
                    bb_gender.append('male')
                elif g>0:
                    bb_gender.append('female')
                else:
                    bb_gender.append(np.nan)
            df['bb_gender'] = bb_gender
            if 'bb_gender' not in match_items:
                match_items['bb_gender'] = 'bb_gender'

        # Reduce phone numbers to just numbers
        for col in phone_cols:
            col_name = 'clean_' + col
            df[col_name] = df[col].replace('[^0-9]', '',regex=True)
            _register(col_name, col, '_clean')

        # Build the membership set once instead of once per cell.
        personal = set(personal_domains)

        # Split emails into usernames and domains
        for col in email_cols:
            before_domain_col = 'before_domain_' + col[-1:]
            domain_col = 'email_domain_' + col[-1:]
            # reindex guarantees both parts exist even when no address in this
            # chunk contains an '@' (the split then yields a single column).
            parts = df[col].str.split('@', expand=True).reindex(columns=[0, 1])
            df[before_domain_col] = parts[0]
            # Drop all domains that are just personal domains
            df[domain_col] = parts[1].map(lambda d: '' if d in personal else d)
            _register(before_domain_col, col, '_before_domain')
            _register(domain_col, col, '_domain')

        for col in website_cols:
            domain_col = 'web_domain_' + col[-1:]
            domains = []
            for web in df[col]:
                if '@' in web: #handle domain extraction in case we want to use an email address
                    domain = web.split('@')[-1]
                    domains.append('' if domain in personal else domain)
                elif '/' in web or 'www' in web: #assuming we have a URL
                    host = web.split('//')[-1].split('/')[0]
                    # Remove a leading "www." prefix only. str.strip('www.')
                    # would strip any leading/trailing 'w' and '.' characters
                    # and mangle hosts such as "www.walmart.com".
                    if host.startswith('www.'):
                        host = host[len('www.'):]
                    domains.append(host)
                elif '.' in web: #handling cases where URL is already a domain
                    domains.append(web)
                else:
                    domains.append('')
            df[domain_col] = domains
            _register(domain_col, col, '_domain')

        return df

    process_chunks(pd.read_csv(ns(in_file), chunksize=chunk_size,encoding='utf-8',compression=compression,
                               dtype=object, index_col='id'),
                   _preprocess,
                   ns(out_file))

    print("states, phones, domains normalized --- %s seconds ---" % (time.time() - start_time))

# When matching, preprocess the left dataset first, then flip the global marker
# `a` so that the second call also registers derived columns in cols_alt.
# Otherwise preprocess the single dataset with the default file names.
if matching:
    preprocess("data_l.csv", "processed_l.csv", match_items=cols)
    a=1 #marker to indicate that we are working on alt data
    preprocess("data_r.csv", "processed_r.csv", match_items=cols_alt)
else:
    preprocess()

### Tokenize Dataset

Take our dataset and count every token it contains, then use those counts to build our tokenlists.

In [None]:
def tokenize(cell):
    """
    Split a cell's string contents into lowercase alphanumeric tokens.

    The text is lowercased and split on runs of non-word characters. Any
    remaining non-alphanumeric characters (e.g. underscores) are removed from
    each piece, and pieces that end up empty are discarded.

    So given one string "  Aaa bBb ccC ", this returns ["aaa", "bbb", "ccc"]

    None is returned unchanged.
    """
    if cell is None:
        return cell
    # re.split yields '' for leading/trailing separators, and a piece made only
    # of underscores becomes '' once filtered; neither is a real token.
    pieces = (''.join(filter(str.isalnum, word)) for word in re.split(r'\W+', cell.lower()))
    return [piece for piece in pieces if piece]

def extract_tokens(df):
    """
    A chunk processing function.
    For each chunk, select all relevant columns
        -> df.columns.isin(...) chooses all the columns in df that are
           included in full_token_columns
    and, for every single cell, 1) tokenize it and 2) add its tokens to the
    global ``tokens`` counter with tokens.update(...).

    The cells are walked with a plain loop (column by column) rather than
    applymap: the work here is a side effect, and pandas does not promise to
    call a function handed to apply/applymap exactly once per cell, so counts
    accumulated through it can come out inflated.

    Returns nothing; only ``tokens`` is updated.

    FYI: Counter is a dict(). Counter.update updates the counts for the
    passed in values, e.g. Counter.update([1,2,3,2,1]) -> {1: 2, 2: 2, 3: 1}
    """
    df = df.fillna('')
    selected = df.loc[:, df.columns.isin(full_token_columns)]
    # .values.T iterates one column at a time.
    for column_values in selected.values.T:
        for cell in column_values:
            tokens.update(tokenize(cell))

# Running count of every token seen (lowercased, alphanumeric only).
tokens = Counter()

# These passes through process_chunks write nothing to a file; their only
# purpose is to fill up the tokens Counter dictionary.
for _processed_name in (["processed_l.csv", "processed_r.csv"] if matching else ["processed.csv"]):
    process_chunks(pd.read_csv(ns(_processed_name), chunksize=chunk_size,encoding='utf-8',compression=compression,
                               dtype=object, index_col='id'),
                   extract_tokens)

In [None]:
# Turn the token counter into a two-column frame: token, count.
token_dict = dict(tokens)
tok_df = pd.DataFrame(list(token_dict.values()), index=token_dict.keys(), columns=['count'])
tok_df.reset_index(inplace=True)
tok_df.rename(columns={'index':'token'},inplace=True)
# Filter out single-instance tokens
tok_df = tok_df.loc[tok_df['count'] > 1]
# Filter out tokens over the commonality threshold set by token_limiter:
# sort ascending by count and renumber the rows, so that the row position can
# be compared against len(tok_df) * token_limiter.
tok_df = tok_df.sort_values('count')
tok_df = tok_df.reset_index(drop=True)
# Rows at or past the cutoff (the most frequent tokens) become stopwords ...
stopwords = list(tok_df[tok_df.index >= (len(tok_df) * token_limiter)].token.drop_duplicates()) #consider using for CountVectorizer?
# ... and only the rows before the cutoff stay in the vocabulary.
tok_df = tok_df[tok_df.index < (len(tok_df) * token_limiter)]

# CountVectorizer is given a fixed vocabulary (the surviving tokens, run through
# its own preprocessor) and binary=True. In count_freqs it is fed one string per
# row (the row's token columns joined with ' '), yielding a row x vocabulary
# matrix whose non-zero entries mark which vocabulary tokens occur in each row.
cv = CountVectorizer(binary=True)
vocab = np.array(list(map(cv.build_preprocessor(), tok_df['token'].unique())), dtype=object)
cv.set_params(vocabulary=vocab)

def count_freqs(df):
    """Chunk function: list which vocabulary tokens occur in each row.

    Returns a de-duplicated DataFrame with the columns ``token``,
    ``token_id`` (position of the token in ``vocab``) and ``id`` (the row's
    index value), one row per non-zero entry of the vectorizer output.
    """
    # Only the token columns that this chunk actually has.
    present_cols = [name for name in full_token_columns if name in df]

    # One space-joined string per row, fed to the fixed-vocabulary vectorizer.
    row_values = df[present_cols].fillna('').values.tolist()
    row_texts = pd.Series(row_values).str.join(' ')
    hits = cv.fit_transform(row_texts)

    # Coordinates of the non-zero entries: (row position, vocabulary position).
    row_index, vocab_index = hits.nonzero()

    found = pd.DataFrame({"token": vocab[vocab_index],
                          "token_id": vocab_index,
                          "id": df.index.values[row_index]})
    return found.drop_duplicates()

# Write one tokenlist file per processed dataset: left and right when matching,
# a single file otherwise.
for _processed_name, _tokenlist_name in (
        [("processed_l.csv", "tokenlists_l.csv"), ("processed_r.csv", "tokenlists_r.csv")]
        if matching else [("processed.csv", "tokenlists.csv")]):
    process_chunks(pd.read_csv(ns(_processed_name), chunksize=chunk_size,encoding='utf-8',compression=compression,
                               dtype=object, index_col='id'),
                   count_freqs,
                   ns(_tokenlist_name),
                   index=False)

# Calculate bonus tokens: take the tokens from tok_df whose count is at most
# unique_token_freq_max and translate them to their position in vocab with a
# token -> position dictionary and .map(dict).
# The dictionary is built once (a list.index call per token rescans the whole
# vocabulary every time); setdefault keeps the first position of a token.
# Clean it up by dropping NAs (tokens not found in vocab), making it int type
# instead of float, and grab just the values as a numpy.array
_token_positions = {}
for _position, _token in enumerate(vocab.tolist()):
    _token_positions.setdefault(_token, _position)

bonus_toks = tok_df['token'][tok_df['count'] <= unique_token_freq_max] \
                 .map(_token_positions) \
                 .dropna() \
                 .astype(int) \
                 .values

In [None]:
# Display the tokens that fell at or above the token_limiter cutoff.
stopwords

### Generate Candidates

Take the tokenlists and join them chunk by chunk until a candidate list is formed. Then add all the feature data back in once the IDs for the candidates are determined.

In [None]:
# Announce the candidate-selection stage and start its timer.
print('SELECTING CANDIDATES')
start_time = time.time()

'''
#old way of creating pair, we retired because it seemed to be creating improper pair IDs 
#that then led to missing valid candidates

def unique_id_pairs(df, id_l='id_l', id_r='id_r', sep=":"):
    """
    Given a dataframe with the given two ID columns, creates a single ID "footprint" such that
    IDA, IDB and IDB, IDA have the same footprint. Basically IDA and IDB are sorted alphabetically
    and then joined with the separator.
    
    e.g. {id_l: 10, id_r: 20, sep: ":"} -> "10:20", and {id_l: 20, id_r: 10, sep: ":"} -> "10:20"
    """
    # df[[id_l, id_r]].to_numpy(copy=True)
    #     Take the two columns and convert them to numpy arrays, to get [[id_l1, id_r1], [id_l2, id_r2], ...]
    # np.sort(...)
    #     Sort each ID pair in the list, so that each sublist pair is in alphabetical order
    # apply_along_axis(sep.join, -1, _)
    #     For each sublist pair, join them together with the separator
    return np.apply_along_axis(sep.join, -1, np.sort(df[[id_l, id_r]].to_numpy(copy=True)))
'''
# Change where the right side comes from depending on whether we're matching:
# the right-hand tokenlist when matching two datasets, otherwise the dataset's
# own tokenlist (deduplication against itself).
# Only the id and token_id columns are loaded, and the frame is indexed by
# token_id so that it can be joined on that index.
tokens_right = pd.read_csv(ns('tokenlists_r.csv' if matching else 'tokenlists.csv'),encoding='utf-8',compression=compression,
                           usecols=['id', 'token_id'], dtype={'id': object, 'token_id': int}) \
                 .set_index('token_id')

p=0
def select_candidates(chunk):
    """
    Build the candidate ID pairs for one chunk of the left-side tokenlist.

    The chunk is joined to tokens_right on token_id. Rows whose token is in
    bonus_toks are appended a second time, so a shared bonus token counts twice.
    Every (id_l, id_r) combination with at least token_match_min joined rows
    becomes a candidate. When not matching, self pairs (id_l == id_r) are
    removed. Each candidate gets a 'pair' key made of the two IDs in sorted
    order joined by ':', and only the first row per key is kept.

    Side effects: the globals p and pairs remember every pair key emitted so
    far, so a pair already returned for an earlier chunk is not returned again.

    Returns a DataFrame with the columns id_l, id_r and pair (possibly empty).
    """
    global p
    global pairs
    joined = chunk.set_index('token_id').join(tokens_right, how='inner', lsuffix='_l', rsuffix='_r')

    # Get all rows that are bonus tokens, then copy them and add them to the joined again
    bonuses = np.intersect1d(bonus_toks, joined.index.values)
    joined = pd.concat([joined, joined.loc[bonuses].copy()])

    match_count = joined.groupby(['id_l', 'id_r']).size()
    match_count = match_count[match_count >= token_match_min]

    # Only the ID pairs are needed from here on, not the counts.
    candidates = match_count.index.to_frame(index=False)

    if not matching:
        # Get rid of self matches
        candidates = candidates[candidates['id_l'] != candidates['id_r']]

    # Get rid of flip-flopped duplicate matches: the key is the two IDs in
    # sorted order. Built column-wise, which also works when no candidate is
    # left (a row-wise apply on an empty frame does not return a Series).
    # NOTE(review): this is also applied when matching two files; if both files
    # can use the same ID values, (A, B) and (B, A) are distinct record pairs
    # and one of them is dropped here - confirm this is intended.
    left_ids = candidates['id_l']
    right_ids = candidates['id_r']
    swapped = left_ids > right_ids
    low = left_ids.where(~swapped, right_ids)
    high = right_ids.where(~swapped, left_ids)
    candidates = candidates.assign(pair=low + ':' + high).drop_duplicates("pair")

    if p == 0:
        # First chunk: start the registry of pairs seen so far.
        pairs = candidates[['pair']]
        p = 1
    else:
        # Later chunks: drop pairs already produced by an earlier chunk, then
        # remember the new ones.
        candidates = candidates[~candidates['pair'].isin(pairs.pair)]
        pairs = pd.concat([pairs, candidates[['pair']]], ignore_index=True)
        pairs = pairs.drop_duplicates('pair')

    return candidates.reset_index(drop=True)


#print("Expected chunks:")
#print(int(pd.read_csv(ns('tokenlists_l.csv' if matching else 'tokenlists.csv')).shape[0] / chunk_size ** 0.5))
#print('')
# Stream the left-side tokenlist through select_candidates and write the result
# to candidates.csv. The chunk is the square root of chunk_size because every
# chunk is joined against the whole right-side tokenlist. read_csv requires an
# integer chunksize, so the root is truncated (and kept at 1 or more).
process_chunks(pd.read_csv(ns('tokenlists_l.csv' if matching else 'tokenlists.csv'),encoding='utf-8',compression=compression,
                           chunksize=max(1, int(chunk_size ** 0.5)), usecols=['id', 'token_id'],
                           dtype={'id': object, 'token_id': int}),
               select_candidates,
               ns("candidates.csv"),
               index=False)
print("Candidates selected in", time.time() - start_time, "seconds")

In [None]:
len(pairs)
#consider using tuple rather than string pairs to conserve memory, I think it is more efficient
#but need to confirm that we can use tuples the same way that pairs are being used at present

In [None]:
#while this is clear in my head
#there is a problem here wherein duplicate candidate pairs are not being removed due to being outside of a given chunk (I think)
#reason is we continuously match against the full tokenlist so same unique pair may be added again in a different chunk
#potential solutions:
#1. store complete list of unique pairs in memory and check against it <--going with this for now
#2. dynamically slide down in tokenlist index so we don't re-compare the same two records again

In [None]:
start_time = time.time()
print ("MERGING MATCH CANDIDATES WITH ORIGINAL DATA...")

# The left side always uses the keys of cols, suffixed with '_l'.
left_cols = [col + '_l' for col in cols.keys()]

if matching:
    # Two files: the right side has its own processed file and its own column
    # dictionary (cols_alt).
    right_cols = [col + '_r' for col in cols_alt.keys()]

    left_df = pd.read_csv(ns("processed_l.csv"), encoding='utf-8', compression=compression,
                          dtype=object)
    left_df = left_df.set_index('id').fillna('')
    left_df = left_df.rename(columns=dict(zip(cols.keys(), left_cols)))
    # Keep only the renamed (suffixed) columns.
    left_df = left_df.loc[:, left_df.columns.isin(left_cols)]

    right_df = pd.read_csv(ns("processed_r.csv"), encoding='utf-8', compression=compression,
                           dtype=object)
    right_df = right_df.set_index('id').fillna('')
    right_df = right_df.rename(columns=dict(zip(cols_alt.keys(), right_cols)))
    right_df = right_df.loc[:, right_df.columns.isin(right_cols)]
else:
    # One file: both sides are views of the same processed data, differing only
    # in the suffix given to the columns named by cols.
    right_cols = [col + '_r' for col in cols.keys()]

    df = pd.read_csv(ns("processed.csv"), encoding='utf-8', compression=compression,
                     dtype=object)
    df = df.set_index('id').fillna('')

    left_df = df.rename(columns=dict(zip(cols.keys(), left_cols)))
    left_df = left_df.loc[:, left_df.columns.isin(left_cols)]

    right_df = df.rename(columns=dict(zip(cols.keys(), right_cols)))
    right_df = right_df.loc[:, right_df.columns.isin(right_cols)]

def merge_data(chunk):
    """
    Attach the left and right record data to a chunk of candidate pairs.

    chunk is merged with left_df on id_l and then with right_df on id_r, both
    against the frames' indexes. The merges use the pandas default (inner), so
    a candidate whose ID is missing from either frame is dropped.
    """
    with_left = chunk.merge(left_df, left_on='id_l', right_index=True)
    return with_left.merge(right_df, left_on='id_r', right_index=True)

#print("Expected chunks:")
#print(int(pd.read_csv(ns('candidates.csv')).shape[0] / chunk_size ))
#print('')
# Read candidates.csv in chunks of small_chunk_size rows, pass each chunk to
# merge_data and write the output to candidata.csv.
# NOTE(review): process_chunks is defined outside this section; presumably it
# applies the function to each chunk and appends the result to the output file.
process_chunks(pd.read_csv(ns('candidates.csv'),encoding='utf-8',compression=compression,
                           chunksize=small_chunk_size, dtype=object), #small chunk size because this cell is doing a large merge
               merge_data,
               ns("candidata.csv"),index=False)

# Elapsed time is measured from the start_time set at the top of this cell.
print("original data merged with matches --- %s seconds ---" % (time.time() - start_time))

# Scoring

Scoring is done in three mostly arbitrary chunks.
1. Organization name similarity
    - This is done with Jaro-Winkler and three fuzz-type tests
2. Organization name sequence uniqueness
3. Various field similarities
    - names, states, cities, zips, etc.
    - These depend on the presence of the required columns in the data in the first place

In [None]:
start_time = time.time()
print ("SCORING ORG, PERSON NAME SIMULARITY...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#scoring match candidates based on edit distance of org names
def score_compare(left, right, method):
    """
    Score how similar two strings are using the named method.

    method is one of 'jaro' (jaro_winkler), 'fuzz-partial', 'fuzz-sort' or
    'fuzz-set' (the fuzz ratios, divided by 100). Returns NaN when either
    string is empty. Raises ValueError for any other method name, but only when
    both strings are non-empty.
    """
    if len(left) == 0 or len(right) == 0:
        return np.nan

    # Each scorer is wrapped in a lambda so only the requested one is evaluated.
    scorers = {
        "jaro": lambda: jaro_winkler(left, right),
        "fuzz-partial": lambda: fuzz.partial_ratio(left, right) / 100,
        "fuzz-sort": lambda: fuzz.token_sort_ratio(left, right) / 100,
        "fuzz-set": lambda: fuzz.token_set_ratio(left, right) / 100,
    }
    if method not in scorers:
        raise ValueError("Method {} unknown; " \
                         "must be 'jaro', 'fuzz-partial', 'fuzz-sort' or 'fuzz-set'".format(method))
    return scorers[method]()

def name_match(left, right):
    """
    Jaro score of two names after removing every non-alphanumeric character.

    Spaces and special characters are removed from both sides first. Returns
    NaN when either side has no alphanumeric character left, so two names made
    only of symbols are not scored.
    """
    left_clean = ''.join(ch for ch in left if ch.isalnum())
    right_clean = ''.join(ch for ch in right if ch.isalnum())
    if len(left_clean) == 0 or len(right_clean) == 0:
        return np.nan
    return score_compare(left_clean, right_clean, 'jaro')

def get_field(data, field, left_suffix='_l', right_suffix='_r'):
    """
    Return the left and right values of one field from a candidate row.

    Looks up field + left_suffix and field + right_suffix in data and returns
    them as a (left, right) tuple, e.g. get_field(row, 'fname') returns
    (row['fname_l'], row['fname_r']).
    """
    left_value = data[field + left_suffix]
    right_value = data[field + right_suffix]
    return (left_value, right_value)

def get_org_data(data, left_suffix='_l', right_suffix='_r'):
    """
    Gather the org name-type values of a row into one string per side.

    For each of org_name, org_name2 and org_name3 that is present in data with
    the given suffix, the value is collected; the collected values are joined
    with single spaces. Returns a (left, right) tuple of strings.
    """
    org_fields = ('org_name', 'org_name2', 'org_name3')
    left_cols = [name + left_suffix for name in org_fields if name + left_suffix in data]
    right_cols = [name + right_suffix for name in org_fields if name + right_suffix in data]
    left_text = ' '.join(data[left_cols])
    right_text = ' '.join(data[right_cols])
    return (left_text, right_text)
    
def dedup_list(x):
    """Return the items of x as a list without repeats, keeping first-seen order."""
    seen = set()
    unique = []
    for item in x:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique

def get_multi_field(data, field, left_suffix='l', right_suffix='r'):
    """
    Collect the values of a field that can occur several times per record
    (ex: phones, addresses).

    The occurrences are named field + '_a_' / '_b_' / '_c_' / '_d_' + suffix.
    Only the occurrences present in data are read. Returns a (left, right)
    tuple of lists with duplicate values removed.
    """
    slots = ('_a_', '_b_', '_c_', '_d_')
    left_cols = [field + slot + left_suffix for slot in slots
                 if field + slot + left_suffix in data]
    right_cols = [field + slot + right_suffix for slot in slots
                  if field + slot + right_suffix in data]
    left_values = dedup_list(data[left_cols])
    right_values = dedup_list(data[right_cols])
    return (left_values, right_values)

def get_domain_fields(data, email, web, left_suffix='l',right_suffix='r'):
    """
    Collect domain values, which are a special case because they come from
    both the email and the website fields.

    For each of the slots '_a_', '_b_', '_c_', '_d_' the email column and then
    the web column are read when present in data. Returns a (left, right) tuple
    of lists with duplicate values removed.
    """
    slots = ('_a_', '_b_', '_c_', '_d_')
    sources = (email, web)
    left_cols = [source + slot + left_suffix
                 for slot in slots for source in sources
                 if source + slot + left_suffix in data]
    right_cols = [source + slot + right_suffix
                  for slot in slots for source in sources
                  if source + slot + right_suffix in data]
    left_values = dedup_list(data[left_cols])
    right_values = dedup_list(data[right_cols])
    return (left_values, right_values)

'''#removing as this is redundant with how we now handle gender
personal_prefixes = ['mr','mrs','ms','miss']
def prefix_match(left, right):
    if len(left) > 0 and len(right) > 0:
        l_prefix = left.replace('.','').replace(',','').split()
        l_personal_prefix = set.intersection(set(l_prefix),personal_prefixes)
        if len(l_personal_prefix) > 0:
            r_prefix = right.replace('.','').replace(',','').split()
            r_personal_prefix = set.intersection(set(r_prefix),personal_prefixes)
            if len(r_personal_prefix) > 0: #so now we at least know there is personal prefixes on both sides
                if ('mr' in l_personal_prefix and 'mr' not in r_personal_prefix) | ('mr' in r_personal_prefix and 'mr' not in l_personal_prefix):
                    return 0
                else:
                    return 1
            else:
                return np.nan
        else:
            return np.nan
    else:
        return np.nan
'''

personal_suffixes = ['ii','iii','iv','v','jr','sr']
def suffix_match(left, right):
    """
    Compare the generational suffixes (ii, iii, iv, v, jr, sr) of two values.

    Periods and commas are removed and the text is split on whitespace before
    looking for the suffixes. Returns 1 when both sides carry a suffix and
    share at least one, 0 when both carry a suffix but share none, and NaN when
    either side is empty or has none of the listed suffixes.
    """
    if len(left) == 0 or len(right) == 0:
        return np.nan

    def listed_suffixes(text):
        words = text.replace('.', '').replace(',', '').split()
        return set(words).intersection(personal_suffixes)

    left_found = listed_suffixes(left)
    if len(left_found) == 0:
        return np.nan
    right_found = listed_suffixes(right)
    if len(right_found) == 0:
        return np.nan

    return 1 if len(left_found & right_found) > 0 else 0
    
def m_initial_rejection(left, right):
    """
    Compare two middle names when both are a single character.

    Commas are removed first. Returns 1 when both sides are one character long
    and equal, 0 when both are one character long and differ, and NaN in every
    other case (either side empty, or either side longer than one character).
    """
    if len(left) == 0 or len(right) == 0:
        return np.nan

    left_initial = left.replace(',', '')
    right_initial = right.replace(',', '')
    if len(left_initial) != 1 or len(right_initial) != 1:
        return np.nan

    return 1 if left_initial == right_initial else 0
    
def gender_match(left, right):
    """Return 1 if both values are non-empty and equal, 0 if non-empty and different, NaN if either is empty."""
    if len(left) == 0 or len(right) == 0:
        return np.nan
    return 1 if left == right else 0

subsidiary_words = ['chapter','division', 'department']
def org_word_mismatch(left, right):
    """
    Flag organization names that differ in state or subsidiary keywords.

    Both names are split on whitespace and checked for words found in
    us_states['acronym'] and for the words in subsidiary_words.

    Returns:
        0      when only one side contains a state word, or only one side
               contains a subsidiary word, or both contain state words but
               share none;
        1      when both sides contain state words and share at least one;
        np.nan when neither rule applies, or when either name is empty.
    """
    # An empty name gives nothing to compare; return NaN explicitly instead of
    # falling off the end of the function and returning None.
    if len(left) == 0 or len(right) == 0:
        return np.nan

    left_words = set(left.split())
    right_words = set(right.split())

    left_states = left_words.intersection(us_states['acronym'])
    right_states = right_words.intersection(us_states['acronym'])
    left_has_state = len(left_states) > 0
    right_has_state = len(right_states) > 0

    left_has_subsidiary = len(left_words.intersection(subsidiary_words)) > 0
    right_has_subsidiary = len(right_words.intersection(subsidiary_words)) > 0

    #check if there are mis-matched keywords (present on one side only)
    if left_has_state != right_has_state:
        return 0
    if left_has_subsidiary != right_has_subsidiary:
        return 0

    #retain matches that share the same states
    if left_has_state and right_has_state:
        return 1 if len(left_states & right_states) > 0 else 0

    #ignore all others
    return np.nan
        
def _timed_row_score(df, column, row_scorer, banner, done_label):
    """Print banner, store row_scorer applied to every row in df[column], then print the elapsed time."""
    started = time.time()
    print (banner)
    df[column] = df.apply(row_scorer, axis='columns')
    print("\t%s --- %s seconds ---" % (done_label, round(time.time() - started, 2)))


def name_scores(df):
    """
    Add the name-based score columns and a preliminary score to a chunk.

    Columns are only added when the required '_l' input column is present:
        jaro_score           org_name, via score_compare(..., 'jaro')
        fname_match          fname, via name_match            (people only)
        lname_match          lname, via name_match            (people only)
        suffix_match         person_suffix                    (people only)
        m_initial_rejection  mname                            (people only)
        gender_match         bb_gender                        (people only)
        org_word_mismatch    org name data       (organization_only only)
    "People only" means the global organization_only is falsy.

    prelim_score is the mean of jaro_score / fname_match / lname_match, where a
    missing value counts as 1. It is forced to 0 when any of the rejection
    columns (suffix, middle initial, gender, org word) equals 0. When none of
    the three score columns exists, prelim_score is 1.

    The frame is modified in place and also returned.
    """
    prelim_score_cols = []
    reject_cols = []

    if 'org_name_l' in df:
        jaro_time = time.time()
        df['jaro_score'] = df.apply(lambda row:
            score_compare(*get_field(row, 'org_name'), 'jaro'),
            axis="columns")
        prelim_score_cols.append('jaro_score')

        print("\tjaro scores done --- %s seconds ---" % (time.time() - jaro_time))

        # The fuzz-partial / fuzz-sort / fuzz-set scores of org_name are
        # intentionally not computed: they have no major effect on outcomes
        # and require a lot of compute.

    if 'fname_l' in df and not organization_only:
        _timed_row_score(df, 'fname_match',
                         lambda row: name_match(*get_field(row, 'fname')),
                         "\tCHECKING FOR FIRST NAME MATCHES...",
                         "fname values compared")
        prelim_score_cols.append('fname_match')

    if 'lname_l' in df and not organization_only:
        _timed_row_score(df, 'lname_match',
                         lambda row: name_match(*get_field(row, 'lname')),
                         "\tCHECKING FOR LAST NAME MATCHES...",
                         "lname values compared")
        prelim_score_cols.append('lname_match')

    # Prefix matching is not done here: it is redundant with the gender match.

    if 'person_suffix_l' in df and not organization_only:
        _timed_row_score(df, 'suffix_match',
                         lambda row: suffix_match(*get_field(row, 'person_suffix')),
                         "\tCHECKING FOR SUFFIX MATCHES...",
                         "suffix values compared")
        reject_cols.append('suffix_match')

    if 'mname_l' in df and not organization_only:
        _timed_row_score(df, 'm_initial_rejection',
                         lambda row: m_initial_rejection(*get_field(row, 'mname')),
                         "\tCHECKING FOR MIDDLE INITIAL MIS-MATCHES...",
                         "mname values compared")
        reject_cols.append('m_initial_rejection')

    if 'bb_gender_l' in df and not organization_only:
        _timed_row_score(df, 'gender_match',
                         lambda row: gender_match(*get_field(row, 'bb_gender')),
                         "\tCHECKING FOR GENDER MATCHES...",
                         "gender values compared")
        reject_cols.append('gender_match')

    if 'org_name_l' in df and organization_only:
        _timed_row_score(df, 'org_word_mismatch',
                         lambda row: org_word_mismatch(*get_org_data(row)),
                         "\tCHECKING FOR REGIONAL/SUBSIDIARY MIS-MATCHES...",
                         "org names compared")
        reject_cols.append('org_word_mismatch')

    #calculating preliminary score that we will use to suppress unlikely matches from further calculation
    #arguably I should also include some "score boosters" like email is exact match, we dont want to suppress
    start_time = time.time()
    if len(prelim_score_cols)>0:
        # A missing score counts as a perfect 1 so that absent data never
        # suppresses a candidate. Computed column-wise instead of row by row.
        totals = df[prelim_score_cols].fillna(1).astype(float).sum(axis="columns")
        if len(reject_cols)>0:
            #change value to 0 for any rejection criteria
            rejected = (df[reject_cols] == 0).any(axis="columns")
            totals = totals.mask(rejected, 0)
        df['prelim_score'] = totals / len(prelim_score_cols)
    else:
        df['prelim_score'] = 1

    print("\tprelim score calculated --- %s seconds ---" % round(time.time() - start_time, 2))

    return df
#print("Expected chunks:")
#print(int(pd.read_csv(ns('candidata.csv')).shape[0] / chunk_size ))
#print('')
# Read candidata.csv in chunks of chunk_size rows (all columns as strings),
# pass each chunk to name_scores and write the output to scored-1.csv.
process_chunks(pd.read_csv(ns('candidata.csv'),encoding='utf-8',compression=compression,
                           chunksize=chunk_size, dtype=object),
               name_scores,
               ns("scored-1.csv"),
               index=False)

# Elapsed time is measured from the start_time set at the top of this cell.
print("name simularity scored --- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print ("REDUCE, REMOVING UNLIKELY MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

def reduce_matches(df):
    """Keep only the rows whose prelim_score is at least reduce_threshold."""
    likely = df['prelim_score'] >= reduce_threshold
    return df.loc[likely]

# Read scored-1.csv in chunks, pass each chunk to reduce_matches and write the
# output to scored-1_reduced.csv. dtype=object is deliberately left off here
# so that pandas infers the column types and prelim_score can be compared with
# the numeric threshold.
process_chunks(pd.read_csv(ns('scored-1.csv'),encoding='utf-8',compression=compression,
                           chunksize=chunk_size),# dtype=object),
               reduce_matches,
               ns("scored-1_reduced.csv"),
               index=False)

# Elapsed time is measured from the start_time set at the top of this cell.
print("match candidates reduced --- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print ("SCORING ORG NAME SEQUENCE UNIQUENESS...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

org_names_cnt = Counter()
person_names_cnt = Counter()

def count_orgnames(df):
    """
    Add the tokens of every org_name value in df to org_names_cnt.

    Missing values are skipped, and a chunk without an org_name column is left
    alone. Nothing is returned; the counter is updated in place.
    """
    if 'org_name' not in df:
        return
    for name in df['org_name'].dropna():
        org_names_cnt.update(tokenize(name))

# Fill org_names_cnt from the processed data: both files when matching two
# files, otherwise the single processed file. No output file is given because
# count_orgnames only updates the counter.
if matching:
    process_chunks(pd.read_csv(ns("processed_l.csv"), chunksize=chunk_size,encoding='utf-8',compression=compression,
                               dtype=object, index_col='id'),
                   count_orgnames)
    process_chunks(pd.read_csv(ns("processed_r.csv"), chunksize=chunk_size,encoding='utf-8',compression=compression,
                               dtype=object, index_col='id'),
                   count_orgnames)
else:
    process_chunks(pd.read_csv(ns("processed.csv"), chunksize=chunk_size,encoding='utf-8',compression=compression,
                               dtype=object, index_col='id'),
                   count_orgnames)
    
def org_name_similarity(left, right):
    """
    Score two organization names by how rare their shared tokens are.

    Each distinct token weighs 1 / sqrt(count), where count is its frequency in
    org_names_cnt, so rare tokens weigh more. The score is the weight of the
    shared tokens divided by the geometric mean of the two names' total
    weights.

    Returns NaN when either name is empty or produces no tokens.
    Raises ValueError when a token's weight cannot be computed, for example
    because the token was never counted in org_names_cnt.
    """
    def org_sequence_uniqueness(seq):
        try:
            return sum(1 / org_names_cnt[t.lower()] ** 0.5 for t in seq)
        except Exception as err:
            # Report the offending tokens only; printing the whole counter
            # floods the output on real data.
            raise ValueError("cannot weigh org name tokens %r "
                             "(token missing from org_names_cnt?)" % sorted(seq, key=str)) from err

    if len(left) == 0 or len(right) == 0:
        return np.nan

    left_toks = set(tokenize(left))
    right_toks = set(tokenize(right))
    # A name that yields no tokens has a total weight of 0, which would make
    # the division below 0 / 0.
    if len(left_toks) == 0 or len(right_toks) == 0:
        return np.nan

    left_uniq = org_sequence_uniqueness(left_toks)
    right_uniq = org_sequence_uniqueness(right_toks)

    return org_sequence_uniqueness(left_toks & right_toks) / (left_uniq * right_uniq) ** 0.5

if not organization_only:
    def count_personnames(df):
        """
        Add the tokens of every full_name value in df to person_names_cnt.

        Missing values are skipped. A chunk without a full_name column is left
        alone, in the same way count_orgnames treats a missing org_name column.
        """
        if 'full_name' in df:
            df['full_name'].dropna().apply(lambda c: person_names_cnt.update(tokenize(c)))

    # Fill person_names_cnt from the processed data: both files when matching
    # two files, otherwise the single processed file.
    if matching:
        process_chunks(pd.read_csv(ns("processed_l.csv"), chunksize=chunk_size,encoding='utf-8',compression=compression,
                                   dtype=object, index_col='id'),
                       count_personnames)
        process_chunks(pd.read_csv(ns("processed_r.csv"), chunksize=chunk_size,encoding='utf-8',compression=compression,
                                   dtype=object, index_col='id'),
                       count_personnames)
    else:
        process_chunks(pd.read_csv(ns("processed.csv"), chunksize=chunk_size,encoding='utf-8',compression=compression,
                                   dtype=object, index_col='id'),
                       count_personnames)

    def person_name_similarity(left, right):
        """
        Score two person names by how rare their shared tokens are.

        Each distinct token weighs 1 / sqrt(count), where count is its
        frequency in person_names_cnt. The score is the weight of the shared
        tokens divided by the geometric mean of the two names' total weights.

        Returns NaN when either name is empty or produces no tokens.
        Raises ValueError when a token's weight cannot be computed, for example
        because the token was never counted in person_names_cnt.
        """
        def person_sequence_uniqueness(seq):
            try:
                return sum(1 / person_names_cnt[t.lower()] ** 0.5 for t in seq)
            except Exception as err:
                # Report the offending tokens only (the previous version
                # printed the whole org name counter here).
                raise ValueError("cannot weigh person name tokens %r "
                                 "(token missing from person_names_cnt?)" % sorted(seq, key=str)) from err

        if len(left) == 0 or len(right) == 0:
            return np.nan

        left_toks = set(tokenize(left))
        right_toks = set(tokenize(right))
        # A name that yields no tokens has a total weight of 0, which would
        # make the division below 0 / 0.
        if len(left_toks) == 0 or len(right_toks) == 0:
            return np.nan

        left_uniq = person_sequence_uniqueness(left_toks)
        right_uniq = person_sequence_uniqueness(right_toks)

        return person_sequence_uniqueness(left_toks & right_toks) / (left_uniq * right_uniq) ** 0.5

def calculate_unique(df):
    """
    Add the name-uniqueness columns to a chunk of scored candidates.

    org_uniq is added when the chunk has an org_name_l column. person_uniq is
    added when organization_only is falsy and the chunk has a full_name_l
    column. A column whose input is absent is not created at all. The frame is
    modified in place and also returned.
    """
    if 'org_name_l' in df:
        def org_row_score(row):
            return org_name_similarity(*get_field(row, 'org_name'))
        df['org_uniq'] = df.apply(org_row_score, axis="columns")

    if organization_only or 'full_name_l' not in df:
        return df

    def person_row_score(row):
        return person_name_similarity(*get_field(row, 'full_name'))
    df['person_uniq'] = df.apply(person_row_score, axis="columns")
    return df

#print("Expected chunks:")
#print(int(pd.read_csv(ns('scored-1.csv')).shape[0] / chunk_size ))
#print('')
# Read the reduced candidates (scored-1_reduced.csv) in chunks of chunk_size
# rows, pass each chunk to calculate_unique and write the output to
# scored-2.csv.
process_chunks(pd.read_csv(ns("scored-1_reduced.csv"),encoding='utf-8',compression=compression,
                           chunksize=chunk_size, dtype=object),
               calculate_unique,
               ns("scored-2.csv"),
               index=False)

# Elapsed time is measured from the start_time set at the top of this cell.
print("name uniqueness scored --- %s seconds ---" % round(time.time() - start_time, 2))

In [None]:
def word_match(left, right):
    """
    Jaro score of two strings, compared as they are.

    Same as name_match but spaces and special characters are kept. Returns NaN
    when either string is empty.
    """
    if len(left) == 0 or len(right) == 0:
        return np.nan
    return score_compare(left, right, 'jaro')
    
def multi_word_match(left, right):
    """
    Best Jaro score between any left value and any right value.

    Used for fields such as cities or emails where each record can have many
    values. Empty values are skipped. Returns NaN when there is no pair of
    non-empty values to compare.
    """
    pair_scores = [score_compare(word_l, word_r, 'jaro')
                   for word_l in left
                   for word_r in right
                   if len(word_l) > 0 and len(word_r) > 0]
    if len(pair_scores) == 0:
        return np.nan
    return max(pair_scores)
    
def initial_match(left, right):
    """
    Compare the first characters of two values, ignoring case.

    Returns 1 when they are equal, 0 when they differ, and NaN when either
    value is empty.
    """
    if len(left) == 0 or len(right) == 0:
        return np.nan
    left_initial = str(left[0]).lower()
    right_initial = str(right[0]).lower()
    return 1 if left_initial == right_initial else 0
    
def _lcs_length(a, b):
    """Return the length of the longest common subsequence of a and b."""
    previous = [0] * (len(b) + 1)
    for item_a in a:
        current = [0]
        for j, item_b in enumerate(b, start=1):
            if item_a == item_b:
                current.append(previous[j - 1] + 1)
            else:
                current.append(max(previous[j], current[j - 1]))
        previous = current
    return previous[-1]


def _best_pair_flag(left, right, usable, is_match):
    """
    Compare every usable left value with every usable right value.

    Returns 1 if any pair matches, 0 if there are usable pairs but none match,
    and NaN if there is no usable pair.
    """
    flags = [1 if is_match(value_l, value_r) else 0
             for value_l in left
             for value_r in right
             if usable(value_l) and usable(value_r)]
    return max(flags) if len(flags) > 0 else np.nan


def match_fields(candidata):
    """
    Add the field-by-field comparison columns to a chunk of candidates.

    Each column is only added when its '_l' input column is present:
        state_match          1/0 if any pair of state codes (2+ chars) is equal
        f_initial_match      first initials of fname        (people only)
        nick_matches         shared nickname group          (people only)
        m_initial_match      first initials of mname        (people only)
        city_match           best Jaro score between the cities
        zip_match            1/0 if any pair of zips shares its first 5 chars
        before_domain_match  best Jaro score of email local parts (people only)
        domain_match         1/0 if any email/web domain (5+ chars) is equal
        phone_match          1/0 if any pair of phones (10+ chars) shares a
                             common subsequence of at least 10 characters
    "People only" means the global organization_only is falsy. Every column is
    NaN for rows where nothing usable could be compared. The frame is modified
    in place and also returned.
    """
    if 'state_a_l' in candidata:
        start_time = time.time()
        print ("\tCHECKING FOR STATE CODE MATCHES...")

        def state_match(left, right):
            return _best_pair_flag(left, right,
                                   lambda state: len(state) >= 2,
                                   lambda state_l, state_r: state_l == state_r)

        candidata['state_match'] = candidata.apply(lambda row:
            state_match(*get_multi_field(row, 'state')),
            axis="columns")

        print("\tstate codes checked --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'fname_l' in candidata and not organization_only:
        start_time = time.time()
        print ("\tCHECKING FOR FIRST NAME MATCHES...")

        #check if first initials match
        candidata['f_initial_match'] = candidata.apply(lambda row:
            initial_match(*get_field(row, 'fname')),
            axis='columns')

        print("\tfname values compared --- %s seconds ---" % round(time.time() - start_time, 2))

        # If l_fname is present then the nick group should be too
        start_time = time.time()
        print("\tCHECKING FOR NICKNAME GROUP MATCHES...")

        def nick_match(left, right):
            # 0/1 flag: do the two whitespace-separated lists of nickname
            # groups share at least one group? Values of 4 characters or fewer
            # are treated as missing.
            if len(left) > 4 and len(right) > 4:
                shared = set(left.split()) & set(right.split())
                return 1 if len(shared) > 0 else 0
            return np.nan

        candidata['nick_matches'] = candidata.apply(lambda row:
            nick_match(*get_field(row, 'nicks_groups')),
            axis='columns')

        print('\tnickname group matches compared --- %s seconds ---' % round(time.time() - start_time, 2))

    if 'mname_l' in candidata and not organization_only:
        start_time = time.time()
        print ("\tCHECKING FOR MIDDLE INITIAL MATCHES...")

        candidata['m_initial_match'] = candidata.apply(lambda row:
            initial_match(*get_field(row, 'mname')),
            axis='columns')
        print("\tmname values compared --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'city_a_l' in candidata:
        start_time = time.time()
        print ("\tCHECKING FOR CITY MATCHES...")

        candidata['city_match'] = candidata.apply(lambda row:
            multi_word_match(*get_multi_field(row, 'city')),
            axis="columns")
        print("\tcity values compared --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'zip_a_l' in candidata:
        start_time = time.time()
        print ("\tCHECKING FOR POSTAL CODE MATCHES...")

        def postal_similarity(left, right):
            # if the number is too short, means it's fubar
            # Two 5-character prefixes have a common subsequence of length 5
            # exactly when they are equal, so compare them directly.
            return _best_pair_flag(left, right,
                                   lambda code: len(code) >= 5,
                                   lambda zip_l, zip_r: zip_l[:5] == zip_r[:5])

        candidata['zip_match'] = candidata.apply(lambda row:
            postal_similarity(*get_multi_field(row, 'zip')),
            axis="columns")

        print("\tpostal codes checked --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'before_domain_a_l' in candidata and not organization_only:
        start_time = time.time()
        print ("\tCHECKING FOR BEFORE EMAIL DOMAIN MATCHES...")

        candidata['before_domain_match'] = candidata.apply(lambda row:
            multi_word_match(*get_multi_field(row, 'before_domain')),
            axis="columns")

        print("\temail before domains checked --- %s seconds ---" % round(time.time() - start_time, 2))

    if ('email_domain_a_l' in candidata) or ('web_domain_a_l' in candidata):
        start_time = time.time()
        print ("\tCHECKING FOR WEB DOMAIN MATCHES...")

        def domain_match(left, right):
            return _best_pair_flag(left, right,
                                   lambda domain: len(domain) > 4,
                                   lambda domain_l, domain_r: domain_l == domain_r)

        candidata['domain_match'] = candidata.apply(lambda row:
            domain_match(*get_domain_fields(row, 'email_domain', 'web_domain')),
            axis="columns")

        print("\tweb domains checked --- %s seconds ---" % round(time.time() - start_time, 2))

    if 'clean_phone_a_l' in candidata:
        start_time = time.time()
        print ("\tCHECKING FOR PHONE MATCHES...")

        #scoring match candidates based on matching phone
        def phone_simularity(left, right):
            # Two phones match when their longest common subsequence has at
            # least 10 characters. The length is computed directly instead of
            # enumerating every common subsequence.
            return _best_pair_flag(left, right,
                                   lambda phone: len(phone) > 9,
                                   lambda phone_l, phone_r: _lcs_length(phone_l, phone_r) >= 10)

        candidata['phone_match'] = candidata.apply(lambda row:
            phone_simularity(*get_multi_field(row, 'clean_phone')),
            axis="columns")

        print("\tphones checked --- %s seconds ---" % round(time.time() - start_time, 2))

    return candidata

# Debugging aid: uncomment to print the number of chunks to expect.
#print("Expected chunks:")
#print(int(pd.read_csv(ns('scored-2.csv')).shape[0] / chunk_size ))
#print('')

# Stream scored-2.csv through match_fields one chunk at a time and write the
# result to scored-3.csv.
process_chunks(
    pd.read_csv(
        ns("scored-2.csv"),
        chunksize=chunk_size,
        dtype=object,
        encoding='utf-8',
        compression=compression,
    ),
    match_fields,
    ns("scored-3.csv"),
    index=False,
)

In [None]:
# Register the nicks_groups column in the column dictionaries so that it can
# be renamed properly at the end. The alternate dictionary only takes part
# when we are matching.
column_maps = [cols]
if matching:
    column_maps.append(cols_alt)

for column_map in column_maps:
    lacks_nicks = column_map['nicks_groups'] is None
    has_fname = column_map['fname'] is not None
    if lacks_nicks and has_fname:
        column_map['nicks_groups'] = 'nicks_groups'

In [None]:
def impute_chunks(chunks, chunk_func, outfile=None, silent=False, **kwargs):
    """
    Apply chunk_func to every chunk and optionally write the results to a CSV.

    Same idea as process_chunks, except that each chunk is handed to
    chunk_func untouched (no fillna('') beforehand), so missing values are
    still NaN when chunk_func sees them.

    Parameters:
        chunks: iterable of chunks, e.g. pd.read_csv(..., chunksize=...).
        chunk_func: callable invoked once per chunk. When outfile is given,
            its return value must provide a to_csv() method.
        outfile: optional output path. The first chunk is written with
            mode='w' and a header; later chunks are appended without one.
        silent: when True, no progress or timing messages are printed.
        **kwargs: extra keyword arguments forwarded to to_csv(); they
            override the defaults (mode, header, compression, encoding).

    Returns:
        None. Results are only available through outfile.
    """
    start_time = time.time()
    needs_init = True
    for count, chunk in enumerate(chunks):
        local_time = time.time()

        processed = chunk_func(chunk)

        if outfile:
            # mode=w overwrites the file, which we want on initial write
            # mode=a appends to the end of file, which we want as we process more chunks
            # header=True puts a header on the top, which we want on initial write
            # header=False omits the header, which we want as we add more chunks
            default_args = {
                'mode': 'w' if needs_init else 'a',
                'header': needs_init,
                'compression': compression,
                'encoding': 'utf-8',
            }
            # Caller-supplied keyword arguments take precedence over the defaults.
            processed.to_csv(outfile, **dict(default_args, **kwargs))
            needs_init = False

        if not silent:
            print("Processed chunk", count,
                  round(time.time() - local_time, 2), "seconds")
    if not silent:
        print("Finished in", round(time.time() - start_time, 2), "seconds")

In [None]:
#use labeled matches to train logistic regression to predict matches
pers_train = pd.read_csv('data/person_training_records_w_imputed_features.csv').dropna(how='all')
org_train = pd.read_csv('data/org_training_records_w_imputed_features.csv').dropna(how='all')

# Features considered by the organization model
org_features = [
    'jaro_score',
    'fuzz_partial_score',
    'fuzz_sort_score',
    'fuzz_set_score',
    'org_uniq',
    'city_match',
    'state_match',
    'zip_match',
    'domain_match',
]
if organization_only:
    org_features.append('phone_match') #include phone match for org matching

# Features considered by the person model
pers_features = [
    'fname_match',
    'f_initial_match',
    'm_initial_match',
    'lname_match',
    'person_uniq',
    'suffix_match',
    'nick_matches',
    'phone_match',
    'before_domain_match',
]

# Get the columns that are features and are also present in our dataset right now
present_columns = pd.read_csv(ns("scored-3.csv"), nrows=1,encoding='utf-8',compression=compression).columns

# Get all the org features that are present
cand_org_features = present_columns[present_columns.isin(org_features)]
# Do the same with the contact features IF there are contact features
cand_pers_features = present_columns[present_columns.isin(pers_features)]


def _imputation_value(train, feature):
    """
    Pick the value used to fill NULLs of `feature`, based on training data.

    Returns the most common value among the training entries that are
    neither 0 nor 1 (mode() ignores NaN). When there is no such entry the
    mean of the whole training column is returned instead.
    """
    column = train[feature]
    non_binary = column[(column != 1) & (column != 0)]
    modes = non_binary.mode()
    # An empty mode() result is the expected "only 0/1 values" case; checking
    # for it explicitly avoids hiding unrelated errors behind a bare except.
    if len(modes) > 0:
        return modes.iloc[0]
    return column.mean()


# Person features first, then organization features, so that a feature
# present in both lists ends up with the organization value.
feature_avg_dict = {}
for train, features in ((pers_train, cand_pers_features), (org_train, cand_org_features)):
    for feature in features:
        start_time = time.time()
        feature_avg_dict[feature] = _imputation_value(train, feature)
        print(feature + " avg value added to dictionary --- %s seconds ---" % round(time.time() - start_time, 2))

#creating list of all features used in ML for imputation
all_features = list(cand_org_features) + list(cand_pers_features)
    
'''#OLD WAY.  This code imputes based on the average value of assessment data (rather than of the training data)

#imputing np.nan values in ML features.  we need to do this because we want the model to treat differently
#cases where two records have a data type present and it DOES NOT MATCH and where one record is simply MISSING that data

#creating a dictionary we will use to determine the value to impute into feature NULLs
#####Arguably I should be using the avgs from the training set?  
#but really I think what I should try is creating a series of models to predict null values for each feature 
#based on training data
feature_avg_dict= {}
for feature in all_features:
    start_time = time.time()
    #should consider deleting this after using because of memory usage
    feature_df = pd.read_csv(ns("scored-3.csv"),usecols=[feature],encoding='utf-8',compression=compression)
    avg = feature_df[feature_df[feature].notnull()].mean()[0]
    feature_avg_dict.update({feature:avg})
    print(feature + " avg value added to dictionary --- %s seconds ---" % round(time.time() - start_time, 2))
'''
# NULL feature values are replaced with the per-feature values stored in
# feature_avg_dict, which is built from the training data.
def impute_nulls(candidata):
    """
    Fill the NULLs of every ML feature column in a chunk.

    Each column named in all_features has its missing values replaced by
    the matching entry of feature_avg_dict. The chunk is modified in place
    and also returned. A timing line is printed per feature.
    """
    for feature in all_features:
        began = time.time()
        replacement = feature_avg_dict[feature]
        candidata[feature] = candidata[feature].fillna(replacement)
        print(feature + " imputed --- %s seconds ---" % round(time.time() - began, 2))

    return candidata

# Impute the NULL feature values of scored-3.csv chunk by chunk and write the
# result to scored-4.csv.
impute_chunks(
    chunks=pd.read_csv(
        ns("scored-3.csv"),
        chunksize=chunk_size,
        encoding='utf-8',
        compression=compression,
        dtype=object,
    ),
    chunk_func=impute_nulls,
    outfile=ns("scored-4.csv"),
    index=False,
)

In [None]:
# Notebook display: show the per-feature value used to fill NULLs.
feature_avg_dict

# Prediction

Use our training data to whip up some ML models and use them to predict our results. We generate one model based on organization deduplication training data, and one based on contact training data.

For contact matching/deduping, we evaluate with both models then create a composite score. For organization matching/deduping, we just use the organization model.

In [None]:
start_time = time.time()
print ("COMPOSITE SCORING, PREDICTING MATCHES...") #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# Organization model: trained on the organization training set, using only
# the organization features that are present in the data being scored.
org_cls = LogisticRegression().fit(org_train[cand_org_features], org_train['is_match'])

# Person model: trained on the person training set. Only needed when the run
# is not restricted to organizations.
if not organization_only:
    pers_cls = LogisticRegression().fit(pers_train[cand_pers_features], pers_train['is_match'])

"""
# Unused subpar ML models preserved for posterity
all_cls = RandomForestClassifier()      # Model both data together from person dataset
all_org_cls = RandomForestClassifier()  # Model organization data from both datasets
all_cls.fit(pers_train[list(cand_pers_features) + list(cand_org_features)], pers_train['is_match'])
all_org_cls.fit(pd.concat([org_train[cand_org_features], pers_train[cand_org_features]]),
                pd.concat([org_train['is_match'], pers_train['is_match']]))
"""

def predict(candidata, method="split-source"):
    """
    Add match predictions to a chunk of candidate pairs.

    With method "split-source" (the only supported one) the organization
    model always contributes org_match_pred and org_pred_proba. In contact
    runs the person model adds pers_match_pred and pers_pred_proba, the
    composite probability weights the person probability twice as much as
    the organization probability, and pred is the composite compared against
    match_threshold. In organization-only runs the composite probability and
    pred are taken straight from the organization model.

    Returns the same chunk with the new columns added.
    Raises ValueError for any other method.
    """
    if method != "split-source":
        raise ValueError("Prediction method \"" + method + "\" not recognized.")

    # NOTE: org details from the person training data are deliberately not
    # used for the org model: those labels say whether the *person* matches,
    # not the org, so they could mislead the model.
    org_inputs = candidata[cand_org_features]
    candidata['org_match_pred'] = org_cls.predict(org_inputs)
    candidata['org_pred_proba'] = org_cls.predict_proba(org_inputs)[:, 1]

    if organization_only:
        # Org mode run: the organization predictions are the final answer
        candidata['composite_match_proba'] = candidata['org_pred_proba']
        candidata['pred'] = candidata['org_match_pred']
        return candidata

    # Contact mode run: add person predictions, then build the composite score
    pers_inputs = candidata[cand_pers_features]
    candidata['pers_match_pred'] = pers_cls.predict(pers_inputs)
    candidata['pers_pred_proba'] = pers_cls.predict_proba(pers_inputs)[:, 1]

    weighted_sum = candidata['pers_pred_proba'] * 2 + candidata['org_pred_proba']
    candidata['composite_match_proba'] = weighted_sum / 3
    candidata['pred'] = candidata['composite_match_proba'] >= match_threshold

    return candidata

#print("Expected chunks:")
#print(int(pd.read_csv(ns('scored-4.csv')).shape[0] / chunk_size ))
#print('')

# Only the model features and the pair identifiers are read back in.
# Features are parsed as floats; identifiers stay as strings.
_predict_features = list(cand_pers_features) + list(cand_org_features)
_predict_ids = ['id_l', 'id_r', 'pair']
_predict_dtypes = {**dict.fromkeys(_predict_features, float),
                   **dict.fromkeys(_predict_ids, object)}

process_chunks(pd.read_csv(ns("scored-4.csv"), chunksize=chunk_size, index_col='pair',encoding='utf-8',compression=compression,
                           usecols=_predict_features + _predict_ids,
                           dtype=_predict_dtypes),
               predict,
               ns("matched.csv")) #file containing features for each match pair, whether or not it is a predicted match

# This step scores every candidate pair; isolating the final matches is a
# separate step, so the message names the work actually done here.
print("match predictions scored --- %s seconds ---" % round(time.time() - start_time, 2))

In [None]:
print("ISOLATING MATCHES...")
start_time = time.time()

# matched.csv holds every candidate pair. Keep only the rows that were
# predicted to be matches and write them to matches.csv (one row of features
# per predicted match pair).
process_chunks(
    pd.read_csv(
        ns("matched.csv"),
        chunksize=chunk_size * 10,
        encoding='utf-8',
        compression=compression,
    ),
    lambda chunk: chunk.loc[chunk['pred'] == 1],
    ns("matches.csv"),
    index=False,
)

print("final matches isolated --- %s seconds ---" % round(time.time() - start_time, 2))

### Grouping and Queening

In cases of deduplication, we designate groups of rows that are all considered duplicates of each other. Each group also gets a "queen" that contains the most authoritative information.

More information about how the queen is determined and how groups are chosen is given in the comments of their respective functions.

In [None]:
import sys
# Raise the interpreter's recursion limit to 5000. Presumably needed because
# the grouping step uses a recursive helper (cascade_conversion) that calls
# itself once per converted neighbor -- confirm before lowering this value.
sys.setrecursionlimit(5000) 

In [None]:
start_time = time.time()
print("GROUPING DUPLICATE RECORDS TOGETHER...")

# Set up the graph:
# Each edge represents a match between two records, and has the match_probability as a weight
# Each node represents a record, and has a "group_id" value that indicates what group it is in
# A group_id of 0 will indicate no group.
# Each node also has a "connection" value that indicates how connected it is to its group

# Only the columns needed to build the graph are read from matches.csv
edges = pd.read_csv(ns("matches.csv"),usecols=['pair', 'id_l', 'id_r', 'composite_match_proba'],encoding='utf-8',compression=compression,
    dtype={'id_l': object, 'id_r': object, 'composite_match_proba': float})

# Gather up the IDs and the queen_score for each row.
if matching:
    # Here, for matches, we potentially have duplicate IDs across the main
    # and alt dataset, so we tag each row with a 'source' field of 'main'
    # or 'alt', then give all the rows combined a new index
    # (generated by using pd.concat(..., ignore_index=True)).
    data_left = pd.read_csv(ns("processed_l.csv"),encoding='utf-8',compression=compression,
                   dtype={'id': object, 'queen_score': float},
                   usecols=['id', 'queen_score']) \
         .assign(source="main")
    data_right = pd.read_csv(ns("processed_r.csv"),encoding='utf-8',compression=compression,
                   dtype={'id': object, 'queen_score': float},
                   usecols=['id', 'queen_score']) \
         .assign(source="alt")
    data = pd.concat([data_left, data_right], ignore_index=True)


    # Finally, we modify edges' 'id_l' and 'id_r' to reflect the new universal IDs
    # We create series that's just the universal index, with an index of the original
    # IDs. This way, if we call .to_dict() and create a dictionary of index->value, we
    # get a dictionary of original ID to universal ID.
    mainID_maps = pd.Series(data.index[data['source'] == 'main'],
                            index=data[data['source'] == 'main']['id'])
    altID_maps = pd.Series(data.index[data['source'] == 'alt'],
                           index=data[data['source'] == 'alt']['id'])

    edges['id_l'] = edges['id_l'].map(mainID_maps.to_dict())
    edges['id_r'] = edges['id_r'].map(altID_maps.to_dict())
else:
    data = pd.read_csv(ns("processed.csv"),encoding='utf-8',compression=compression,
                       dtype={'id': object, 'queen_score': float},
                       usecols=['id', 'queen_score']) \
             .set_index('id')

G = nx.convert_matrix.from_pandas_edgelist(edges, 'id_l', 'id_r', ['pair','composite_match_proba'])
nx.set_node_attributes(G, name="queen", values=False)
nx.set_node_attributes(G, name="group_id", values=0)
nx.set_node_attributes(G, name="connection", values=0)
nx.set_node_attributes(G, name="queen_score", values=data["queen_score"].to_dict())
nx.set_node_attributes(G, name="pair", values=0)

###########################################
##########  Graph Basics for Us  ##########
###########################################
# Our graph has nodes and edges
# Each node is the ID of a record
# Each edge is a match between two IDs
#
# Each node has a queen attribute indicating whether it's the queen of the group
# Each node has a group_id attribute indicating which group it's in, or 0 if none
# Each node has a connection attribute indicating its distance from the queen of
#     the group. This is calculated by the match probability times other match probabilities
#     on the path to the queen through the graph
# Each node has a queen_score attribute for reference indicating how much data it has. The
#     queen should have the highest queen_score in its group
# Each edge has a composite_match_proba attribute indicating how likely it is the two nodes
#     it connects are a match.
#
# x and y are node IDs, e.g. "11846309"
# G[x] gets us all nodes connected to x as a list of IDs
# G[x][y] gets us the edge connecting x and y in the form of a dictionary of edge attributes
# G.nodes[x] gets us the node x in the form of a dictionary of node attributes


# Start grouping
# Create a queen_score for each edge (aka the max queen score of the two nodes)
edges = edges.merge(data["queen_score"], how="left", left_on="id_l", right_index=True)
edges = edges.merge(data["queen_score"], how="left", left_on="id_r", right_index=True, suffixes=['_l', '_r'])
# Vectorized row-wise maximum: avoids a Python-level call per edge, and a
# missing score on one side no longer depends on which side it is (NaN is
# skipped, so the other side's score is used).
edges["queen_score"] = edges[["queen_score_l", "queen_score_r"]].max(axis="columns")
# Sort primarily by descending match, then by descending queen score
edges = edges.sort_values(["composite_match_proba", "queen_score"], ascending=[False, False])
# Group IDs are handed out sequentially, starting at 1 (0 means "no group").
# last_group always holds the next ID that will be handed out.
last_group = 1
def new_group():
    """
    Return a fresh group ID.

    Advances the module-level counter so that the following call of
    new_group() returns the next ID.
    """
    global last_group
    last_group += 1
    return last_group - 1

# NOTE(review): total_rows, current and local_time are assigned but never
# read inside the loop below -- presumably leftovers from progress reporting.
total_rows = len(edges)  # Total rows to track percentage complete

# A basic overview:
# We go through all the matches (aka all the edges), starting with those
# with the highest queen_score and highest probability of match. That's
# because these will almost definitely be the matches that contain the queen
# and, essentially, they're easy pickings.
# As we go on down the list, we let the group membership "infect" other nodes
# as long as they're within the group_threshold.
#
# group_threshold score is calculated as the product of the match probabilities
# connecting any node in a group to that group's queen.
#
# The edges were sorted beforehand by descending composite_match_proba and
# then by descending queen_score, so the outcome depends on that order.
for current, row in enumerate(edges.itertuples(), 1):
    local_time = time.time()

    # Shortcut variables so I don't have to type so much:
    # left is a dictionary of node attributes of the node id_l
    # right is a dictionary of node attributes of the node id_r
    # edge is the composite_match_proba attribute of the shared edge (float)
    # pair is the pair attribute of the shared edge
    left = G.nodes[row.id_l]
    right = G.nodes[row.id_r]
    edge = G[row.id_l][row.id_r]['composite_match_proba']
    pair = G[row.id_l][row.id_r]['pair']

    # There are four possibilities for each edge
    # 1) Neither node has a group
    if left['group_id'] == 0 and right['group_id'] == 0:
        if edge < group_threshold:
            # This shouldn't happen as long as the
            # match threshold >= group threshold
            continue

        # First assign them their own group
        new_group_id = new_group()
        left['group_id'] = new_group_id
        right['group_id'] = new_group_id
        # Then pick who will be the de facto queen: the node with the strictly
        # higher queen_score. On a tie the right node becomes queen.
        # The queen gets connection 1; the other node gets the edge probability.
        # Both nodes record this pair.
        if (left['queen_score'] > right['queen_score']):
            left['connection'] = 1
            left['queen'] = True
            right['connection'] = edge
            right['pair'] = pair
            left['pair'] = pair
        else:
            right['connection'] = 1
            right['queen'] = True
            left['connection'] = edge
            right['pair'] = pair
            left['pair'] = pair

    # 2) Both nodes have the same group
    elif left['group_id'] != 0 and left['group_id'] == right['group_id']:
        # For both nodes, see if they would get a higher connection if
        # they connected to the queen through each other instead of
        # however they got connected in the first place.
        # This is unlikely to happen in practice.
        # Only one of the two nodes is updated per edge (if / elif); the
        # updated node also records this pair.
        if right['connection'] < left['connection'] * edge:
            right['connection'] = left['connection'] * edge
            right['pair'] = pair
            #left['pair'] = pair
        elif left['connection'] < right['connection'] * edge:
            left['connection'] = right['connection'] * edge
            left['pair'] = pair
            #right['pair'] = pair

    # 3) Only one node has a group
    elif left['group_id'] != 0 and right['group_id'] == 0:
        # Check if the ungrouped node could
        # connect to the grouped node
        if left['connection'] * edge >= group_threshold:
            right['group_id'] = left['group_id']
            right['connection'] = left['connection'] * edge
            right['pair'] = pair
            #left['pair'] = pair
    elif right['group_id'] != 0 and left['group_id'] == 0:
        # Check if the ungrouped node could
        # connect to the grouped node
        if right['connection'] * edge >= group_threshold:
            left['group_id'] = right['group_id']
            left['connection'] = right['connection'] * edge
            left['pair'] = pair
            #right['pair'] = pair

    # 4) The two nodes are in different groups
    #
    # When a node switches groups, we should check its old groupmates
    # and see if they also want to switch groups. Practically, this won't
    # happen because how quickly the connection to the queen degrades. But
    # this will be a recursive function, checking neighbors and neighbors of
    # neighbors.
    #
    # Note that switching groups changes only group_id and connection: the
    # node's queen flag and pair are left as they were.
    elif (left['group_id'] != 0 and right['group_id'] != 0 and
          left['group_id'] != right['group_id']):
        # The helper is (re)defined every time this branch is reached.
        def cascade_conversion(converted_id):
            """
            Checks to see if the neighbors of converted_id would get better
            connections if they also converted to the same group as converted_id.

            A neighbor in a different group is converted when its current
            connection is lower than the converted node's connection times
            the shared edge probability times group_merge_limiter. Converted
            neighbors are then checked recursively in the same way.
            """
            converted = G.nodes[converted_id]
            for neighbor_id in G[converted_id]:
                neighbor = G.nodes[neighbor_id]
                # Local to this function; does not touch the loop's `edge`
                edge = G[converted_id][neighbor_id]["composite_match_proba"]
                if neighbor['group_id'] == converted['group_id']:
                    continue
                if neighbor['connection'] < converted['connection'] * edge * group_merge_limiter: #discourages/encourages groups from merging
                    neighbor['group_id'] = converted['group_id']
                    # The limiter only affects the decision, not the stored connection
                    neighbor['connection'] = converted['connection'] * edge
                    #neighbor['pair'] = pair
                    #converted['pair'] = pair
                    cascade_conversion(neighbor_id)

        # Check if each node would have a better connection with
        # the other node's group, and change accordingly
        # (at most one of the two nodes switches per edge)
        if right['connection'] < left['connection'] * edge * group_merge_limiter: #discourages/encourages groups from merging
            right['group_id'] = left['group_id']
            right['connection'] = left['connection'] * edge
            #right['pair'] = pair
            #left['pair'] = pair
            cascade_conversion(row.id_r)
        elif left['connection'] < right['connection'] * edge * group_merge_limiter: #discourages/encourages groups from merging
            left['group_id'] = right['group_id']
            left['connection'] = right['connection'] * edge  
            #left['pair'] = pair
            #right['pair'] = pair
            cascade_conversion(row.id_l)
    else:
        # This... shouldn't ever ever happen
        # Pretty sure the four cases above cover all possible
        # outcomes. But if we do come across something, raise
        # an error and print out some details about where it happened.
        details = repr(G.nodes[row.id_l]) + \
                  repr(G.nodes[row.id_r]) + \
                  repr(G[row.id_l][row.id_r])
        raise ValueError("Didn't expect an excerpt " + details)

# Turn the per-node attribute dictionaries of the graph back into a dataframe
# with one row per node, indexed by node ID.
node_ids = list(G.nodes)
groups = pd.DataFrame.from_records([G.nodes[node_id] for node_id in node_ids], index=node_ids)
groups = groups.rename(columns={'connection': 'queen_proba'})

# Attach the grouping results to the full processed records
group_columns = ['pair', 'group_id', 'queen_proba', 'queen']
if matching:
    data_left = pd.read_csv(ns("processed_l.csv"), encoding='utf-8',compression=compression,dtype=object) \
         .assign(source="main")
    data_right = pd.read_csv(ns("processed_r.csv"), encoding='utf-8',compression=compression,dtype=object) \
         .assign(source="alt")
    data = pd.concat([data_left, data_right], ignore_index=True)
    grouped = data.join(groups[group_columns]).set_index(['source', 'id'])
else:
    data = pd.read_csv(ns("processed.csv"), encoding='utf-8',compression=compression,dtype=object).set_index('id')
    grouped = data.join(groups[group_columns])

# Within each group, the rows best connected to the queen come first
grouped = grouped.sort_values(['group_id', 'queen_proba', 'queen_score'], ascending=[True, False, False])
grouped.to_csv(ns("grouped.csv"))

print("Grouped all matches --", time.time() - start_time, "seconds")
grouped[grouped.group_id != 0].head(15)

In [None]:
print("CREATE SENSIBLE DEDUP SCORE...")
start_time = time.time()

grouped = pd.read_csv(ns("grouped.csv"))
# Initialize dedup_score with queen_proba
grouped['dedup_score'] = grouped['queen_proba']

#handle for scenario where group_merge_limiter > 1 which can create multiple queens for same group
if group_merge_limiter >1:
    for g in grouped.group_id.drop_duplicates():
        # Rows of this group, in the current (sorted) row order
        members = grouped[grouped.group_id == g]
        if (members.queen == True).sum() > 1:
            # Keep the first row as the true queen; no other row of the
            # group may stay flagged as queen.
            grouped.loc[members.index[1:], 'queen'] = False
        if len(members) == 1: #this should really be fixed during clustering step, but I am being lazy
            # A group with a single member is not a group at all
            grouped.loc[members.index, 'group_id'] = 0
            grouped.loc[members.index, 'queen'] = False
    grouped = grouped.sort_values(['group_id', 'queen_proba', 'queen_score'], ascending=[True, False, False])

# Take all non-queens in valid groups (all group_ids > 0), find max score by group
queens_score = grouped[(grouped['queen'] == False) & (grouped['group_id'] != 0)] \
    .groupby('group_id')['queen_proba'].max()
# Give every queen the best score of its own group, looked up by group_id.
# Looking up by group_id (instead of relying on row order and on there being
# exactly one queen per scored group) keeps the scores aligned; a queen whose
# group has no non-queen members gets NaN rather than causing a length
# mismatch or picking up another group's score.
is_queen = grouped['queen'] == True
grouped.loc[is_queen, 'dedup_score'] = grouped.loc[is_queen, 'group_id'].map(queens_score)

grouped['id'] = grouped['id'].astype(str)

print("Dedup scores generated --- %s seconds ---" % round(time.time() - start_time, 2))

In [None]:
#thoughts about entity resolver:
# isolate queens, compare all queens to each other.  If match > merge_variable (suggest .9) then merge corresponding groups
#### Actually, you can't compare all queens to each other.  Imagine there are 20000 groups, that is too many comparisons
#so that means I probably need to redo the full process? OR I reuse ungrouped matches from scored files
#but then the downside is I would be missing the exclusions.  I need to test on a real dataset to see how well it works.
# This comparison will work around exclusions (eg. gender mis-match) so these need to be added to features for reference...
# ^^not true if I am using only already vetted matches from the scored-4 file.  That is probably the simplest path forward
#do this in waves, beginning one at a time working through all groups.
#use some dictionary to convert old queen members to the new group ID
#stop waves when 0 groups are matched at or above merge_variable

In [None]:
# Notebook display: show the current `grouped` DataFrame for inspection
grouped

In [None]:
if entity_ID is True:
    # Entity resolution: repeat the grouping step several times, each time feeding
    # in only the queens produced by the previous pass. Every pass adds its own
    # group ID / queen columns, so groups from the original pass that really
    # belong together can be merged, and the data owner can review the passes
    # and choose the level of grouping granularity that suits them.
    max_passes = 5
    grouped = grouped.set_index('id')
    matched = pd.read_csv(
        ns("matched.csv"),
        usecols=['pair', 'id_l', 'id_r', 'composite_match_proba'],
        encoding='utf-8',
        compression=compression,
        dtype={'id_l': object, 'id_r': object, 'composite_match_proba': float},
    )

    # Names of the columns written by the most recent pass. They start as the
    # unsuffixed columns and gain a " <pass number>" suffix on each new pass.
    queen_col, group_col = 'queen', 'group_id'
    queen_proba_col, dedup_score_col = 'queen_proba', 'dedup_score'
    pair_col = 'pair'

    rounds = []
    for p in range(max_passes):
        start_time = time.time()

        # The graph built for this pass:
        #   - each edge is a match between two records, weighted by the match probability
        #   - each node is a record carrying a group ID (0 means "no group")
        #   - each node also carries a "connection" value saying how strongly it is
        #     tied to its group

        # Keep only the matches whose two sides were both queens in the previous pass
        queen_ids = grouped[grouped[queen_col] == True].index
        both_queens = matched.id_l.isin(queen_ids) & matched.id_r.isin(queen_ids)
        edges = matched[both_queens]

        if edges.shape[0] > 0:
            print("GROUPING DUPLICATE GROUPS TOGETHER...{}".format(group_col))
            # Switch every column name over to this pass (the first re-grouping is pass 2)
            pass_suffix = ' ' + str(p + 2)
            queen_col = queen_col.split()[0] + pass_suffix
            group_col = group_col.split()[0] + pass_suffix
            queen_proba_col = queen_proba_col.split()[0] + pass_suffix
            dedup_score_col = dedup_score_col.split()[0] + pass_suffix
            pair_col = pair_col.split()[0] + pass_suffix

            # Remembered so the dedup scores and groups can be fleshed out after all passes
            rounds.append(group_col)

            '''#not sure this section is needed?

            # Gather up the IDs and the queen_score for each row.
            if matching:
                # Here, for matches, we potentially have duplicate IDs across the main
                # and alt dataset, so we tag each row with a 'source' field of 'main'
                # or 'alt', then give all the rows combined a new index
                # (generated by using pd.concat(..., ignore_index=True)).
                data_left = pd.read_csv(ns("processed_l.csv"),encoding='utf-8',compression=compression,
                               dtype={'id': object, 'queen_score': float},
                               usecols=['id', 'queen_score']) \
                     .assign(source="main")
                data_right = pd.read_csv(ns("processed_r.csv"),encoding='utf-8',compression=compression,
                               dtype={'id': object, 'queen_score': float},
                               usecols=['id', 'queen_score']) \
                     .assign(source="alt")
                data = pd.concat([data_left, data_right], ignore_index=True)


                # Finally, we modify edges' 'id_l' and 'id_r' to reflect the new universal IDs
                # We create series that's just the universal index, with an index of the original
                # IDs. This way, if we call .to_dict() and create a dictionary of index->value, we
                # get a dictionary of original ID to universal ID.
                mainID_maps = pd.Series(data.index[data['source'] == 'main'],
                                        index=data[data['source'] == 'main']['id'])
                altID_maps = pd.Series(data.index[data['source'] == 'alt'],
                                       index=data[data['source'] == 'alt']['id'])

                edges['id_l'] = edges['id_l'].map(mainID_maps.to_dict())
                edges['id_r'] = edges['id_r'].map(altID_maps.to_dict())
            else:
                data = pd.read_csv(ns("processed.csv"),encoding='utf-8',compression=compression,
                                   dtype={'id': object, 'queen_score': float},
                                   usecols=['id', 'queen_score']) \
                         .set_index('id')
             '''

            # One node per record ID, one edge per match; each edge keeps its
            # pair key and match probability.
            G = nx.convert_matrix.from_pandas_edgelist(
                edges,
                source='id_l',
                target='id_r',
                edge_attr=['pair', 'composite_match_proba'],
            )
            # Seed the node attributes (same order as before): every node starts
            # ungrouped, not a queen and unconnected; queen_score is looked up
            # per record ID from `data`.
            node_defaults = [
                (queen_col, False),
                (group_col, 0),
                ("connection", 0),
                ("queen_score", data["queen_score"].to_dict()),
                (pair_col, 0),
            ]
            for attr_name, attr_values in node_defaults:
                nx.set_node_attributes(G, name=attr_name, values=attr_values)

            ###########################################
            ##########  Graph Basics for Us  ##########
            ###########################################
            # Our graph has nodes and edges
            # Each node is the ID of a record
            # Each edge is a match between two IDs
            #
            # Each node has a queen attribute indicating whether it's the queen of the group
            # Each node has a group_id attribute indicating which group it's in, or 0 if none
            # Each node has a connection attribute indicating its distance from the queen of
            #     the group. This is calculated by the match probability times other match probabilities
            #     on the path to the queen through the graph
            # Each node has a queen_score attribute for reference indicating how much data it has. The
            #     queen should have the highest queen_score in its group
            # Each edge has a composite_match_proba attribute indicating how likely it is the two nodes
            #     it connects are a match.
            #
            # x and y are node IDs, e.g. "11846309"
            # G[x] gets us the neighbors of x as a dict-like view keyed by neighbor ID (iterating it yields the IDs)
            # G[x][y] gets us the edge connecting x and y in the form of a dictionary of edge attributes
            # G.nodes[x] gets us the node x in the form of a dictionary of node attributes


            # Start grouping
            # Attach the queen_score of both endpoints to every edge. The second merge
            # finds a "queen_score" column on both sides, so the suffixes turn them
            # into queen_score_l (from id_l) and queen_score_r (from id_r).
            edges = edges.merge(data["queen_score"], how="left", left_on="id_l", right_index=True)
            edges = edges.merge(data["queen_score"], how="left", left_on="id_r", right_index=True, suffixes=['_l', '_r'])
            # Edge queen_score = the larger of the two endpoint scores. The column-wise
            # max is vectorised and skips a missing score, whereas a row-wise Python
            # max() returned NaN or the number depending on which side was missing.
            edges["queen_score"] = edges[["queen_score_l", "queen_score_r"]].max(axis="columns")
            # Sort primarily by descending match, then by descending queen score
            edges = edges.sort_values(["composite_match_proba", "queen_score"], ascending=[False, False])

            # Group IDs are handed out sequentially; the counter restarts at 1 here
            last_group = 1
            def new_group():
                """
                Return the next unused group ID.

                Reads the module-level counter `last_group`, advances it by one,
                and returns the value the counter held before it was advanced.
                """
                global last_group
                issued_id = last_group
                last_group = issued_id + 1
                return issued_id

            total_rows = len(edges)  # Total rows to track percentage complete
            # NOTE(review): total_rows, `current` and `local_time` are assigned but never
            # read inside this loop -- presumably left over from progress reporting.

            # A basic overview:
            # We go through all the matches (aka all the edges), starting with those
            # with the highest probability of match and then the highest queen_score
            # (the order `edges` was sorted in). That's because these will almost
            # definitely be the matches that contain the queen and, essentially,
            # they're easy pickings.
            # As we go on down the list, we let the group membership "infect" other nodes
            # as long as they're within the group_threshold.
            #
            # A node's 'connection' is the product of the match probabilities on the
            # path linking it to its group's queen (the queen herself has connection 1);
            # it is this value that gets compared against group_threshold.
            for current, row in enumerate(edges.itertuples(), 1):
                local_time = time.time()

                # Shortcut variables so I don't have to type so much:
                # left is a dictionary of node attributes of the node id_l
                # right is a dictionary of node attributes of the node id_r
                # edge is the composite_match_proba attribute of the shared edge (float)
                # pair is the 'pair' attribute of the shared edge
                # left/right are the graph's own attribute dicts, so writing to them
                # updates the graph in place.
                left = G.nodes[row.id_l]
                right = G.nodes[row.id_r]
                edge = G[row.id_l][row.id_r]['composite_match_proba']
                pair = G[row.id_l][row.id_r]['pair']

                # There are four possibilities for each edge
                # 1) Neither node has a group
                if left[group_col] == 0 and right[group_col] == 0:
                    if edge < group_threshold:
                        # This shouldn't happen as long as the
                        # match threshold >= group threshold
                        continue

                    # First assign them their own group
                    new_group_id = new_group()
                    left[group_col] = new_group_id
                    right[group_col] = new_group_id
                    # Then pick who will be the de facto queen: the node with the
                    # strictly higher queen_score. On a tie (or whenever the comparison
                    # is False for any other reason) the right node becomes queen.
                    # Both nodes record the pair key of the edge that formed the group.
                    if (left['queen_score'] > right['queen_score']):
                        left['connection'] = 1
                        left[queen_col] = True
                        right['connection'] = edge
                        right[pair_col] = pair
                        left[pair_col] = pair
                    else:
                        right['connection'] = 1
                        right[queen_col] = True
                        left['connection'] = edge
                        right[pair_col] = pair
                        left[pair_col] = pair

                # 2) Both nodes have the same group
                elif left[group_col] != 0 and left[group_col] == right[group_col]:
                    # For both nodes, see if they would get a higher connection if
                    # they connected to the queen through each other instead of
                    # however they got connected in the first place.
                    # This is unlikely to happen in practice.
                    # Only the node whose connection improves has its pair key updated.
                    if right['connection'] < left['connection'] * edge:
                        right['connection'] = left['connection'] * edge
                        right[pair_col] = pair
                        #left['pair'] = pair
                    elif left['connection'] < right['connection'] * edge:
                        left['connection'] = right['connection'] * edge
                        left[pair_col] = pair
                        #right['pair'] = pair

                # 3) Only one node has a group
                elif left[group_col] != 0 and right[group_col] == 0:
                    # Check if the ungrouped node could
                    # connect to the grouped node
                    if left['connection'] * edge >= group_threshold:
                        right[group_col] = left[group_col]
                        right['connection'] = left['connection'] * edge
                        right[pair_col] = pair
                        #left['pair'] = pair
                elif right[group_col] != 0 and left[group_col] == 0:
                    # Check if the ungrouped node could
                    # connect to the grouped node
                    if right['connection'] * edge >= group_threshold:
                        left[group_col] = right[group_col]
                        left['connection'] = right['connection'] * edge
                        left[pair_col] = pair
                        #right['pair'] = pair

                # 4) The two nodes are in different groups
                #
                # When a node switches groups, we should check its old groupmates
                # and see if they also want to switch groups. Practically, this won't
                # happen because how quickly the connection to the queen degrades. But
                # this will be a recursive function, checking neighbors and neighbors of
                # neighbors.
                elif (left[group_col] != 0 and right[group_col] != 0 and
                      left[group_col] != right[group_col]):
                    # NOTE: this def is re-executed every time an edge lands in case 4
                    def cascade_conversion(converted_id):
                        """
                        Checks to see if the neighbors of converted_id would get better
                        connections if they also converted to the same group as converted_id.

                        Any neighbor in a different group whose connection is lower than
                        converted's connection * edge probability * group_merge_limiter is
                        moved into converted's group, and the check then recurses from
                        that neighbor. Neither the queen flag nor the pair key of a moved
                        node is touched here.
                        """
                        converted = G.nodes[converted_id]
                        for neighbor_id in G[converted_id]:
                            neighbor = G.nodes[neighbor_id]
                            # Local to this function; it does not change the loop's `edge`
                            edge = G[converted_id][neighbor_id]["composite_match_proba"]
                            if neighbor[group_col] == converted[group_col]:
                                continue
                            if neighbor['connection'] < converted['connection'] * edge * group_merge_limiter: #discourages/encourages groups from merging
                                neighbor[group_col] = converted[group_col]
                                # The limiter only affects the decision above; the stored
                                # connection is the plain product
                                neighbor['connection'] = converted['connection'] * edge
                                #neighbor['pair'] = pair
                                #converted['pair'] = pair
                                cascade_conversion(neighbor_id)

                    # Check if each node would have a better connection with
                    # the other node's group, and change accordingly.
                    # At most one of the two nodes is converted per edge (if / elif).
                    if right['connection'] < left['connection'] * edge * group_merge_limiter: #discourages/encourages groups from merging
                        right[group_col] = left[group_col]
                        right['connection'] = left['connection'] * edge
                        #right['pair'] = pair
                        #left['pair'] = pair
                        cascade_conversion(row.id_r)
                    elif left['connection'] < right['connection'] * edge * group_merge_limiter: #discourages/encourages groups from merging
                        left[group_col] = right[group_col]
                        left['connection'] = right['connection'] * edge
                        #left['pair'] = pair
                        #right['pair'] = pair
                        cascade_conversion(row.id_l)
                else:
                    # This... shouldn't ever ever happen
                    # Pretty sure the four cases above cover all possible
                    # outcomes. But if we do come across something, raise
                    # an error and print out some details about where it happened
                    # (both nodes' attributes and the edge's attributes).
                    details = repr(G.nodes[row.id_l]) + \
                              repr(G.nodes[row.id_r]) + \
                              repr(G[row.id_l][row.id_r])
                    raise ValueError("Didn't expect an excerpt " + details)

            # Pull the graph's per-node attributes back out into a dataframe:
            # one row per node (record ID), one column per node attribute.
            node_ids = list(G.nodes)
            groups = pd.DataFrame.from_records(
                [G.nodes[node_id] for node_id in node_ids],
                index=node_ids,
            )
            # The node's "connection" is stored as this pass's queen probability
            groups = groups.rename(columns={'connection': queen_proba_col})

            # Attach this pass's columns to the processed data
            if matching:
                csv_options = dict(encoding='utf-8', compression=compression, dtype=object)
                data_left = pd.read_csv(ns("processed_l.csv"), **csv_options).assign(source="main")
                data_right = pd.read_csv(ns("processed_r.csv"), **csv_options).assign(source="alt")
                data = pd.concat([data_left, data_right], ignore_index=True)
                grouped = data.join(groups[[pair_col, group_col, queen_proba_col, queen_col]])
                grouped = grouped.set_index(['source', 'id'])
            else:
                # Deduplication: `grouped` is already indexed by record ID, so this
                # pass's columns are simply joined onto it
                grouped = grouped.join(groups[[pair_col, group_col, queen_proba_col, queen_col]])

            sort_keys = [group_col, queen_proba_col, 'queen_score']
            grouped = grouped.sort_values(sort_keys, ascending=[True, False, False])

            # Start every row's dedup score off as its queen probability
            grouped[dedup_score_col] = grouped[queen_proba_col]

            # A group_merge_limiter above 1 can leave several queens in the same group
            # (and groups with a single member), so tidy both cases up here.
            if group_merge_limiter > 1:
                for g in grouped[group_col].drop_duplicates():
                    # Snapshot of the group's rows; membership is not changed by the
                    # queen clean-up below, so one lookup serves both checks.
                    members = grouped[grouped[group_col] == g]

                    if (members[queen_col] == True).sum() > 1:
                        # `grouped` is sorted so the group's first row is the true
                        # queen; every other row gives up the queen flag.
                        for index in members.index[1:]:
                            grouped.at[index, queen_col] = False

                    if members.shape[0] == 1: #this should really be fixed during clustering step, but I am being lazy
                        # A lone record is not a group: send it back to "no group".
                        # (This line used the undefined name `queen_cpl` and raised a
                        # NameError as soon as a single-member group was found.)
                        for index in members.index:
                            grouped.at[index, group_col] = 0
                            grouped.at[index, queen_col] = False
                # Restore the ordering after group IDs may have been reset to 0
                grouped = grouped.sort_values([group_col, queen_proba_col, 'queen_score'], ascending=[True, False, False])

            # For every real group of this pass (group ID other than 0), find the
            # strongest connection held by any of its non-queen members.
            follower_rows = grouped[queen_col].eq(False) & grouped[group_col].ne(0)
            queens_score = (
                grouped.loc[follower_rows]
                .groupby(group_col)[queen_proba_col]
                .max()
            )
            # Give each queen her group's maximum as this pass's dedup score. The
            # values are assigned by position, which relies on the queens appearing
            # in the same group order as the groupby result.
            queen_rows = grouped[queen_col].eq(True)
            grouped.loc[queen_rows, dedup_score_col] = queens_score.values
            
        else:
            #do nothing if edges is empty, process will continue untill the end of range.  probably a better way to do this.
            edges
            
    # Flesh out the group IDs, queen flags and dedup scores for passes 2, 3, etc.
    # Each pass only grouped the queens of the pass before it, so the non-queen
    # members of those earlier groups still have to be carried into the new columns.
    for r in rounds:
        # r looks like 'group_id 2'. Take the whole trailing token rather than the
        # last character, so a two-digit pass number would still be read correctly.
        round_number = r.split()[-1]
        queen = 'queen ' + str(round_number)
        dedup_score = 'dedup_score ' + str(round_number)
        if round_number == '2':
            # Pass 2 builds on the original, unsuffixed columns
            previous_round = 'group_id'
            previous_queen = 'queen'
            previous_dedup_score = 'dedup_score'
        else:
            previous_round = 'group_id ' + str(int(round_number)-1)
            previous_queen = 'queen ' + str(int(round_number)-1)
            previous_dedup_score = 'dedup_score ' + str(int(round_number)-1)

        # Filling in groups and dedup scores for the members of re-grouped groups
        for g in grouped[(grouped[r].notnull()) & (grouped[r] != 0)][r].drop_duplicates():
            for p in grouped[grouped[r] == g][previous_round].drop_duplicates():
                # The loop body never writes to the previous-round column, so this
                # mask stays valid for the whole iteration.
                in_previous_group = grouped[previous_round] == p
                # Score the previous group's queen received in this pass. .iloc[0]
                # takes the first such row by position; a plain [0] on the ID-labelled
                # index depends on pandas' deprecated positional fallback.
                queen_proba = grouped[in_previous_group & (grouped[previous_queen] == True)][dedup_score].iloc[0]
                for index, row in grouped[in_previous_group & (grouped[previous_queen] == False)].iterrows():
                    grouped.at[index, r] = g  # fill in the group_id
                    grouped.at[index, queen] = False
                    # Chain the member's score to its old queen with that queen's score in the new group
                    grouped.at[index, dedup_score] = row[previous_dedup_score] * queen_proba

        # Previous-round groups that were not re-grouped in this pass get a fresh
        # group ID of their own and keep their old queen flag and dedup score, so
        # that each pass's columns give a complete picture by themselves.
        max_g = grouped[grouped[r].notnull()][r].max()
        for p in grouped[((grouped[previous_round].notnull() & (grouped[previous_round] != 0)))
                        & ((grouped[r].isnull()) | (grouped[r] == 0))][previous_round].drop_duplicates():
            for index, row in grouped[grouped[previous_round] == p].iterrows():
                grouped.at[index, r] = max_g + 1
                grouped.at[index, queen] = row[previous_queen]
                grouped.at[index, dedup_score] = row[previous_dedup_score]
            max_g = max_g + 1
            
        
    # Turn the index back into ordinary column(s) before saving
    grouped=grouped.reset_index()
    # NOTE(review): no index=False here, so the new integer index is written as an
    # extra leading column, unlike the to_csv call in the `else` branch -- confirm intended
    grouped.to_csv(ns("grouped.csv"))
    # NOTE(review): start_time is reset at the top of every pass, so this is the time
    # since the last pass started rather than the time for the whole cell
    print("Grouped all matches --", time.time() - start_time, "seconds")

else:
    #return to original column names where appropriate
    for col in grouped.columns:
        if (col in cols):
            grouped.rename(columns={col:cols[col]},inplace=True)

    if matching:
        for col in grouped.columns:
            if (col in cols_alt) & (col != 'id'):
                grouped.rename(columns={col:cols_alt[col]},inplace=True)
    
    grouped.to_csv(ns("grouped.csv"), index=False)
         

In [None]:
if not matching: #still need to figure out how I am handling matching
    print("MERGE DATA MATCH ELEMENTS WITH ORIGINAL FILES")
    start_time = time.time()

    # Save the cols dictionary (standard column name -> original column name) for later use
    cols_df = pd.DataFrame({
        'original_column_name': list(cols.values()),
        'standard_column_name': list(cols.keys()),
    })
    cols_df.dropna(inplace=True)
    cols_df.to_csv(ns('cols_df.csv'),index=False)

    # Isolate the new columns to append to the original data: every column of
    # `grouped` positioned after the original file's columns counts as new.
    original_df = pd.read_table(data_details['filepath_or_buffer'],sep=data_details['sep'],converters=data_details['converters'])
    # The column count comes from the frame just loaded; the file is no longer
    # read a second time only to count its columns.
    start_cols = original_df.shape[1]
    end_cols = grouped.shape[1]
    new_cols = list(grouped.columns[start_cols:])
    id_col = cols['id']
    new_cols.append(id_col)

    original_df[id_col] = original_df[id_col].astype(str)
    # Handles the scenario where there was 1 NULL value for some reason, which
    # messed up the typing of the ID column for the join: reload the saved file,
    # drop the rows without an ID and strip the float formatting from the rest.
    if grouped[id_col].isnull().any():
        grouped = pd.read_csv(ns('grouped.csv'))
        grouped.dropna(subset=[id_col],inplace=True)
        grouped[id_col] = grouped[id_col].astype(float)
        grouped[id_col] = grouped[id_col].astype(int)

    # Both sides are joined on the ID as a string
    grouped[id_col] = grouped[id_col].astype(str)
    original_w_grouped = pd.merge(original_df, grouped[new_cols], on=id_col)

    print("Grouped features appended to original data --- %s seconds ---" % round(time.time() - start_time, 2))
    # NOTE(review): the result of this expression is not assigned, and because it
    # sits inside the `if` it is presumably not displayed by the notebook either
    original_w_grouped[original_w_grouped.group_id != 0].sort_values(by=['group_id','queen_proba'],ascending=[True,False])

In [None]:
# Combine the grouped file with the match features, joined on the pair key
if not matching:
    print("MERGE GROUPED WITH ORIGINAL MATCH FEATURES")
    start_time = time.time()
    matches = pd.read_csv(ns('matches.csv'),compression=compression)
    # Left join: keep every row of the grouped data, with or without match features
    merged = pd.merge(original_w_grouped, matches, how='left', on='pair')
    merged = merged.sort_values(by=['group_id', 'queen_proba'], ascending=[True, False])
    merged.to_csv(ns('grouped_w_features.csv'),index=False)
    merged.to_excel(ns('grouped_w_features.xlsx'),index=False)
    # Best effort: string_columns_df may not exist in this session. Catching
    # Exception (not a bare except) keeps that behaviour without also swallowing
    # KeyboardInterrupt / SystemExit.
    try:
        string_columns_df.to_csv(ns('string_columns.csv'),index=False)
    except Exception:
        print("no string columns df available")
    print("Match features appended to original data --- %s seconds ---" % round(time.time() - start_time, 2))

In [None]:
# Notebook display: the rows of `merged` whose group_id is not 0
merged[merged.group_id != 0]

In [None]:
# Notebook display: show the `cols` column-name mapping
cols