<div class="alert" style="background-color:#fff; color:white; padding:0px 10px; border-radius:5px;"><h1 style='margin:15px 15px; color:#006a79; font-size:40px'>Customer Account Identification through Demographics</h1>
</div>

In [1]:
# Importing all required libraries

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from itertools import permutations, combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

import matplotlib.pyplot as plt
import json
import nltk
import re
%matplotlib inline
import seaborn as sns

<div class="alert alert-info" style="background-color:#006a79; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>User Defined Functions</h2>
</div>

In [2]:
# Identify null values for all the columns in dataframe

def getNullStats(df):
    tbl_results = []
    print('Total Features(Columns) of dataset = ', len(df.columns))
    total_samples = len(df)
    null_samples = df.isnull().sum()
    tbl_results = pd.concat([null_samples, round(null_samples/total_samples*100, 2)], axis=1)
    tbl_results = tbl_results.rename(columns = {0:'Nulls', 1:'Percent'})
    tbl_results = tbl_results[tbl_results.iloc[:, 1] !=0].sort_values('Nulls', ascending=False).round(2)
    print('Null Features(Columns) of dataset = ', len(tbl_results))
    return tbl_results

In [3]:
# Clearn JSON structure

def clean_json(x):
# "Create apply function for decoding JSON"
    return json.loads(x) 

In [4]:
# Check if 'subset' is sub set of 'supset'

def isSubset(df, supset, subset):
    # Get max width of subset cell
    # width_subset = int(df[subset].str.encode(encoding='utf-8').str.len().max())
    width_subset = 11
        
    # Get indices of non null rows of subset
    subset_list = df[subset][~df[subset].isnull()].index
    
    df_supset = (df[supset].astype('str')).str[:width_subset][subset_list]
    #print(df_supset)
    
    df_subset = df[subset].str[:width_subset][subset_list]
    #print(df_subset)
    
    return df_supset.equals(df_subset)

In [5]:
def tokenize(subject):
    tokens = [word.lower() for sent in nltk.sent_tokenize(subject) for word in nltk.word_tokenize(sent)]
    
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token) and (len(token) > 1):
            filtered_tokens.append(token)

    return filtered_tokens

In [6]:
def getPermutations(subject, n):
    lstTokens = tokenize(subject)
    lstTuples = list(permutations(lstTokens, n))
    lstItems = []
    lstItems.extend([' '.join(item) for item in lstTuples])
    
    return lstItems

In [7]:
def flat(lst):
    if isinstance(lst, list):
        for item in lst:
            yield from flat(item)
    else:        
        yield lst

In [8]:
def getAllPermutations(subject):
    lstItems = []
    final = []

    tokens = tokenize(subject)
    subject_length_words = len(tokens)
    lstItems.extend(getPermutations(subject, int(i+1)) for i in np.arange(int(subject_length_words)))
    
    final.append([item for item in lstItems])
    return list(flat(final))

In [9]:
def kMeansClustering(tfidf_mat, nClusters):
    km = KMeans(n_clusters=nClusters)
    %time km.fit(tfidf_mat)
    clusters = km.labels_.tolist()
    return km

In [10]:
import warnings
warnings.filterwarnings("ignore")

In [11]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.min_rows', 130)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.width', 10000)
pd.set_option('display.max_colwidth', 0)

In [12]:
# Read customer data
dm_df = pd.read_csv('cust360_customer_demographics_linked_1000.csv',  sep='|')

# Include header in the dataframe
dm_df.columns=[
    'cust_id', 'acct_id', 'cm11', 'cm13', 'cm15', 'alt_acct_id', 'parnt_acct_no', 'mbr_rwrd_id',
    'setup_refer_acct_id', 'emb_indv_nm', 'emb_prcs_indv_nm', 'indv_prim_ttl_nm',
    'indv_prim_pfx_nm', 'indv_prim_first_nm', 'indv_prim_mid_nm', 'indv_prim_lst_nm',
    'indv_prim_suff_nm', 'indv_prim_add_lst_nm', 'indv_prim_full_nm', 'indv_prim_prcs_full_nm',
    'indv_prim_prcs_first_nm', 'indv_prim_prcs_mid_nm', 'indv_prim_prcs_lst_nm', 'indv_scnd_nm',
    'lgl_nm', 'lgl_prcs_nm', 'gend_cd', 'cust_dob', 'birth_yr', 'prcs_dob', 'gov_doc_id',
    'ad_home_line_care', 'ad_home_st_line1', 'ad_home_st_line2', 'ad_home_st_line3',
    'ad_home_st_line4', 'ad_home_prcs_st_line1', 'ad_home_prcs_st_line2',
    'ad_home_prcs_st_line3', 'ad_home_prcs_st_line4', 'ad_home_geo_coord_latd',
    'ad_home_geo_coord_longt', 'ad_bus_line_care', 'ad_bus_st_line1', 'ad_bus_st_line2',
    'ad_bus_st_line3', 'ad_bus_st_line4', 'ad_bus_prcs_st_line1', 'ad_bus_prcs_st_line2',
    'ad_bus_prcs_st_line3', 'ad_bus_prcs_st_line4', 'ad_bus_geo_coord_latd',
    'ad_bus_geo_coord_longt', 'ad_alt_line_care', 'ad_alt_st_line1', 'ad_alt_st_line2',
    'ad_alt_st_line3', 'ad_alt_st_line4', 'ad_alt_prcs_st_line1', 'ad_alt_prcs_st_line2',
    'ad_alt_prcs_st_line3', 'ad_alt_prcs_st_line4', 'ad_alt_geo_coord_latd',
    'ad_alt_geo_coord_longt', 'ad_temp_line_care', 'ad_temp_st_line1', 'ad_temp_st_line2',
    'ad_temp_st_line3', 'ad_temp_st_line4', 'ad_temp_prcs_st_line1', 'ad_temp_prcs_st_line2',
    'ad_temp_prcs_st_line3', 'ad_temp_prcs_st_line4', 'ad_temp_geo_coord_latd',
    'ad_temp_geo_coord_longt', 'ad_other_line_care', 'ad_other_st_line1', 'ad_other_st_line2',
    'ad_other_st_line3', 'ad_other_st_line4', 'ad_other_prcs_st_line1', 'ad_other_prcs_st_line2',
    'ad_other_prcs_st_line3', 'ad_other_prcs_st_line4', 'ad_other_geo_coord_latd',
    'ad_other_geo_coord_longt', 'ad_add', 'home_phone_no', 'home_prcs_phone_no',
    'home_prcs_full_phone_no', 'alt_home_phone_no', 'alt_home_prcs_phone_no',
    'alt_home_prcs_full_phone_no', 'bus_phone_no', 'bus_prcs_phone_no',
    'bus_prcs_full_phone_no', 'alt_bus_phone_no', 'alt_bus_prcs_phone_no',
    'alt_bus_prcs_full_phone_no', 'mob_phone_no', 'mob_prcs_phone_no', 'mob_prcs_full_phone_no',
    'alt_mob_phone_no', 'alt_mob_prcs_phone_no', 'alt_mob_prcs_full_phone_no',
    'atty_phone_no', 'atty_prcs_phone_no', 'atty_prcs_full_phone_no', 'fax_no', 'prcs_fax_no',
    'prcs_full_fax_no', 'phone_ani', 'other_phone_no', 'other_prcs_phone_no',
    'other_prcs_full_phone_no', 'add_phone', 'srvc_email_ad', 'srvc_prcs_email_ad',
    'estmt_email_ad', 'estmt_prcs_email_ad', 'other_email_ad', 'bank_prim_acct_no',
    'bank_prim_rte_no', 'bank_prim_iban_no', 'bank_scnd_acct_no', 'bank_scnd_rte_no',
    'bank_scnd_iban_no'
]
# dm_df.head()

Notice empty cells are filled with \N character, which should be replaced with Nulls. Lets fill with Null as required

In [13]:
# Replacing \N from all cells source data with NaN/Nulls

dm_df.replace('\\N',np.nan,inplace=True)
# dm_df.head()

<font color='#006a79'> **Group independent features/columns based on domain knowledge and cursory glance over data**</font>

Let us group columns based on **Entities** to which they belong as shown below:
- **Customer entity**: 
        'cust_id' (Leave untouched for future use)


- **Account entity**:  
        'acct_id', 'cm11', 'cm13', 'cm15', 'alt_acct_id', 
        'parnt_acct_no', 'mbr_rwrd_id', 'setup_refer_acct_id',
        'bank_prim_acct_no', 'bank_prim_rte_no', 'bank_prim_iban_no',
        'bank_scnd_acct_no', 'bank_scnd_rte_no', 'bank_scnd_iban_no'

- **Name entity**:
        'emb_indv_nm', 'emb_prcs_indv_nm', 
        'indv_prim_first_nm', 'indv_prim_mid_nm', 'indv_prim_lst_nm', 'indv_prim_full_nm', 
        'indv_prim_prcs_first_nm', 'indv_prim_prcs_mid_nm', 'indv_prim_prcs_lst_nm', 'indv_prim_prcs_full_nm',
        'indv_prim_ttl_nm', 'indv_prim_pfx_nm', 'indv_prim_suff_nm', 
        'indv_prim_add_lst_nm', 'indv_scnd_nm',
        'lgl_nm', 'lgl_prcs_nm'
             
- **Gender entity**:  
        'gend_cd'


- **Date of Birth entity**:  
        'cust_dob', 'birth_yr', 'prcs_dob'


- **Government Document entity**:  
        'gov_doc_id'


- **Address entity**:  
        'ad_home_line_care', 'ad_home_st_line1', 'ad_home_st_line2', 'ad_home_st_line3', 'ad_home_st_line4', 
        'ad_home_prcs_st_line1', 'ad_home_prcs_st_line2', 'ad_home_prcs_st_line3', 'ad_home_prcs_st_line4', 
        'ad_home_geo_coord_latd', 'ad_home_geo_coord_longt', 

        'ad_bus_line_care', 'ad_bus_st_line1', 'ad_bus_st_line2', 'ad_bus_st_line3', 'ad_bus_st_line4', 
        'ad_bus_prcs_st_line1', 'ad_bus_prcs_st_line2', 'ad_bus_prcs_st_line3', 'ad_bus_prcs_st_line4', 
        'ad_bus_geo_coord_latd', 'ad_bus_geo_coord_longt', 

        'ad_alt_line_care', 'ad_alt_st_line1', 'ad_alt_st_line2', 'ad_alt_st_line3', 'ad_alt_st_line4', 
        'ad_alt_prcs_st_line1', 'ad_alt_prcs_st_line2', 'ad_alt_prcs_st_line3', 'ad_alt_prcs_st_line4', 
        'ad_alt_geo_coord_latd', 'ad_alt_geo_coord_longt', 

        'ad_temp_line_care', 'ad_temp_st_line1', 'ad_temp_st_line2', 'ad_temp_st_line3', 'ad_temp_st_line4', 
        'ad_temp_prcs_st_line1', 'ad_temp_prcs_st_line2', 'ad_temp_prcs_st_line3', 'ad_temp_prcs_st_line4', 
        'ad_temp_geo_coord_latd', 'ad_temp_geo_coord_longt', 

        'ad_other_line_care', 'ad_other_st_line1', 'ad_other_st_line2', 'ad_other_st_line3', 'ad_other_st_line4', 
        'ad_other_prcs_st_line1', 'ad_other_prcs_st_line2', 'ad_other_prcs_st_line3', 'ad_other_prcs_st_line4', 
        'ad_other_geo_coord_latd', 'ad_other_geo_coord_longt', 

        'ad_add',

 
- **Phone columns**:  
        'home_phone_no', 'home_prcs_phone_no', 'home_prcs_full_phone_no', 
        'alt_home_phone_no', 'alt_home_prcs_phone_no', 'alt_home_prcs_full_phone_no', 

        'bus_phone_no', 'bus_prcs_phone_no', 'bus_prcs_full_phone_no', 
        'alt_bus_phone_no', 'alt_bus_prcs_phone_no', 'alt_bus_prcs_full_phone_no', 

        'mob_phone_no', 'mob_prcs_phone_no', 'mob_prcs_full_phone_no', 
        'alt_mob_phone_no', 'alt_mob_prcs_phone_no', 'alt_mob_prcs_full_phone_no',

        'atty_phone_no', 'atty_prcs_phone_no', 'atty_prcs_full_phone_no', 

        'fax_no', 'prcs_fax_no', 'prcs_full_fax_no', 

        'phone_ani', 

        'other_phone_no', 'other_prcs_phone_no', 'other_prcs_full_phone_no', 

        'add_phone', 
  
- **Email columns**:  
        'srvc_email_ad', 'srvc_prcs_email_ad', 
        'estmt_email_ad', 'estmt_prcs_email_ad', 
        'other_email_ad'
               

<div class="alert alert-info" style="background-color:#006a79; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Preprocessing</h2>
</div>

**Processing <font color=red>Customer</font> entity**

- Leave **cust_id** for future use.

**Processing <font color=red>Account</font> entity**

In [14]:
cols_account = [
    'acct_id', 'cm11', 'cm13', 'cm15', 'alt_acct_id', 
    'parnt_acct_no', 'mbr_rwrd_id', 'setup_refer_acct_id',
    'bank_prim_acct_no', 'bank_prim_rte_no', 'bank_prim_iban_no',
    'bank_scnd_acct_no', 'bank_scnd_rte_no', 'bank_scnd_iban_no'
]
         
# dm_df[cols_account].head()

In [15]:
width_cm11 = int(dm_df['cm11'].str.encode(encoding='utf-8').str.len().max())
width_cm13 = int(dm_df['cm13'].str.encode(encoding='utf-8').str.len().max())
width_cm15 = int(dm_df['cm15'].str.encode(encoding='utf-8').str.len().max())
width_parnt_acct_no = int(dm_df['parnt_acct_no'].str.encode(encoding='utf-8').str.len().max())

# print(width_cm11)
# print(width_cm13)
# print(width_cm15)
# print(width_parnt_acct_no)

In [16]:
print('Is "acct_id" is a superset of "cm11 ?"', isSubset(dm_df, 'acct_id', 'cm11'))
print('Is "acct_id" is a superset of "cm13 ?"', isSubset(dm_df, 'acct_id', 'cm13'))
print('Is "acct_id" is a superset of "cm15 ?"', isSubset(dm_df, 'acct_id', 'cm15'))
print('Is "acct_id" is a superset of "parnt_acct_no ?"', isSubset(dm_df, 'acct_id', 'parnt_acct_no'))

Is "acct_id" is a superset of "cm11 ?" True
Is "acct_id" is a superset of "cm13 ?" True
Is "acct_id" is a superset of "cm15 ?" True
Is "acct_id" is a superset of "parnt_acct_no ?" True


In [17]:
print('Is "acct_id" is a superset of "parnt_acct_no ?"', isSubset(dm_df, 'acct_id', 'setup_refer_acct_id'))

Is "acct_id" is a superset of "parnt_acct_no ?" False


We notice that the columns viz., **cm11, cm13, cm15 and parnt_acct_no** are subsets of column **acct_id** and can be discarded from futher processing. We also discard **setup_refer_acct_id**.

- It is evident that columns cm11, cm13, cm15 are subsets of acct_id.
- setup_refer_acct_id is a combination of alt_acct_id & parnt_acct_no columns

so they can be removed. Similarly, a glance over other columns shows non demographic columns and so we discard them as well from our dataframe:

In [18]:
# Removing below columns including those that does not belong to demographic information
drop_cols_account = [
    'cm11', 'cm13', 'cm15', 'alt_acct_id', 
    'parnt_acct_no', 'mbr_rwrd_id', 'setup_refer_acct_id',
    'bank_prim_rte_no', 'bank_prim_iban_no',
    'bank_scnd_rte_no', 'bank_scnd_iban_no'
]

print('No. of columns before drop = ', dm_df.shape[1])
dm_df.drop(drop_cols_account, axis=1, inplace=True)
print('No. of columns after drop = ', dm_df.shape[1])

No. of columns before drop =  127
No. of columns after drop =  116


**Processing <font color=red>Name</font> entity**

In [19]:
cols_name = ['emb_indv_nm', 'emb_prcs_indv_nm', 
             'indv_prim_first_nm', 'indv_prim_mid_nm', 'indv_prim_lst_nm', 'indv_prim_full_nm', 
             'indv_prim_prcs_first_nm', 'indv_prim_prcs_mid_nm', 'indv_prim_prcs_lst_nm', 'indv_prim_prcs_full_nm',
             'indv_prim_ttl_nm', 'indv_prim_pfx_nm', 'indv_prim_suff_nm', 
             'indv_prim_add_lst_nm', 'indv_scnd_nm',
             'lgl_nm', 'lgl_prcs_nm']
dm_df[cols_name].head()

Unnamed: 0,emb_indv_nm,emb_prcs_indv_nm,indv_prim_first_nm,indv_prim_mid_nm,indv_prim_lst_nm,indv_prim_full_nm,indv_prim_prcs_first_nm,indv_prim_prcs_mid_nm,indv_prim_prcs_lst_nm,indv_prim_prcs_full_nm,indv_prim_ttl_nm,indv_prim_pfx_nm,indv_prim_suff_nm,indv_prim_add_lst_nm,indv_scnd_nm,lgl_nm,lgl_prcs_nm
0,,,MEGAN,,GREEN,MEGAN GREEN,MEGAN,,GREEN,MEGAN GREEN,,,,,,,
1,,,DAVID,,FLEENER,DAVID FLEENER,DAVID,,FLEENER,DAVID FLEENER,,,,,,,
2,LAURA PALLAS,LAURA PALLAS,LAURA,,PALLAS,LAURA PALLAS,LAURA,,PALLAS,LAURA PALLAS,,,,,,LOUD BOX ENT,LOUD BOX ENT
3,,,BERNADETTE,,REYNA,BERNADETTE REYNA,BERNADETTE,,REYNA,BERNADETTE REYNA,,,,,,,
4,NARINDER S SAWHNEY,NARINDER S SAWHNEY,NARINDER,S,SAWHNEY,NARINDER S SAWHNEY,NARINDER,S,SAWHNEY,NARINDER S SAWHNEY,,,,,,IMPORT BOUTIQUE,IMPORT BOUTIQUE


A glance over the names reveals following details:
- column **emb_prcs_indv_nm** is a processed form of the column **emb_indv_nm**
- column **indv_prim_full_nm** is a concatenated form of the columns **indv_prim_first_nm, indv_prim_mid_nm, indv_prim_lst_nm**
- column **indv_prim_prcs_full_nm** is a processed form of the column **indv_prim_full_nm**
- column **indv_prim_prcs_full_nm** is a concatenated form of the columns **indv_prim_prcs_first_nm, indv_prim_prcs_mid_nm,    indv_prim_prcs_lst_nm**
- column **lgl_prcs_nm** is a processed form of the column **lgl_nm**

Let us take following steps moving further:
- retain only the column **indv_prim_prcs_full_nm** and discard all other columns

In [20]:
drop_cols_name = [
    'emb_indv_nm', 'emb_prcs_indv_nm', 
    'indv_prim_first_nm', 'indv_prim_mid_nm', 'indv_prim_lst_nm', 'indv_prim_full_nm', 
    'indv_prim_prcs_first_nm', 'indv_prim_prcs_mid_nm', 'indv_prim_prcs_lst_nm', 
    'indv_prim_ttl_nm', 'indv_prim_pfx_nm', 'indv_prim_suff_nm', 
    'indv_prim_add_lst_nm', 'indv_scnd_nm',
    'lgl_nm', 'lgl_prcs_nm'
]

dm_df.drop(columns=drop_cols_name, axis=1, inplace=True)

**Processing <font color=red>Gender</font> entity**

In [21]:
dm_df['gend_cd'].value_counts()

UNKNOWN    7079
MALE       234 
FEMALE     99  
Name: gend_cd, dtype: int64

As we see there are several values of this column are **Unknown**. Though we can use **NLP techniques** to deduce gender code from the names, we revisit this column if need arise. For the time being we discard this column.

In [22]:
drop_cols_gender = ['gend_cd']

# Drop columns
dm_df.drop( columns=drop_cols_gender, axis=1, inplace=True)

**Processing <font color=red>Date of Birth</font> entity**

In [23]:
dm_df[['cust_dob', 'birth_yr', 'prcs_dob']].head()

Unnamed: 0,cust_dob,birth_yr,prcs_dob
0,1994-12-18,1994,1994-12-18
1,1972-08-25,1972,1972-08-25
2,1960-08-29,1960,1960-08-29
3,1994-07-09,1994,1994-07-09
4,1938-09-04,1938,1938-09-04


In [24]:
dm_df[['cust_dob', 'prcs_dob']].isnull().sum()

cust_dob    29 
prcs_dob    540
dtype: int64

In [25]:
dm_df['DateOfBirth'] = np.nan

cols_Dob = ['cust_dob', 'prcs_dob']

for col in cols_Dob:
    dm_df['DateOfBirth'].fillna(dm_df[col], inplace=True)

drop_cols_Dob = ['cust_dob', 'prcs_dob', 'birth_yr']
dm_df.drop(columns=drop_cols_Dob, axis=1, inplace=True)

**Processing <font color=red>Address</font> entity**

In [26]:
# List columns of interest
cols_address = [
    'ad_home_prcs_st_line1', 'ad_home_prcs_st_line2', 'ad_home_prcs_st_line3', 'ad_home_prcs_st_line4', 
    'ad_bus_prcs_st_line1', 'ad_bus_prcs_st_line2', 'ad_bus_prcs_st_line3', 'ad_bus_prcs_st_line4', 
    'ad_alt_prcs_st_line1', 'ad_alt_prcs_st_line2', 'ad_alt_prcs_st_line3', 'ad_alt_prcs_st_line4', 
    'ad_temp_prcs_st_line1', 'ad_temp_prcs_st_line2', 'ad_temp_prcs_st_line3', 'ad_temp_prcs_st_line4',
    'ad_other_prcs_st_line1', 'ad_other_prcs_st_line2', 'ad_other_prcs_st_line3', 'ad_other_prcs_st_line4'
]

cols_lattitude = [
    'ad_home_geo_coord_latd', 
    'ad_bus_geo_coord_latd', 
    'ad_alt_geo_coord_latd', 
    'ad_temp_geo_coord_latd', 
    'ad_other_geo_coord_latd'
    ]

cols_longitude = [
    'ad_home_geo_coord_longt', 
    'ad_bus_geo_coord_longt', 
    'ad_alt_geo_coord_longt', 
    'ad_temp_geo_coord_longt', 
    'ad_other_geo_coord_longt'
    ]

# Create new columns
dm_df['AddressLine'] = np.nan
dm_df['Lattitude'] = np.nan
dm_df['Longitude'] = np.nan

# Combine values into new columns
for address in cols_address:
    dm_df['AddressLine'].fillna(dm_df[address], inplace=True)

for latd in cols_lattitude:
    dm_df['Lattitude'].fillna(dm_df[latd], inplace=True)
    
for lngt in cols_longitude:
    dm_df['Longitude'].fillna(dm_df[lngt], inplace=True)
    

# List unused columns
drop_cols_address_unused = [
    'ad_home_line_care', 'ad_home_st_line1', 'ad_home_st_line2', 'ad_home_st_line3', 'ad_home_st_line4',
    'ad_bus_line_care', 'ad_bus_st_line1', 'ad_bus_st_line2', 'ad_bus_st_line3', 'ad_bus_st_line4',
    'ad_alt_line_care', 'ad_alt_st_line1', 'ad_alt_st_line2', 'ad_alt_st_line3', 'ad_alt_st_line4',
    'ad_temp_line_care', 'ad_temp_st_line1', 'ad_temp_st_line2', 'ad_temp_st_line3', 'ad_temp_st_line4',
    'ad_other_line_care', 'ad_other_st_line1', 'ad_other_st_line2', 'ad_other_st_line3', 'ad_other_st_line4',
    'ad_add' 
]

dm_df.drop(columns=cols_address, axis=1, inplace=True)
dm_df.drop(columns=cols_lattitude, axis=1, inplace=True)
dm_df.drop(columns=cols_longitude, axis=1, inplace=True)
dm_df.drop(columns=drop_cols_address_unused, axis=1, inplace=True)

**Processing <font color=red>Phone</font> entity**

In [27]:
# List columns of interest
cols_phone = [
    'home_phone_no', 'home_prcs_phone_no',
    'home_prcs_full_phone_no', 'alt_home_phone_no', 'alt_home_prcs_phone_no',
    'alt_home_prcs_full_phone_no', 'bus_phone_no', 'bus_prcs_phone_no',
    'bus_prcs_full_phone_no', 'alt_bus_phone_no', 'alt_bus_prcs_phone_no',
    'alt_bus_prcs_full_phone_no', 'mob_phone_no', 'mob_prcs_phone_no', 'mob_prcs_full_phone_no',
    'alt_mob_phone_no', 'alt_mob_prcs_phone_no', 'alt_mob_prcs_full_phone_no',
    'atty_phone_no', 'atty_prcs_phone_no', 'atty_prcs_full_phone_no', 'fax_no', 'prcs_fax_no',
    'prcs_full_fax_no', 'phone_ani', 'other_phone_no', 'other_prcs_phone_no',
    'other_prcs_full_phone_no', 'add_phone'
]

dm_df['CustomerPhone'] = np.nan

for phone in cols_phone:
    dm_df['CustomerPhone'].fillna(dm_df[phone], inplace=True)
    
print('Phone with nulls = ', dm_df['CustomerPhone'].isnull().sum())

dm_df.drop(cols_phone, axis=1, inplace=True)

Phone with nulls =  2575


**Processing <font color=red>Email</font> entity**

In [28]:
dm_df[['srvc_email_ad','srvc_prcs_email_ad','estmt_email_ad','estmt_prcs_email_ad','other_email_ad']].isnull().sum()

srvc_email_ad          5394
srvc_prcs_email_ad     5406
estmt_email_ad         5194
estmt_prcs_email_ad    5211
other_email_ad         7995
dtype: int64

In [29]:
# List columns of interest
cols_emails = ['srvc_email_ad','srvc_prcs_email_ad','estmt_email_ad','estmt_prcs_email_ad','other_email_ad']

dm_df['Email_ID'] = np.nan

for email in cols_emails:
    dm_df['Email_ID'].fillna(dm_df[email], inplace=True)
    
print('Null email values = ', dm_df['Email_ID'].isnull().sum())

dm_df.drop(cols_emails, axis=1, inplace=True)

Null email values =  4860


**Processing <font color=red>Government Document</font> (JSON column)**

In [30]:
# Processing gov_doc_id to fetch Social Security Numbers(SSNs):

df = pd.DataFrame(dm_df['gov_doc_id'])
df.head()

Unnamed: 0,gov_doc_id
0,"[{""id"":""271980164"",""typ"":""SSN"",""id_last4"":""0164"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2020-01-21T00:00"",""processed"":{""id"":""271980164"",""std_ind"":""Y""}}]"
1,"[{""id"":""366061010"",""typ"":""SSN"",""id_last4"":""1010"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2013-11-16T00:00"",""processed"":{""id"":""366061010"",""std_ind"":""Y""}}]"
2,"[{""lst_updt_src"":""OTL_CRPS_GNA"",""processed"":{""id"":""606947815"",""std_ind"":""E""},""typ"":""SSN"",""id"":""606947815"",""lst_updt_ts"":""2014-03-21T13:58:40.946763"",""id_last4"":""7815""}]"
3,"[{""id"":""601432001"",""typ"":""SSN"",""id_last4"":""2001"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2016-11-01T00:00"",""processed"":{""id"":""601432001"",""std_ind"":""Y""}}]"
4,"[{""lst_updt_src"":""OTL_CRPS_GNA"",""processed"":{""typ"":""SSN"",""id"":""408983148"",""std_ind"":""Y""},""issue_ctry_cd"":""AS"",""typ"":""SSN"",""id"":""408983148"",""lst_updt_ts"":""2019-05-04T18:36:00.016348"",""id_last4"":""3148""}]"


In [31]:
# Create a column to hold SSNs to be fetched from gov_doc_id

df['ssn_id'] = ''
df.head()

Unnamed: 0,gov_doc_id,ssn_id
0,"[{""id"":""271980164"",""typ"":""SSN"",""id_last4"":""0164"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2020-01-21T00:00"",""processed"":{""id"":""271980164"",""std_ind"":""Y""}}]",
1,"[{""id"":""366061010"",""typ"":""SSN"",""id_last4"":""1010"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2013-11-16T00:00"",""processed"":{""id"":""366061010"",""std_ind"":""Y""}}]",
2,"[{""lst_updt_src"":""OTL_CRPS_GNA"",""processed"":{""id"":""606947815"",""std_ind"":""E""},""typ"":""SSN"",""id"":""606947815"",""lst_updt_ts"":""2014-03-21T13:58:40.946763"",""id_last4"":""7815""}]",
3,"[{""id"":""601432001"",""typ"":""SSN"",""id_last4"":""2001"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2016-11-01T00:00"",""processed"":{""id"":""601432001"",""std_ind"":""Y""}}]",
4,"[{""lst_updt_src"":""OTL_CRPS_GNA"",""processed"":{""typ"":""SSN"",""id"":""408983148"",""std_ind"":""Y""},""issue_ctry_cd"":""AS"",""typ"":""SSN"",""id"":""408983148"",""lst_updt_ts"":""2019-05-04T18:36:00.016348"",""id_last4"":""3148""}]",


In [32]:
# Replace Null values of processed SSNs with the string "Blank Value"
df['gov_doc_id'].fillna('{"processed.id":"Blank Value"}', inplace=True)

In [33]:
df['gov_doc_id'] = df['gov_doc_id'].apply(clean_json)
df['gov_doc_id']

0       [{'id': '271980164', 'typ': 'SSN', 'id_last4': '0164', 'lst_updt_src': 'OTL_CRPS', 'lst_updt_ts': '2020-01-21T00:00', 'processed': {'id': '271980164', 'std_ind': 'Y'}}]                                                   
1       [{'id': '366061010', 'typ': 'SSN', 'id_last4': '1010', 'lst_updt_src': 'OTL_CRPS', 'lst_updt_ts': '2013-11-16T00:00', 'processed': {'id': '366061010', 'std_ind': 'Y'}}]                                                   
2       [{'lst_updt_src': 'OTL_CRPS_GNA', 'processed': {'id': '606947815', 'std_ind': 'E'}, 'typ': 'SSN', 'id': '606947815', 'lst_updt_ts': '2014-03-21T13:58:40.946763', 'id_last4': '7815'}]                                     
3       [{'id': '601432001', 'typ': 'SSN', 'id_last4': '2001', 'lst_updt_src': 'OTL_CRPS', 'lst_updt_ts': '2016-11-01T00:00', 'processed': {'id': '601432001', 'std_ind': 'Y'}}]                                                   
4       [{'lst_updt_src': 'OTL_CRPS_GNA', 'processed': {'typ': 'SSN', 'id': '408983148',

In [34]:
# Check how can we access SSN from the series

pd.json_normalize(df['gov_doc_id'][0])

Unnamed: 0,id,typ,id_last4,lst_updt_src,lst_updt_ts,processed.id,processed.std_ind
0,271980164,SSN,164,OTL_CRPS,2020-01-21T00:00,271980164,Y


In [35]:
pd.json_normalize(df['gov_doc_id'][0])['processed.id']

0    271980164
Name: processed.id, dtype: object

In [36]:
cnt = len(df['gov_doc_id'])
missing_ssd = False
for i in np.arange(cnt):
    try:
        a_row = pd.json_normalize(df['gov_doc_id'][i])['processed.id']
    except:
        missing_ssd = True
    if missing_ssd:
        df['ssn_id'].iloc[i] = 'Missing SSN'
    else:
        missing_ssd = False
        df['ssn_id'].iloc[i] = str(a_row[0])
    #print(i, a_row[0])

In [37]:
df

Unnamed: 0,gov_doc_id,ssn_id
0,"[{'id': '271980164', 'typ': 'SSN', 'id_last4': '0164', 'lst_updt_src': 'OTL_CRPS', 'lst_updt_ts': '2020-01-21T00:00', 'processed': {'id': '271980164', 'std_ind': 'Y'}}]",271980164
1,"[{'id': '366061010', 'typ': 'SSN', 'id_last4': '1010', 'lst_updt_src': 'OTL_CRPS', 'lst_updt_ts': '2013-11-16T00:00', 'processed': {'id': '366061010', 'std_ind': 'Y'}}]",366061010
2,"[{'lst_updt_src': 'OTL_CRPS_GNA', 'processed': {'id': '606947815', 'std_ind': 'E'}, 'typ': 'SSN', 'id': '606947815', 'lst_updt_ts': '2014-03-21T13:58:40.946763', 'id_last4': '7815'}]",606947815
3,"[{'id': '601432001', 'typ': 'SSN', 'id_last4': '2001', 'lst_updt_src': 'OTL_CRPS', 'lst_updt_ts': '2016-11-01T00:00', 'processed': {'id': '601432001', 'std_ind': 'Y'}}]",601432001
4,"[{'lst_updt_src': 'OTL_CRPS_GNA', 'processed': {'typ': 'SSN', 'id': '408983148', 'std_ind': 'Y'}, 'issue_ctry_cd': 'AS', 'typ': 'SSN', 'id': '408983148', 'lst_updt_ts': '2019-05-04T18:36:00.016348', 'id_last4': '3148'}]",408983148
...,...,...
8141,"[{'lst_updt_src': 'OTL_CRPS', 'processed': {'typ': 'SSN', 'id': '411732061', 'std_ind': 'Y'}, 'issue_ctry_cd': 'US', 'typ': 'SSN', 'id': '411732061', 'lst_updt_ts': '2017-05-11T15:19:19.137096', 'id_last4': '2061'}]",Missing SSN
8142,"[{'id': '261571890', 'typ': 'SSN', 'id_last4': '1890', 'lst_updt_src': 'OTL_CRPS', 'lst_updt_ts': '2019-04-24T00:00', 'processed': {'id': '261571890', 'std_ind': 'Y'}}]",Missing SSN
8143,"[{'id': '466630137', 'typ': 'SSN', 'id_last4': '0137', 'lst_updt_src': 'OTL_CRPS', 'lst_updt_ts': '2016-01-29T00:00', 'processed': {'id': '466630137', 'std_ind': 'Y'}}]",Missing SSN
8144,"[{'id': '442783462', 'typ': 'SSN', 'id_last4': '3462', 'lst_updt_src': 'OTL_CRPS', 'lst_updt_ts': '2016-01-29T00:00', 'processed': {'id': '442783462', 'std_ind': 'Y'}}]",Missing SSN


In [38]:
dm_df['ssn_id'] = df['ssn_id'].astype(str)
dm_df.head()

Unnamed: 0,cust_id,acct_id,indv_prim_prcs_full_nm,gov_doc_id,bank_prim_acct_no,bank_scnd_acct_no,DateOfBirth,AddressLine,Lattitude,Longitude,CustomerPhone,Email_ID,ssn_id
0,600125815019,25190699,MEGAN GREEN,"[{""id"":""271980164"",""typ"":""SSN"",""id_last4"":""0164"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2020-01-21T00:00"",""processed"":{""id"":""271980164"",""std_ind"":""Y""}}]",,,1994-12-18,8413 E STREET,,,,,271980164
1,568709454017,975879,DAVID FLEENER,"[{""id"":""366061010"",""typ"":""SSN"",""id_last4"":""1010"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2013-11-16T00:00"",""processed"":{""id"":""366061010"",""std_ind"":""Y""}}]",,,1972-08-25,10190 N 600 E,,,8123713733.0,,366061010
2,360534588017,3727271315801,LAURA PALLAS,"[{""lst_updt_src"":""OTL_CRPS_GNA"",""processed"":{""id"":""606947815"",""std_ind"":""E""},""typ"":""SSN"",""id"":""606947815"",""lst_updt_ts"":""2014-03-21T13:58:40.946763"",""id_last4"":""7815""}]",,,1960-08-29,11559 DONA TERESA DR,,,,,606947815
3,196410570010,14884816,BERNADETTE REYNA,"[{""id"":""601432001"",""typ"":""SSN"",""id_last4"":""2001"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2016-11-01T00:00"",""processed"":{""id"":""601432001"",""std_ind"":""Y""}}]",,,1994-07-09,1010 S 1ST STREET,,,6234998313.0,,601432001
4,35442548013,3798238830600,NARINDER S SAWHNEY,"[{""lst_updt_src"":""OTL_CRPS_GNA"",""processed"":{""typ"":""SSN"",""id"":""408983148"",""std_ind"":""Y""},""issue_ctry_cd"":""AS"",""typ"":""SSN"",""id"":""408983148"",""lst_updt_ts"":""2019-05-04T18:36:00.016348"",""id_last4"":""3148""}]",,,1938-09-04,5601 CLOVERMEADE DR,,,6153731643.0,,408983148


In [39]:
dm_df.drop(['gov_doc_id'], axis=1, inplace=True)

In [40]:
dm_df.head()

Unnamed: 0,cust_id,acct_id,indv_prim_prcs_full_nm,bank_prim_acct_no,bank_scnd_acct_no,DateOfBirth,AddressLine,Lattitude,Longitude,CustomerPhone,Email_ID,ssn_id
0,600125815019,25190699,MEGAN GREEN,,,1994-12-18,8413 E STREET,,,,,271980164
1,568709454017,975879,DAVID FLEENER,,,1972-08-25,10190 N 600 E,,,8123713733.0,,366061010
2,360534588017,3727271315801,LAURA PALLAS,,,1960-08-29,11559 DONA TERESA DR,,,,,606947815
3,196410570010,14884816,BERNADETTE REYNA,,,1994-07-09,1010 S 1ST STREET,,,6234998313.0,,601432001
4,35442548013,3798238830600,NARINDER S SAWHNEY,,,1938-09-04,5601 CLOVERMEADE DR,,,6153731643.0,,408983148


In [41]:
tbl_results = getNullStats(dm_df)
tbl_results

Total Features(Columns) of dataset =  12
Null Features(Columns) of dataset =  9


Unnamed: 0,Nulls,Percent
bank_prim_acct_no,8146,100.0
bank_scnd_acct_no,8146,100.0
Lattitude,8146,100.0
Longitude,8146,100.0
Email_ID,4860,59.66
CustomerPhone,2575,31.61
AddressLine,230,2.82
DateOfBirth,29,0.36
indv_prim_prcs_full_nm,7,0.09


There are several columns with 100% nulls, which add no contribution to our findings. So let us discard all such columns.

In [42]:
threshold = 80

# Collect columns with 80 % null values
cols_insignificant = tbl_results[tbl_results['Percent'] >= threshold].index.to_list()

total_cols = len(dm_df.columns)
cnt_insignificant_cols = len(cols_insignificant)

print('There are', cnt_insignificant_cols, 'columns with null values out of ', total_cols)
print('We are left with ', total_cols - cnt_insignificant_cols, 'columns.\n')

# Dropping insignificant columns
dm_df.drop(cols_insignificant, axis=1, inplace=True)

print('We have removed following insignificant columns:')
for col in cols_insignificant:
    print(col)
    
print('\nWe are left with following columns:')
for col in dm_df.columns:
    print(col)

There are 4 columns with null values out of  12
We are left with  8 columns.

We have removed following insignificant columns:
bank_prim_acct_no
bank_scnd_acct_no
Lattitude
Longitude

We are left with following columns:
cust_id
acct_id
indv_prim_prcs_full_nm
DateOfBirth
AddressLine
CustomerPhone
Email_ID
ssn_id


In [43]:
dm_df.head()

Unnamed: 0,cust_id,acct_id,indv_prim_prcs_full_nm,DateOfBirth,AddressLine,CustomerPhone,Email_ID,ssn_id
0,600125815019,25190699,MEGAN GREEN,1994-12-18,8413 E STREET,,,271980164
1,568709454017,975879,DAVID FLEENER,1972-08-25,10190 N 600 E,8123713733.0,,366061010
2,360534588017,3727271315801,LAURA PALLAS,1960-08-29,11559 DONA TERESA DR,,,606947815
3,196410570010,14884816,BERNADETTE REYNA,1994-07-09,1010 S 1ST STREET,6234998313.0,,601432001
4,35442548013,3798238830600,NARINDER S SAWHNEY,1938-09-04,5601 CLOVERMEADE DR,6153731643.0,,408983148


In [44]:
dm_df.isnull().sum()

cust_id                   0   
acct_id                   0   
indv_prim_prcs_full_nm    7   
DateOfBirth               29  
AddressLine               230 
CustomerPhone             2575
Email_ID                  4860
ssn_id                    0   
dtype: int64

In [45]:
dm_df[dm_df.notnull()]

Unnamed: 0,cust_id,acct_id,indv_prim_prcs_full_nm,DateOfBirth,AddressLine,CustomerPhone,Email_ID,ssn_id
0,600125815019,25190699,MEGAN GREEN,1994-12-18,8413 E STREET,,,271980164
1,568709454017,975879,DAVID FLEENER,1972-08-25,10190 N 600 E,8123713733,,366061010
2,360534588017,3727271315801,LAURA PALLAS,1960-08-29,11559 DONA TERESA DR,,,606947815
3,196410570010,14884816,BERNADETTE REYNA,1994-07-09,1010 S 1ST STREET,6234998313,,601432001
4,35442548013,3798238830600,NARINDER S SAWHNEY,1938-09-04,5601 CLOVERMEADE DR,6153731643,,408983148
...,...,...,...,...,...,...,...,...
8141,855386850019,3712984222700,NINA APPAREDDY,1992-01-28,9219 ROYAL MOUNTAIN DR,4238555965,nina@ramtec.com,Missing SSN
8142,355336297011,23818556,SCOTT MALONE,1973-11-25,5320 FORT CAROLINE ROAD,,,Missing SSN
8143,634029714018,12536775,THOMAS MOORE,1983-08-08,6235 HIGHWAY 36,9798241205,,Missing SSN
8144,620277640017,12534539,AUSTIN CHASE,1971-02-19,41 PEABODY STREET,3109277361,,Missing SSN


In [46]:
dm_df.dropna(axis=0, inplace=True)

In [47]:
dm_df[dm_df.notnull()]

Unnamed: 0,cust_id,acct_id,indv_prim_prcs_full_nm,DateOfBirth,AddressLine,CustomerPhone,Email_ID,ssn_id
6,11351714014,3725517067300,ELIZABETH S HUTTEN,1964-09-30,30 TUCSON CIR,4156095922,bhutten7@gmail.com,353601592
8,269872999012,3725374672002,JIGAR R PATEL,1976-01-01,520 S EWING AVE,"[{""lst_updt_src"":""OTL_CRPS"",""processed"":{""ctry_cd"":""+1"",""full_nbr"":""+12143881014"",""nbr"":""2143881014"",""std_ind"":""Y""},""lst_updt_ts"":""2009-01-25T00:00"",""device_typ"":""LANDLINE"",""ctc_cd"":""V"",""nbr"":""2143881014""}]",jig143@yahoo.com,Blank Value
9,690307044012,1517919476,ROBERT P BROOKS,1942-04-02,28561 LA CUMBRE,9496062641,68brooks@gmail.com,060340881
12,19489139017,3782091487036,STEVE BAUERFEIND,1968-10-01,1923 N GRAMERCY PL,3104958286,swbauie@gmail.com,568921176
13,770974190017,3767869038700,TIMOTHY J HILGERT,1988-08-02,503 S HENRY ST,2296300267,timjhil@gmail.com,252674402
...,...,...,...,...,...,...,...,...
8134,528416283012,3792599838400,XIN FAN,1985-07-14,7455 BLYTHE PL,9519618529,fanxinsunny@gmail.com,Missing SSN
8136,538970600010,3796311016600,REBEKAH JANE CUNDARI,0001-01-01,257 CLINTON STREET 19C,2122963522,rebekah.cundari@morganstanley.com,Missing SSN
8137,736509012017,373794006545781,NEMIL VORA,1989-08-10,281 KIRK AVE,4088352179,nemil2k5@gmail.com,Missing SSN
8141,855386850019,3712984222700,NINA APPAREDDY,1992-01-28,9219 ROYAL MOUNTAIN DR,4238555965,nina@ramtec.com,Missing SSN


In [48]:
dm_df.isnull().sum()

cust_id                   0
acct_id                   0
indv_prim_prcs_full_nm    0
DateOfBirth               0
AddressLine               0
CustomerPhone             0
Email_ID                  0
ssn_id                    0
dtype: int64

In [49]:
len(dm_df)

2805

In [50]:
pd.set_option('display.max_rows', None)

In [51]:
cols_final = ['indv_prim_prcs_full_nm', 'DateOfBirth', 'acct_id', 'cust_id', 'ssn_id', 'AddressLine', 'CustomerPhone', 'Email_ID']
dm_df_sorted = dm_df[cols_final].sort_values(['indv_prim_prcs_full_nm', 'DateOfBirth', 'acct_id', 'cust_id'])
# dm_df_sorted

In [52]:
pd.set_option('display.max_rows', 10)

In [53]:
cntCustomers = len(dm_df_sorted['cust_id'].unique())
print('There are ', cntCustomers, 'customers in the current data set.')

There are  652 customers in the current data set.


<font color=red>**Study the issues in the existing data**</font>

1) Different customers having same name but 
- different customer ids
- different dates of birth, same year, same month

Question: 
- Can we safely treat as two different customers ?

Remarks:
- Should we weight features or consider equal probability of errors on all features and sum up to take further course of action

In [54]:
dm_df[dm_df['indv_prim_prcs_full_nm'] == 'alejandra p madrid'.upper()]

Unnamed: 0,cust_id,acct_id,indv_prim_prcs_full_nm,DateOfBirth,AddressLine,CustomerPhone,Email_ID,ssn_id
6935,52324565013,3717345006700,ALEJANDRA P MADRID,1975-04-19,13484 NW 13TH ST,9542966916,alespalding@gmail.com,Missing SSN
7013,402904311019,3739557393000,ALEJANDRA P MADRID,1975-04-01,11302 ROUNDELAY RD,9544415961,cargoconnectionc@bellsouth.net,Missing SSN


2) Same customer having 
- different parts of names
- different dates of birth

Remarks:
- We might have to take 
  - elementary form of name (last name, first name, middle name)
  - split date of birth to date, month, year
 
 We might end up having lots of features.

In [55]:
dm_df[dm_df['indv_prim_prcs_full_nm'].isin(['ABBY M PLOTKA', 'ABBY PLOTKA'])]

Unnamed: 0,cust_id,acct_id,indv_prim_prcs_full_nm,DateOfBirth,AddressLine,CustomerPhone,Email_ID,ssn_id
1431,27819937015,3713031691300,ABBY PLOTKA,1958-01-31,369 HERITAGE HLS,9146698189,amazing2@optonline.net,Missing SSN
1518,27819937015,3798011106300,ABBY PLOTKA,1958-01-31,369 HERITAGE HLS,9172821150,amazing2@optonline.net,Missing SSN
2389,27819937015,3720338258600,ABBY M PLOTKA,0001-01-01,369 HERITAGE HLS UNIT D,9146698189,amazing2@optonline.net,Missing SSN
3757,27819937015,3728224493701,ABBY PLOTKA,1958-01-31,369 HERITAGE HLS UNIT D,"[{""nbr"":""9146698189"",""ctc_cd"":""V"",""device_typ"":""LANDLINE"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2010-04-04T00:00"",""processed"":{""nbr"":""9146698189"",""ctry_cd"":""+1"",""full_nbr"":""+19146698189"",""std_ind"":""Y""}}]",amazing10@optonline.com,Missing SSN
7190,27819937015,3791416090600,ABBY M PLOTKA,1958-01-31,369 HERITAGE HLS,9172821150,amazing2@optonline.net,Missing SSN
7617,27819937015,3792795329700,ABBY PLOTKA,1958-01-31,369 HERITAGE HLS,9172821150,amazing2@optonline.net,Missing SSN


3) 

..

4)

..

<div class="alert alert-info" style="background-color:#006a79; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Clustering</h2>
</div>

In [56]:
nRows = len(dm_df_sorted)
nRows = 250

df_dm_partial = dm_df_sorted[:nRows]
df_dm_partial['acct_id'].astype(str)

df_dm_partial['customer'] = df_dm_partial['indv_prim_prcs_full_nm']
df_dm_partial.head()

Unnamed: 0,indv_prim_prcs_full_nm,DateOfBirth,acct_id,cust_id,ssn_id,AddressLine,CustomerPhone,Email_ID,customer
2389,ABBY M PLOTKA,0001-01-01,3720338258600,27819937015,Missing SSN,369 HERITAGE HLS UNIT D,9146698189,amazing2@optonline.net,ABBY M PLOTKA
7190,ABBY M PLOTKA,1958-01-31,3791416090600,27819937015,Missing SSN,369 HERITAGE HLS,9172821150,amazing2@optonline.net,ABBY M PLOTKA
1431,ABBY PLOTKA,1958-01-31,3713031691300,27819937015,Missing SSN,369 HERITAGE HLS,9146698189,amazing2@optonline.net,ABBY PLOTKA
3757,ABBY PLOTKA,1958-01-31,3728224493701,27819937015,Missing SSN,369 HERITAGE HLS UNIT D,"[{""nbr"":""9146698189"",""ctc_cd"":""V"",""device_typ"":""LANDLINE"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2010-04-04T00:00"",""processed"":{""nbr"":""9146698189"",""ctry_cd"":""+1"",""full_nbr"":""+19146698189"",""std_ind"":""Y""}}]",amazing10@optonline.com,ABBY PLOTKA
7617,ABBY PLOTKA,1958-01-31,3792795329700,27819937015,Missing SSN,369 HERITAGE HLS,9172821150,amazing2@optonline.net,ABBY PLOTKA


In [57]:
# Show accounts of each customer, graphically

# fig, ax = plt.subplots(figsize=(20, 30))
# x_tick_angle = 90
# plt.xticks(rotation=x_tick_angle)

# sns.scatterplot(data=df_dm_partial, x="indv_prim_prcs_full_nm", y=df_dm_partial['acct_id'].astype(str))

In [58]:
df_dm_partial['indv_prim_prcs_full_nm'] = df_dm_partial.loc[:, 'indv_prim_prcs_full_nm'].str.lower()
df_dm_partial['cust_id'] = df_dm_partial.loc[:, 'cust_id'].astype('str')

In [59]:
df_dm_partial[['indv_prim_prcs_full_nm', 'cust_id', 'acct_id', 'DateOfBirth']].head()

Unnamed: 0,indv_prim_prcs_full_nm,cust_id,acct_id,DateOfBirth
2389,abby m plotka,27819937015,3720338258600,0001-01-01
7190,abby m plotka,27819937015,3791416090600,1958-01-31
1431,abby plotka,27819937015,3713031691300,1958-01-31
3757,abby plotka,27819937015,3728224493701,1958-01-31
7617,abby plotka,27819937015,3792795329700,1958-01-31


<div class="alert alert-info" style="background-color:#006a79; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Vectorization</h2>
</div>

**Vectorizing <font color=red>Name</font>**

In [60]:
# Find maximum words in the column 'indv_prim_prcs_full_nm'
max_words_name = np.max([len(x.split()) for x in df_dm_partial['indv_prim_prcs_full_nm'].tolist()])

tfidf_Name_vector = TfidfVectorizer(max_df=0.999999999, 
                                   max_features=200000,
                                   min_df=0.000000001, 
                                   stop_words='english',
                                   use_idf=False, 
                                   tokenizer=getAllPermutations, 
                                   ngram_range=(1, max_words_name))

%time tfidf_name_matrix = tfidf_Name_vector.fit_transform(df_dm_partial['indv_prim_prcs_full_nm'])

print(tfidf_name_matrix.shape)

Wall time: 230 ms
(250, 870)


In [61]:
tfidf_name_matrix

<250x870 sparse matrix of type '<class 'numpy.float64'>'
	with 2616 stored elements in Compressed Sparse Row format>

In [62]:
features = tfidf_Name_vector.get_feature_names()
print(len(features))
# features

870


In [63]:
corpus = [n for n in df_dm_partial['indv_prim_prcs_full_nm']]
print(len(corpus))
# corpus

250


In [64]:
df_name = pd.DataFrame(tfidf_name_matrix.todense(), index=corpus, columns=features)
# df_name.head()

**Vectorizing <font color=red>Date of Birth</font>**

In [65]:
tfidf_Dob_vector = TfidfVectorizer(use_idf=False)

%time tfidf_Dob_matrix = tfidf_Dob_vector.fit_transform(df_dm_partial['DateOfBirth'])

print(tfidf_Dob_matrix.shape)
tfidf_Dob_matrix

Wall time: 4 ms
(250, 64)


<250x64 sparse matrix of type '<class 'numpy.float64'>'
	with 725 stored elements in Compressed Sparse Row format>

In [66]:
features = tfidf_Dob_vector.get_feature_names()
print(len(features))
# features

64


In [67]:
corpus = [n for n in df_dm_partial['DateOfBirth']]
print(len(corpus))
# corpus

250


In [68]:
df_Dob = pd.DataFrame(tfidf_Dob_matrix.todense(), index=corpus, columns=features)
# df_Dob.head()

**Vectorizing <font color=red>SSN</font>** <font color=red>(See the behavior by not considering it)</font>

**Vectorizing <font color=red>Customer Phone</font>**

In [69]:
tfidf_Phone_vector = TfidfVectorizer(use_idf=False)

%time tfidf_Phone_matrix = tfidf_Phone_vector.fit_transform(df_dm_partial['CustomerPhone'])

print(tfidf_Phone_matrix.shape)
tfidf_Phone_matrix

Wall time: 5 ms
(250, 143)


<250x143 sparse matrix of type '<class 'numpy.float64'>'
	with 405 stored elements in Compressed Sparse Row format>

In [70]:
features = tfidf_Phone_vector.get_feature_names()
print(len(features))
# features

143


In [71]:
corpus = [n for n in df_dm_partial['CustomerPhone']]
print(len(corpus))
# corpus

250


In [72]:
df_Phone = pd.DataFrame(tfidf_Phone_matrix.todense(), index=corpus, columns=features)
# df_Phone.head()

**Vectorizing <font color=red>Address</font>**

In [73]:
tfidf_Address_vector = TfidfVectorizer(use_idf=False)

%time tfidf_Address_matrix = tfidf_Address_vector.fit_transform(df_dm_partial['AddressLine'])

print(tfidf_Address_matrix.shape)
tfidf_Address_matrix

Wall time: 6 ms
(250, 218)


<250x218 sparse matrix of type '<class 'numpy.float64'>'
	with 836 stored elements in Compressed Sparse Row format>

In [74]:
features = tfidf_Address_vector.get_feature_names()
print(len(features))
# features

218


In [75]:
corpus = [n for n in df_dm_partial['AddressLine']]
print(len(corpus))
# corpus

250


In [76]:
df_Address = pd.DataFrame(tfidf_Address_matrix.todense(), index=corpus, columns=features)
# df_Address.head()

**Vectorizing <font color=red>Email ID</font>**

In [77]:
tfidf_Email_vector = TfidfVectorizer(use_idf=False)

%time tfidf_Email_matrix = tfidf_Email_vector.fit_transform(df_dm_partial['Email_ID'])

print(tfidf_Email_matrix.shape)
tfidf_Email_matrix

Wall time: 5 ms
(250, 141)


<250x141 sparse matrix of type '<class 'numpy.float64'>'
	with 773 stored elements in Compressed Sparse Row format>

In [78]:
features = tfidf_Email_vector.get_feature_names()
print(len(features))
# features

141


In [79]:
corpus = [n for n in df_dm_partial['Email_ID']]
print(len(corpus))
# corpus

250


In [80]:
df_Email = pd.DataFrame(tfidf_Email_matrix.todense(), index=corpus, columns=features)
# df_Email.head()

<div class="alert alert-info" style="background-color:#006a79; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Start Clustering (K-means)</h2>
</div>

**Clustering <font color=red>Customer Name</font>**

In [81]:
# Apply k-means clustering on name

num_clusters = len(df_dm_partial['cust_id'].unique())
km_name = kMeansClustering(tfidf_name_matrix, num_clusters)
clusters_name = km_name.labels_.tolist()

Wall time: 1.02 s


In [82]:
print(len(clusters_name))
# clusters_name

250


**Clustering <font color=red>Date of Birth</font>**

In [83]:
# Apply k-means clustering on Date of Birth

num_clusters = len(df_dm_partial['cust_id'].unique())
km_Dob = kMeansClustering(tfidf_Dob_matrix, num_clusters)
clusters_Dob = km_Dob.labels_.tolist()

Wall time: 811 ms


In [84]:
print(len(clusters_Dob))
# clusters_Dob

250


**Clustering <font color=red>SSN</font>**

<font color=red>(Excluded SSN as it is spoiling instead of helping)</font>

**Clustering <font color=red>Phone</font>**

In [85]:
# Apply k-means clustering on Customer Phone

num_clusters = len(df_dm_partial['cust_id'].unique())
km_Phone = kMeansClustering(tfidf_Phone_matrix, num_clusters)
clusters_Phone = km_Phone.labels_.tolist()

Wall time: 799 ms


In [86]:
print(len(clusters_Phone))
# clusters_Phone

250


**Clustering <font color=red>Address</font>**

In [87]:
# Apply k-means clustering on Address

num_clusters = len(df_dm_partial['cust_id'].unique())
km_Address = kMeansClustering(tfidf_Address_matrix, num_clusters)
clusters_Address = km_Address.labels_.tolist()

Wall time: 840 ms


In [88]:
print(len(clusters_Address))
# clusters_Addr

250


**Clustering <font color=red>Email ID</font>**

In [89]:
# Apply k-means clustering on Email ID

num_clusters = len(df_dm_partial['cust_id'].unique())
km_Email = kMeansClustering(tfidf_Email_matrix, num_clusters)
clusters_Email = km_Email.labels_.tolist()

Wall time: 816 ms


In [90]:
print(len(clusters_Email))
# clusters_Email

250


**Clustering <font color=red>Clusters</font>**

(All the independent features viz., Name, Date of birth, Phone, Address, Email ID refer to identify customer uniquely)

In [91]:
num_clusters = len(df_dm_partial['cust_id'].unique())
ndigits = len(str(num_clusters+1))
print('Digits in maximum clusters formed =>', ndigits)

Digits in maximum clusters formed => 2


In [92]:
df_dm_partial['cluster by name']    = list(map(lambda x: str(x).zfill(ndigits), clusters_name))
df_dm_partial['cluster by Dob']     = list(map(lambda x: str(x).zfill(ndigits), clusters_Dob))
#df_dm_partial['cluster by SSN']     = list(map(lambda x: str(x).zfill(ndigits), clusters_SSN))
df_dm_partial['cluster by Phone']   = list(map(lambda x: str(x).zfill(ndigits), clusters_Phone))
df_dm_partial['cluster by Address'] = list(map(lambda x: str(x).zfill(ndigits), clusters_Address))
df_dm_partial['cluster by Email']   = list(map(lambda x: str(x).zfill(ndigits), clusters_Email))

In [93]:
df_dm_partial['cluster by clusters'] = ('N' + df_dm_partial['cluster by name'].astype(str) 
                                + ' ' + 'D' + df_dm_partial['cluster by Dob'].astype(str)
#                                + ' ' + df_dm_partial['cluster by SSN'].astype(str)
                                + ' ' + 'P' + df_dm_partial['cluster by Phone'].astype(str)
                                + ' ' + 'A' + df_dm_partial['cluster by Address'].astype(str)
                                + ' ' + 'E' + df_dm_partial['cluster by Email'].astype(str))

In [94]:
df_dm_partial.head()

Unnamed: 0,indv_prim_prcs_full_nm,DateOfBirth,acct_id,cust_id,ssn_id,AddressLine,CustomerPhone,Email_ID,customer,cluster by name,cluster by Dob,cluster by Phone,cluster by Address,cluster by Email,cluster by clusters
2389,abby m plotka,0001-01-01,3720338258600,27819937015,Missing SSN,369 HERITAGE HLS UNIT D,9146698189,amazing2@optonline.net,ABBY M PLOTKA,14,25,47,6,8,N14 D25 P47 A06 E08
7190,abby m plotka,1958-01-31,3791416090600,27819937015,Missing SSN,369 HERITAGE HLS,9172821150,amazing2@optonline.net,ABBY M PLOTKA,14,27,25,6,8,N14 D27 P25 A06 E08
1431,abby plotka,1958-01-31,3713031691300,27819937015,Missing SSN,369 HERITAGE HLS,9146698189,amazing2@optonline.net,ABBY PLOTKA,14,27,47,6,8,N14 D27 P47 A06 E08
3757,abby plotka,1958-01-31,3728224493701,27819937015,Missing SSN,369 HERITAGE HLS UNIT D,"[{""nbr"":""9146698189"",""ctc_cd"":""V"",""device_typ"":""LANDLINE"",""lst_updt_src"":""OTL_CRPS"",""lst_updt_ts"":""2010-04-04T00:00"",""processed"":{""nbr"":""9146698189"",""ctry_cd"":""+1"",""full_nbr"":""+19146698189"",""std_ind"":""Y""}}]",amazing10@optonline.com,ABBY PLOTKA,14,27,14,6,58,N14 D27 P14 A06 E58
7617,abby plotka,1958-01-31,3792795329700,27819937015,Missing SSN,369 HERITAGE HLS,9172821150,amazing2@optonline.net,ABBY PLOTKA,14,27,25,6,8,N14 D27 P25 A06 E08


**Vectorizing <font color=red>Individual Clusters</font>**

In [95]:
tfidf_IndCluster_vector = TfidfVectorizer(max_df=0.999999999, 
                                           max_features=200000,
                                           min_df=0.000000001, 
                                           use_idf=True) 


%time tfidf_IndCluster_matrix = tfidf_IndCluster_vector.fit_transform(df_dm_partial['cluster by clusters'])

print(tfidf_IndCluster_matrix.shape)
tfidf_IndCluster_matrix

Wall time: 8 ms
(250, 335)


<250x335 sparse matrix of type '<class 'numpy.float64'>'
	with 1250 stored elements in Compressed Sparse Row format>

In [96]:
features = tfidf_IndCluster_vector.get_feature_names()
# print(len(features))
# features

In [97]:
corpus = [n for n in df_dm_partial['cluster by clusters']]
# print(len(corpus))
# corpus

In [98]:
df_IndCluster = pd.DataFrame(tfidf_IndCluster_matrix.todense(), index=corpus, columns=features)

**Clustering <font color=red> clusters</font>** 

(Clustering clusters formed by independent features)

In [99]:
# Apply k-means clustering on 'Cluster by clusters'

num_clusters = len(df_dm_partial['cust_id'].unique())
km_IndCluster = kMeansClustering(tfidf_IndCluster_matrix, num_clusters)
clusters_IndCluster = km_IndCluster.labels_.tolist()

Wall time: 774 ms


In [100]:
print(len(clusters_IndCluster))
# clusters_IndCluster

250


<div class="alert alert-info" style="background-color:#006a79; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Show Final results</h2>
</div>

In [101]:
# km_name.labels_

In [102]:
customers = df_dm_partial.loc[:, 'customer'].str.lower()[:nRows].tolist()
print(len(customers))
# customers

250


In [103]:
dobs = df_dm_partial.loc[:, 'DateOfBirth'].str[:].tolist()
print(len(dobs))
# dobs

250


In [104]:
cust_ids = df_dm_partial.loc[:, 'cust_id'].str[:].tolist()
print(len(cust_ids))
# cust_ids

250


In [105]:
# SSNs = df_dm_partial.loc[:, 'ssn_id'].str[:].tolist()
# print(len(SSNs))
# # SSNs

In [106]:
# Phones = df_dm_partial.loc[:, 'CustomerPhone'].str[:].tolist()
# print(len(Phones))
# # Phones

In [107]:
# Addresses = df_dm_partial.loc[:, 'AddressLine'].str[:].tolist()
# print(len(Addresses))
# # Addresses

In [108]:
# Emails = df_dm_partial.loc[:, 'Email_ID'].str[:].tolist()
# print(len(Emails))
# # Emails

In [109]:
import pandas as pd

amex_dict = {'customer': customers, 
             'birth date' : dobs, 
             'cust ID' : cust_ids, 
             'cluster by name' : clusters_name, 
             'cluster by Dob' : clusters_Dob, 
#             'cluster by SSN' : clusters_SSN, 
             'cluster by Phone' : clusters_Phone, 
             'cluster by Address' : clusters_Address, 
             'cluster by Email' : clusters_Email,
             'cluster by Clusters' : clusters_IndCluster}

amex_frame = pd.DataFrame(amex_dict, 
                          index=[clusters_name], 
                          columns = ['customer', 'birth date', 'cust ID', 'cluster by name', 
#                                     'cluster by Dob', 'cluster by SSN', 'cluster by Phone', 
                                     'cluster by Dob', 'cluster by Phone', 
                                     'cluster by Address', 'cluster by Email', 'cluster by Clusters'])

In [110]:
amex_frame.to_csv('Data/clusters_name_dob.csv', index=True)

In [111]:
pd.set_option('display.max_rows', None)
amex_frame.sort_values(by=['cluster by Clusters']);

**Present results in color for easy glance**

In [112]:
amx = amex_frame.sort_values(by=['cust ID'])

In [113]:
amx.reset_index(drop=True, inplace=True)
# amx

In [114]:
def highlight_alternate_rows(s, column):
    is_max = pd.Series(data=False, index=s.index)
    is_max[column] = s.loc[column] % 2
    return ['color: red' if is_max.any() else 'color: blue' for v in is_max]

In [134]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
amx.style.apply(highlight_alternate_rows, column='cluster by Clusters', axis=1)

Unnamed: 0,customer,birth date,cust ID,cluster by name,cluster by Dob,cluster by Phone,cluster by Address,cluster by Email,cluster by Clusters,cust_ID_Freq
0,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
1,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
2,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
3,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
4,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
5,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
6,alisa m mertens,1969-05-11,13737615019,22,35,0,33,16,33,4
7,alisa m mertens,1969-05-11,13737615019,22,35,24,33,16,33,4
8,alisa m mertens,1969-05-11,13737615019,22,35,24,33,16,33,4
9,alisa m mertens,1969-05-11,13737615019,22,35,24,33,16,33,4


In [116]:
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

**Trials**

In [117]:
amx.head(2)

Unnamed: 0,customer,birth date,cust ID,cluster by name,cluster by Dob,cluster by Phone,cluster by Address,cluster by Email,cluster by Clusters
0,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8
1,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8


In [118]:
amx['cust_ID_Freq'] = amx.groupby(['cluster by Clusters'])['cust ID'].transform('count')
print(amx.shape)
amx.head(2)

(250, 10)


Unnamed: 0,customer,birth date,cust ID,cluster by name,cluster by Dob,cluster by Phone,cluster by Address,cluster by Email,cluster by Clusters,cust_ID_Freq
0,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
1,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6


In [119]:
# More than 2 cust IDs
amx_filtered = amx[amx.cust_ID_Freq > 1]
print(amx_filtered.shape)
amx_filtered

(242, 10)


Unnamed: 0,customer,birth date,cust ID,cluster by name,cluster by Dob,cluster by Phone,cluster by Address,cluster by Email,cluster by Clusters,cust_ID_Freq
0,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
1,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
2,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
3,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
4,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
...,...,...,...,...,...,...,...,...,...,...
245,allan e groves,1936-05-26,9489405016,13,15,6,21,1,9,7
246,allan e groves,1936-05-26,9489405016,13,15,6,21,1,9,7
247,allan e groves,1936-05-26,9489405016,13,15,6,21,1,9,7
248,allan e groves,1936-05-26,9489405016,13,15,6,21,1,9,7


In [120]:
lstCusts = amx['cust ID'].unique().tolist()
lstCusts

['116219624012',
 '13737615019',
 '15232249019',
 '18971969011',
 '191283818016',
 '193446566014',
 '21632248014',
 '223749484014',
 '234382901013',
 '247887540010',
 '25104544016',
 '261478406016',
 '26489974019',
 '265156086015',
 '27819937015',
 '29058486010',
 '302655079011',
 '320636133017',
 '324559260019',
 '356931433010',
 '362924850012',
 '36710041004',
 '367971140018',
 '38925197016',
 '394037347012',
 '402904311019',
 '446469999015',
 '458545557012',
 '48185995018',
 '482928325011',
 '483207824408',
 '489166490011',
 '491328698019',
 '494758663010',
 '503960255019',
 '507873866016',
 '51613369015',
 '52324565013',
 '541684706016',
 '552323327011',
 '55349050016',
 '567508110403',
 '568791532016',
 '57437808011',
 '586941572018',
 '5893619019',
 '591736034018',
 '599330325018',
 '623076029015',
 '64665135015',
 '677273431012',
 '70450164019',
 '707496585017',
 '711011706017',
 '732842830015',
 '741000289012',
 '741145657016',
 '764014995015',
 '779688036015',
 '791511930010',

In [121]:
amx_final = pd.DataFrame(columns = ['customer', 'cust ID', 'cluster'])
amx_final

Unnamed: 0,customer,cust ID,cluster


In [122]:
# for cust in lstCusts:
#     row = amx_filtered[amx_filtered['cust ID'] == str(cust)][:1:]
#     print('Cluster =>', row['cluster by Clusters'].values[0])
#     print('\tCustomer =>', row['customer'].values[0])
#     print('\tCustomer ID =>', row['cust ID'].values[0])
    
#     amx_final.append({'customer': str(row['customer'].values[0]), 
#                       'cust ID': str(row['cust ID'].values[0]),
#                       'cluster by Clusters': str(row['cluster by Clusters'].values[0]) 
#                       }, ignore_index=True)
    


In [123]:
pd.set_option('display.max_rows', None)
grouped = amx.groupby(['cluster by Clusters'])
grp_one = grouped['customer', 'cust ID', 'cluster by Clusters'].head(2)
grp_one

Unnamed: 0,customer,cust ID,cluster by Clusters
0,ashley erekson,116219624012,8
1,ashley erekson,116219624012,8
6,alisa m mertens,13737615019,33
7,alisa m mertens,13737615019,33
10,brian trainor,15232249019,53
11,brian f trainor,15232249019,53
13,bruce d cohen,18971969011,15
14,bruce cohen,18971969011,15
18,alan hirsch,191283818016,28
19,alan hirsch,191283818016,28


In [124]:
pd.set_option('display.max_rows', 10)

**Get clusters having <font color=red>multiple cust_ids</font>**

In [125]:
grp = amex_frame.groupby(['cluster by Clusters', 'cust ID'])
dict_keys = grp.groups.keys()
tmpdf = pd.DataFrame(dict_keys, columns=['cluster by Clusters', 'cust_id'])
tmpdf

Unnamed: 0,cluster by Clusters,cust_id
0,0,503960255019
1,0,591736034018
2,0,732842830015
3,1,741145657016
4,2,38925197016
...,...,...
72,62,362924850012
73,63,193446566014
74,64,320636133017
75,65,402904311019


In [126]:
# Get clusters having 2 or more customers

results = tmpdf.groupby(['cluster by Clusters']).size()
results[results > 1]

cluster by Clusters
0     3
2     3
17    3
24    2
43    2
61    2
62    2
dtype: int64

In [127]:
print('customers having more than one customer ID assocations => ', len(results[results > 1]), '!')

customers having more than one customer ID assocations =>  7 !


In [128]:
# Show clusters having 2 or more customers with cust_id

lstCusts = results[results > 1].index
lstCusts
#tmpdf[tmpdf['cluster by Clusters'].isin(results[results > 1].index)]

Int64Index([0, 2, 17, 24, 43, 61, 62], dtype='int64', name='cluster by Clusters')

In [129]:
amx.head(2)

Unnamed: 0,customer,birth date,cust ID,cluster by name,cluster by Dob,cluster by Phone,cluster by Address,cluster by Email,cluster by Clusters,cust_ID_Freq
0,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
1,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6


In [130]:
amx_morethan2 = amx.iloc[np.where(amx['cluster by Clusters'].isin(lstCusts))][['cluster by Clusters', 'customer', 'birth date', 'cust ID']]
amx_morethan2

Unnamed: 0,cluster by Clusters,customer,birth date,cust ID
31,62,alena white,1985-10-17,234382901013
32,17,aleksandr litovskiy,1985-04-27,247887540010
79,62,brett johnson,0001-01-01,362924850012
84,43,belinda martin,1956-02-27,367971140018
87,2,brad schneider,0001-01-01,38925197016
...,...,...,...,...
169,2,alec hosn,0001-01-01,5893619019
172,0,art borja,1983-08-25,591736034018
206,0,art borja,1983-08-25,732842830015
207,61,ashley m campbell,1988-01-02,741000289012


In [131]:
amx_morethan2[amx_morethan2['cust ID'].isin (amx_morethan2['cust ID'].unique())]

Unnamed: 0,cluster by Clusters,customer,birth date,cust ID
31,62,alena white,1985-10-17,234382901013
32,17,aleksandr litovskiy,1985-04-27,247887540010
79,62,brett johnson,0001-01-01,362924850012
84,43,belinda martin,1956-02-27,367971140018
87,2,brad schneider,0001-01-01,38925197016
...,...,...,...,...
169,2,alec hosn,0001-01-01,5893619019
172,0,art borja,1983-08-25,591736034018
206,0,art borja,1983-08-25,732842830015
207,61,ashley m campbell,1988-01-02,741000289012


In [132]:
amx.head(2)

Unnamed: 0,customer,birth date,cust ID,cluster by name,cluster by Dob,cluster by Phone,cluster by Address,cluster by Email,cluster by Clusters,cust_ID_Freq
0,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
1,ashley erekson,1985-06-04,116219624012,6,29,15,17,14,8,6
