## Life Customer Segmentation
## Data used for analysis is based on Integral Life Admin System

### The SQL query in the cells below retrieves data using the database credentials provided. These credentials have been left empty for security reasons; please enter the relevant details before running the rest of this notebook.

In [246]:
# Import Packages
import os

import numpy as np
import pandas as pd
import pyodbc

# Raise pandas' display limits so wide and long frames are shown without truncation
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

## Import Data

In [101]:
# Connection settings for the database that holds the client data.
# Credentials must not be committed with the notebook, so they are read from
# environment variables (set them in the shell, or with %env, before running).
# Any variable that is not set is left as an empty string.
server = os.environ.get('LIFE_DB_SERVER', '')      # ip address and port of the database host
database = os.environ.get('LIFE_DB_NAME', '')
username = os.environ.get('LIFE_DB_USER', '')
password = os.environ.get('LIFE_DB_PASSWORD', '')

In [102]:
# Build the ODBC connection string piece by piece (replace the driver name with
# the one installed on this machine), then open the connection and a cursor.
connection_parts = [
    'DRIVER={SQL Server}',
    'SERVER=' + server,
    'DATABASE=' + database,
    'UID=' + username,
    'PWD=' + password,
]
cnxn = pyodbc.connect(';'.join(connection_parts))
cursor = cnxn.cursor()

In [103]:
# SQL text only; it is executed by the pd.read_sql call in a later cell.
# Selects six client attributes from vm1dta.clntpf (rows with validflag = '1'),
# builds the client name as rtrim(givname) + ' ' + rtrim(surname), aliases the
# columns to display names and orders the result by client number.
query = '''
select 
q1.clntnum as 'Client Number',
q1.clttype as 'Client Type' ,
q1.cltsex  as 'Client Sex' ,
q1.cltdob  as 'Client Date of Birth',
q1.marryd  as 'Client Marital Status',
q1.[client name] as 'Client Name'
from

(select clntnum, clttype, ( rtrim(givname) + ' ' + rtrim(surname) ) as 'client name', cltsex, cltdob, marryd from vm1dta.clntpf where validflag = '1')q1
order by q1.clntnum
'''

In [104]:
# Read the query result into a dataframe, then release the database resources.
# Closing only the cursor would leave the connection itself open, so both are
# closed here; the finally block runs even if the read fails.
try:
    df_source = pd.read_sql(query, cnxn)
finally:
    cursor.close()
    cnxn.close()

In [105]:
# Sanity check: (row count, column count) of the frame loaded from the database
df_source.shape

(91455, 6)

In [204]:
# Cast the join key 'Client Number' to string
client_number_text = df_source['Client Number'].astype(str)
df_source['Client Number'] = client_number_text

In [205]:
# Show the dtype of every column in the source frame
df_source.dtypes

Client Number            object
Client Type              object
Client Sex               object
Client Date of Birth      int64
Client Marital Status    object
Client Name              object
dtype: object

In [193]:
# Import Customer Life Time Value Data
# The path is a raw string: as a normal string literal its backslashes form
# invalid escape sequences ('\P', '\S', '\E'), which Python flags with a warning.
df_cltv = pd.read_csv(
    r'C:\ProgramData\Sisense\PrismServer\ElastiCubeData\EX_SOURCE_CLTVSCORE_LIFE.csv',
    usecols=['clntnum', 'cltvband', 'client_name'],
)

In [194]:
# Rename the CLTV columns to display names; 'Client Number' is the key used for the later merge
cltv_column_names = {"clntnum": "Client Number", "cltvband": "CLTV Band"}
df_cltv = df_cltv.rename(columns=cltv_column_names)

In [195]:
# Sanity check: (row count, column count) of the CLTV frame
df_cltv.shape

(694, 3)

In [196]:
# Show the dtype of every column in the CLTV frame
df_cltv.dtypes

Client Number     int64
CLTV Band        object
client_name      object
dtype: object

In [200]:
# Cast the join key to string and left-pad it with zeroes to 8 characters.
# The CSV key is parsed as an integer, so any leading zeroes are lost; the
# propensity data is padded the same way before its merge. Without the padding,
# short client numbers would silently drop out of the inner join.
# NOTE(review): assumes client numbers in the source table are 8 characters wide - confirm.
df_cltv['Client Number'] = (
    df_cltv['Client Number']
    .astype(str)
    .apply(lambda i: '{0:0>8}'.format(i))
)

In [201]:
# Inspect first few rows of the CLTV frame
df_cltv.head()

Unnamed: 0,Client Number,CLTV Band,client_name
0,50000834,medium value,Payer Tham PVM
1,50001181,high value,John James
2,50002643,high value,Gaurav Singh
3,50002679,low value,Govind Pal
4,50002754,low value,Mohit Kumar


In [207]:
# Inner-join the client data from the database with the CLTV bands on the client number

df_merge_01 = df_source.merge(df_cltv, how="inner", on="Client Number")

In [208]:
# Count missing values per column in the first merged frame

df_merge_01.isna().sum()

Client Number            0
Client Type              0
Client Sex               0
Client Date of Birth     0
Client Marital Status    3
Client Name              0
CLTV Band                0
client_name              0
dtype: int64

In [None]:
# Check size of the dataframe
# `shape` is an attribute holding a (rows, columns) tuple, not a method;
# calling it raises "TypeError: 'tuple' object is not callable".

df_merge_01.shape

In [215]:
# Inspect first few rows of the client + CLTV merge

df_merge_01.head()

Unnamed: 0,Client Number,Client Type,Client Sex,Client Date of Birth,Client Marital Status,Client Name,CLTV Band,client_name
0,50000811,P,F,19800101,M,Robin Williams,high value,RTA Ashish
1,50000834,P,M,19600101,S,Payer Tham PVM,medium value,Payer Tham PVM
2,50000847,P,M,19800101,M,BOSCO DON,high value,Drake willson
3,50000908,P,M,19850715,S,John Anderson,high value,NINJA RUL
4,50000913,P,M,19860126,S,John Smith,medium value,NINJA RUL


In [232]:
# Import Customer Propensity to Buy Data
# The path is a raw string: as a normal string literal its backslashes form
# invalid escape sequences ('\P', '\S', '\E'), which Python flags with a warning.

df_propensity = pd.read_csv(
    r'C:\ProgramData\Sisense\PrismServer\ElastiCubeData\EX_SOURCE_CUSTOMER_PROPENSITY_LIFE.csv',
    usecols=['CLIENT_NUMBER', 'buy'],
)

In [233]:
# Rename the propensity columns to display names; 'Client Number' is the merge key
propensity_column_names = {"CLIENT_NUMBER": "Client Number", "buy": "Propensity"}
df_propensity = df_propensity.rename(columns=propensity_column_names)

In [234]:
# Sanity check: (row count, column count) of the propensity frame
df_propensity.shape

(1114, 2)

In [235]:
# Show the dtype of every column in the propensity frame
df_propensity.dtypes

Client Number      int64
Propensity       float64
dtype: object

In [236]:
# Inspect first few rows of the propensity frame

df_propensity.head()

Unnamed: 0,Client Number,Propensity
0,1588,0.155495
1,1685,0.147082
2,1753,0.063281
3,1762,0.125803
4,1791,0.134278


In [238]:
# Cast the join key 'Client Number' to string
propensity_key_text = df_propensity['Client Number'].astype(str)
df_propensity['Client Number'] = propensity_key_text

In [239]:
# Left-pad every client number with zeroes to a width of 8 characters

pad_to_eight = '{0:0>8}'.format
df_propensity['Client Number'] = df_propensity['Client Number'].map(pad_to_eight)

In [240]:
# Inner-join the client + CLTV frame with the propensity scores on the client number

df_merge_02 = df_merge_01.merge(df_propensity, how="inner", on="Client Number")

In [241]:
# Count missing values per column in the second merged frame

df_merge_02.isna().sum()

Client Number            0
Client Type              0
Client Sex               0
Client Date of Birth     0
Client Marital Status    0
Client Name              0
CLTV Band                0
client_name              0
Propensity               0
dtype: int64

In [242]:
# Check size of the dataframe: (row count, column count) after the propensity merge

df_merge_02.shape

(355, 9)

In [250]:
# Inspect the first 5 rows
# NOTE(review): the saved output under this cell shows 10 rows and a 'Continent'
# column that no cell in this notebook creates, so it appears to come from an
# earlier version of the code - re-run to refresh it.

df_merge_02.head(5)

Unnamed: 0,Client Number,Client Type,Client Sex,Client Date of Birth,Client Marital Status,Client Name,CLTV Band,client_name,Propensity,Continent
0,50000847,P,M,19800101,M,BOSCO DON,high value,Drake willson,0.172292,Asia
1,50001036,P,M,19860101,M,Way Tily,low value,NINJA RUL,0.06322,Asia
2,50001209,P,M,19850715,S,John Anderson,high value,Andy F Bambang,0.066643,Europe
3,50001357,P,M,19880520,S,Pit Test,medium value,MOHIT SHARMA,0.070354,Asia
4,50001467,P,F,19900101,M,Manoj Sheetal,medium value,Ansh TEN,0.168367,Asia
5,50001495,P,M,19810529,Z,Parth Gevik,high value,Raj Mathur,0.123201,Asia
6,50001652,P,M,19810529,M,Personal Client Personal Client,medium value,John Smith,0.046857,Asia
7,50001745,P,M,19830404,M,Ishmeet Singh,high value,Drake willson,0.31954,Africa
8,50001817,P,M,19800405,S,Matt Dimitrov,low value,Steve Roger,0.096104,Europe
9,50002103,P,M,19970501,M,Nishant Saxena,low value,John RTA,0.031358,Europe


In [324]:
# Read csv which has Indian Population and City details
# The path is a raw string: as a normal string literal its backslashes form
# invalid escape sequences ('\P', '\S', '\E', '\i'), which Python flags with a warning.

india_geo = pd.read_csv(
    r'C:\ProgramData\Sisense\PrismServer\ElastiCubeData\india.csv',
    usecols=['City', 'Lat', 'Lng', 'Population'],
)

In [325]:
# Inspect first few rows of the city / population reference data

india_geo.head()

Unnamed: 0,City,Lat,Lng,Population
0,Mumbai,18.987807,72.836447,18978000.0
1,Delhi,28.651952,77.231495,15926000.0
2,Kolkata,22.562627,88.363044,14787000.0
3,Chennai,13.084622,80.248357,7163000.0
4,Bengalūru,12.977063,77.587106,6787000.0


In [326]:
# Collect the 'City' column into a plain Python list

cities = list(india_geo['City'])

In [337]:
# Assign every client a randomly chosen city (sampled with replacement).
# The draw uses a dedicated, seeded generator so that the city assignment - and
# the columns later derived from it - are identical on every Restart & Run All.

city_rng = np.random.RandomState(42)
df_merge_02['City'] = city_rng.choice(cities, len(df_merge_02))

In [338]:
# Inner-join the client frame with the city reference data on 'City' to get a new dataframe

df_merge_03 = df_merge_02.merge(india_geo, how="inner", on="City")

In [351]:
# Inspect first few rows
# NOTE(review): the saved output under this cell already contains 'Tier',
# 'Population Class', 'Customer Age' and 'Profession Category', which are only
# added by the cells further down - the cell was executed out of order.

df_merge_03.head()

Unnamed: 0,Client Number,Client Type,Client Sex,Client Date of Birth,Client Marital Status,Client Name,CLTV Band,client_name,Propensity,City,Lat,Lng,Population,Tier,Population Class,Customer Age,Profession Category
0,50000847,P,M,19800101,M,BOSCO DON,high value,Drake willson,0.172292,Bhāgalpur,25.244462,86.971832,361548.0,Tier-1,Urban,46,Blue Collar Job
1,50001036,P,M,19860101,M,Way Tily,low value,NINJA RUL,0.06322,Nāra,21.203096,79.089284,2454000.0,Tier-1,Metropoliton,58,Blue Collar Job
2,50030482,P,M,19800101,M,Ansh TEN,high value,Pal Inder,0.067246,Nāra,21.203096,79.089284,2454000.0,Tier-1,Metropoliton,57,Blue Collar Job
3,50052159,P,M,19850101,S,RTA Ashish,high value,Anthony Lim,0.091467,Nāra,21.203096,79.089284,2454000.0,Tier-1,Metropoliton,55,Other
4,50001209,P,M,19850715,S,John Anderson,high value,Andy F Bambang,0.066643,Gaya,24.796858,85.003852,423692.0,Tier-1,Urban,64,White Collar Job


In [344]:
# Add a new column 'Tier' based on population
# (based on https://en.wikipedia.org/wiki/Classification_of_Indian_cities).
# The bands are contiguous half-open ranges [lower, upper). Bounds such as
# "< 19999" followed by ">= 20000" leave single-value gaps, so a population of
# exactly 19999, 49999 or 99999 would match no band and fall to the default.
population = df_merge_03['Population']
conditions = [
    (population > 0) & (population < 5000),
    (population >= 5000) & (population < 10000),
    (population >= 10000) & (population < 20000),
    (population >= 20000) & (population < 50000),
    (population >= 50000) & (population < 100000),
    (population >= 100000)]
choices = ['Tier-6', 'Tier-5', 'Tier-4', 'Tier-3', 'Tier-2', 'Tier-1']

# Missing or non-positive populations match no band and receive the 'Tier-6' default.
df_merge_03['Tier'] = np.select(conditions, choices, default='Tier-6')

In [345]:
# Add a new column 'Population Class' based on population
# (based on https://en.wikipedia.org/wiki/Classification_of_Indian_cities).
# Bands are half-open ranges [lower, upper) on round boundaries: up to 9,999 is
# Rural, 10,000-99,999 Semi-Urban, 100,000-999,999 Urban, 1,000,000+ Metropolitan.
# Using 9999 / 99999 / 999999 as lower bounds puts those exact values one class too high.
population = df_merge_03['Population']
conditions = [
    (population > 0) & (population < 10000),
    (population >= 10000) & (population < 100000),
    (population >= 100000) & (population < 1000000),
    (population >= 1000000)]
choices = ['Rural', 'Semi-Urban', 'Urban', 'Metropolitan']

# Missing or non-positive populations match no band and receive the 'Rural' default.
df_merge_03['Population Class'] = np.select(conditions, choices, default='Rural')

In [348]:
# Add column for customer Age: a random integer from 18 to 69 (upper bound 70 is exclusive).
# A dedicated, seeded generator keeps the generated ages identical on every run.
# NOTE(review): these ages are synthetic and unrelated to 'Client Date of Birth' -
# confirm this is intended rather than deriving the age from the date of birth.
age_rng = np.random.RandomState(43)
df_merge_03['Customer Age'] = age_rng.randint(18, 70, size=len(df_merge_03))

In [350]:
# Add column for customer job profile: one of three categories drawn at random
# for every row. A dedicated, seeded generator keeps the draw identical on every run.
profession = ['White Collar Job', 'Blue Collar Job', 'Other']
profession_rng = np.random.RandomState(44)
df_merge_03['Profession Category'] = profession_rng.choice(profession, len(df_merge_03))

In [352]:
# Write the final segmentation frame to a csv file in the working directory,
# with a header row and without the dataframe index
output_path = 'EX_SOURCE_CUSTOMER_SEGMENTATION_LIFE.csv'
df_merge_03.to_csv(output_path, header=True, index=False)