## Life Customer Segmentation
## Data used for analysis is based on Integral Life Admin System

### The SQL query in the cells below will retrieve data using the database credentials provided. Right now these credentials have been left empty for security reasons. Please enter the relevant details before running this notebook further.

In [1]:
# Import Packages
import numpy as np

import pandas as pd
import pyodbc 

# Raise the pandas display limits so up to 1000 rows and 1000 columns are shown
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

## Import Data

In [2]:
# Connection settings are read from environment variables so no secret is stored in the notebook.
# LIFE_DB_SERVER: ip address and port number of the system where the database resides.
# LIFE_DB_NAME: database name. LIFE_DB_USER / LIFE_DB_PASSWORD: login used for the connection.
# Username and password default to empty strings and must be supplied before connecting.
import os

server   = os.environ.get('LIFE_DB_SERVER', '10.0.3.98')
database = os.environ.get('LIFE_DB_NAME', 'INT77DB_R212')
username = os.environ.get('LIFE_DB_USER', '')
password = os.environ.get('LIFE_DB_PASSWORD', '')

In [3]:
# Build the ODBC connection string piece by piece; change the DRIVER entry to the driver installed locally
connection_parts = [
    'DRIVER={SQL Server}',
    'SERVER=' + server,
    'DATABASE=' + database,
    'UID=' + username,
    'PWD=' + password,
]
cnxn = pyodbc.connect(';'.join(connection_parts))

# Cursor on the open connection
cursor = cnxn.cursor()

OperationalError: ('08001', '[08001] [Microsoft][ODBC SQL Server Driver][DBNETLIB]SQL Server does not exist or access denied. (17) (SQLDriverConnect); [08001] [Microsoft][ODBC SQL Server Driver][DBNETLIB]ConnectionOpen (Connect()). (53)')

In [None]:
# Define the client extract query (it is only executed later, when passed to pd.read_sql).
# The inner select reads vm1dta.clntpf rows with validflag = '1' and builds 'client name'
# from the right-trimmed given name and surname; the outer select renames the columns to
# display-friendly labels and orders the result by client number.
# NOTE(review): presumably validflag = '1' marks the current version of a client record - confirm.
query = '''
select 
q1.clntnum as 'Client Number',
q1.clttype as 'Client Type' ,
q1.cltsex  as 'Client Sex' ,
q1.cltdob  as 'Client Date of Birth',
q1.marryd  as 'Client Marital Status',
q1.[client name] as 'Client Name'
from

(select clntnum, clttype, ( rtrim(givname) + ' ' + rtrim(surname) ) as 'client name', cltsex, cltdob, marryd from vm1dta.clntpf where validflag = '1')q1
order by q1.clntnum
'''

In [None]:
# Run the query and load the result into a dataframe
try:
    df_source = pd.read_sql(query, cnxn)
finally:
    # Release the cursor and the connection even if the read fails, so no session stays open on the server.
    # Re-run the connection cell before executing this cell again.
    cursor.close()
    cnxn.close()

In [None]:
# Display the (row count, column count) of the client data returned by the query
df_source.shape

In [None]:
# Store Client Number as text so it can be used as a string merge key
df_source['Client Number'] = df_source['Client Number'].astype(str)

In [None]:
# Display the data type of every column in the client dataframe
df_source.dtypes

In [None]:
# Import Customer Life Time Value (CLTV) data, keeping only the client number, CLTV band and client name columns.
# The path is a raw string: in a normal literal the backslashes form invalid escape sequences (\P, \S, \E).
df_cltv = pd.read_csv(
    r'C:\ProgramData\Sisense\PrismServer\ElastiCubeData\EX_SOURCE_CLTVSCORE_LIFE.csv',
    usecols=['clntnum', 'cltvband', 'client_name'],
)

In [None]:
# Align CLTV column labels with the client dataframe ('client_name' is left unchanged)
cltv_column_labels = {"clntnum": "Client Number", "cltvband": "CLTV Band"}
df_cltv = df_cltv.rename(columns=cltv_column_labels)

In [None]:
# Display the (row count, column count) of the CLTV data
df_cltv.shape

In [None]:
# Display the data type of every CLTV column
df_cltv.dtypes

In [None]:
# Store Client Number as text so it can be used as a string merge key.
# NOTE(review): no zero-padding is applied here; if the CSV holds client numbers as integers,
# leading zeros are lost and those clients will not match in a string-keyed merge - confirm.
df_cltv['Client Number'] = df_cltv['Client Number'].astype(str)

In [None]:
# Preview the first five CLTV rows
df_cltv.head(n=5)

In [None]:
# Join client data with CLTV data on Client Number; the inner join keeps only clients present in both

df_merge_01 = df_source.merge(df_cltv, how="inner", on="Client Number")

In [None]:
# Count missing values per column in the first merged dataframe

df_merge_01.isna().sum()

In [None]:
# Check size of the dataframe: shape is a (rows, columns) tuple attribute, so it must not be called

df_merge_01.shape

In [None]:
# Preview the first five rows of the client + CLTV dataframe

df_merge_01.head(n=5)

In [None]:
# Import Customer Propensity to Buy data, keeping only the client number and the 'buy' column.
# The path is a raw string: in a normal literal the backslashes form invalid escape sequences (\P, \S, \E).
df_propensity = pd.read_csv(
    r'C:\ProgramData\Sisense\PrismServer\ElastiCubeData\EX_SOURCE_CUSTOMER_PROPENSITY_LIFE.csv',
    usecols=['CLIENT_NUMBER', 'buy'],
)

In [None]:
# Align propensity column labels with the other dataframes
propensity_column_labels = {"CLIENT_NUMBER": "Client Number", "buy": "Propensity"}
df_propensity = df_propensity.rename(columns=propensity_column_labels)

In [None]:
# Display the (row count, column count) of the propensity data
df_propensity.shape

In [None]:
# Display the data type of every propensity column
df_propensity.dtypes

In [None]:
# Preview the first five propensity rows

df_propensity.head(n=5)

In [None]:
# Store Client Number as text so it can be zero-padded and used as a string merge key
df_propensity['Client Number'] = df_propensity['Client Number'].astype(str)

In [None]:
# Left-pad every client number with zeroes to a width of 8 characters (longer values are left as-is)

pad_to_eight = '{0:0>8}'.format
df_propensity['Client Number'] = df_propensity['Client Number'].map(pad_to_eight)

In [None]:
# Join the client + CLTV dataframe with the propensity data on Client Number (inner join)

df_merge_02 = df_merge_01.merge(df_propensity, how="inner", on="Client Number")

In [None]:
# Count missing values per column in the second merged dataframe

df_merge_02.isna().sum()

In [None]:
# Display the (row count, column count) of the second merged dataframe

df_merge_02.shape

In [None]:
# Preview the first five rows of the client + CLTV + propensity dataframe

df_merge_02.head(n=5)

In [None]:
# Read the csv holding Indian city names, coordinates and population.
# The path is a raw string: in a normal literal the backslashes form invalid escape sequences (\P, \S, \E, \i).
india_geo = pd.read_csv(
    r'C:\ProgramData\Sisense\PrismServer\ElastiCubeData\india.csv',
    usecols=['City', 'Lat', 'Lng', 'Population'],
)

In [None]:
# Preview the first five city rows

india_geo.head(n=5)

In [None]:
# Collect the City column into a plain Python list

cities = india_geo['City'].to_list()

In [None]:
# Assign every client a city drawn at random (with replacement) from the city list

df_merge_02['City'] = np.random.choice(cities, size=len(df_merge_02))

In [None]:
# Attach latitude, longitude and population to each client by joining on City (inner join)
# NOTE(review): if a city name appears more than once in india_geo, matching client rows are duplicated - confirm.

df_merge_03 = df_merge_02.merge(india_geo, how="inner", on="City")

In [None]:
# Preview the first five rows of the dataframe enriched with city details

df_merge_03.head(n=5)

In [None]:
# Add a new column 'Tier' based on population (based on https://en.wikipedia.org/wiki/Classification_of_Indian_cities)
# Bands are contiguous half-open intervals [lower, upper), so boundary values such as
# 19999, 49999 and 99999 land in their own tier instead of falling through to the default.
conditions = [
    (df_merge_03['Population'] > 0) & (df_merge_03['Population'] < 5000),
    (df_merge_03['Population'] >= 5000) & (df_merge_03['Population'] < 10000),
    (df_merge_03['Population'] >= 10000) & (df_merge_03['Population'] < 20000),
    (df_merge_03['Population'] >= 20000) & (df_merge_03['Population'] < 50000),
    (df_merge_03['Population'] >= 50000) & (df_merge_03['Population'] < 100000),
    (df_merge_03['Population'] >= 100000)]
choices = ['Tier-6', 'Tier-5', 'Tier-4', 'Tier-3', 'Tier-2', 'Tier-1']

# Rows matching no band (missing or non-positive population) fall back to 'Tier-6'
df_merge_03['Tier'] = np.select(conditions, choices, default='Tier-6')

In [None]:
# Add a new column 'Population Class' based on population (based on https://en.wikipedia.org/wiki/Classification_of_Indian_cities)
# Band edges sit at 10000, 100000 and 1000000, so 9999 stays Rural, 99999 stays Semi-Urban and 999999 stays Urban.
conditions = [
    (df_merge_03['Population'] > 0) & (df_merge_03['Population'] < 10000),
    (df_merge_03['Population'] >= 10000) & (df_merge_03['Population'] < 100000),
    (df_merge_03['Population'] >= 100000) & (df_merge_03['Population'] < 1000000),
    (df_merge_03['Population'] >= 1000000)]
choices = ['Rural', 'Semi-Urban', 'Urban', 'Metropolitan']

# Rows matching no band (missing or non-positive population) fall back to 'Rural'
df_merge_03['Population Class'] = np.select(conditions, choices, default='Rural')

In [None]:
# Add a randomly generated customer age per row: integers from 18 up to 69 (the upper bound 70 is exclusive)
row_count = len(df_merge_03)
df_merge_03['Customer Age'] = np.random.randint(low=18, high=70, size=row_count)

In [None]:
# Add a job-profile column by drawing one of three categories at random for every row
profession = ['White Collar Job', 'Blue Collar Job', 'Other']
df_merge_03['Profession Category'] = np.random.choice(profession, size=len(df_merge_03))

In [None]:
# Write the final dataset to a csv in the working directory, with a header row and without the index column
output_csv = 'EX_SOURCE_CUSTOMER_SEGMENTATION_LIFE.csv'
df_merge_03.to_csv(output_csv, header=True, index=None)