# CUSTOMER ANALYTICS

- Master beginner and advanced customer analytics

- Learn the most important type of analysis applied by mid and large companies

- Gain access to a professional team of trainers with exceptional quant skills

- Wow interviewers by acquiring a highly desired skill

- Understand the fundamental marketing modeling theory: segmentation, targeting, positioning, marketing mix, and price elasticity;

- Apply segmentation on your customers, starting from raw data and reaching final customer segments;

- Perform K-means clustering with a customer analytics focus;

- Apply Principal Components Analysis (PCA) on your data to preprocess your features;

- Combine PCA and K-means for even more professional customer segmentation;

- Deploy your models on a different dataset;

- Learn how to model purchase incidence through probability of purchase elasticity;

- Model brand choice by exploring own-price and cross-price elasticity;

- Complete the purchasing cycle by predicting purchase quantity elasticity

- Carry out a black box deep learning model with TensorFlow 2.0 to predict purchasing behavior with unparalleled accuracy

- Be able to optimize your neural networks to enhance results

# Libs

In [1]:
import numpy as np
import pandas as pd
import scipy

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.preprocessing import StandardScaler

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

import pickle

# Dataset

In [2]:
df_segmentation = pd.read_csv('data.csv')

# Analysis

In [3]:
df_segmentation.head()

Unnamed: 0,ID,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
0,100000001,0,0,67,2,124670,1,2
1,100000002,1,1,22,1,150773,1,2
2,100000003,0,0,49,1,89210,0,0
3,100000004,0,0,45,1,171565,1,1
4,100000005,0,0,53,1,149031,1,1


In [4]:
df_segmentation.describe()

Unnamed: 0,ID,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,100001000.0,0.457,0.4965,35.909,1.038,120954.419,0.8105,0.739
std,577.4946,0.498272,0.500113,11.719402,0.59978,38108.824679,0.638587,0.812533
min,100000000.0,0.0,0.0,18.0,0.0,35832.0,0.0,0.0
25%,100000500.0,0.0,0.0,27.0,1.0,97663.25,0.0,0.0
50%,100001000.0,0.0,0.0,33.0,1.0,115548.5,1.0,1.0
75%,100001500.0,1.0,1.0,42.0,1.0,138072.25,1.0,1.0
max,100002000.0,1.0,1.0,76.0,3.0,309364.0,2.0,2.0


### Correlation

In [5]:
df_segmentation.corr()

Unnamed: 0,ID,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
ID,1.0,0.328262,0.074403,-0.085246,0.012543,-0.303217,-0.291958,-0.378445
Sex,0.328262,1.0,0.566511,-0.182885,0.244838,-0.195146,-0.202491,-0.300803
Marital status,0.074403,0.566511,1.0,-0.213178,0.374017,-0.073528,-0.02949,-0.097041
Age,-0.085246,-0.182885,-0.213178,1.0,0.654605,0.34061,0.108388,0.119751
Education,0.012543,0.244838,0.374017,0.654605,1.0,0.233459,0.064524,0.034732
Income,-0.303217,-0.195146,-0.073528,0.34061,0.233459,1.0,0.680357,0.490881
Occupation,-0.291958,-0.202491,-0.02949,0.108388,0.064524,0.680357,1.0,0.571795
Settlement size,-0.378445,-0.300803,-0.097041,0.119751,0.034732,0.490881,0.571795,1.0


In [6]:
plt.figure(figsize = (12, 9))
s = sns.heatmap(df_segmentation.corr(),
               annot = True, 
               cmap = 'RdBu',
               vmin = -1, 
               vmax = 1)
s.set_yticklabels(s.get_yticklabels(), rotation = 0, fontsize = 12)
s.set_xticklabels(s.get_xticklabels(), rotation = 90, fontsize = 12)
plt.title('Correlation Heatmap')
plt.show()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/daianeklein/miniconda3/envs/ds-projects/lib/python3.9/site-packages/PIL/ImageFile.py", line 504, in _save
    fh = fp.fileno()
AttributeError: '_idat' object has no attribute 'fileno'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/daianeklein/miniconda3/envs/ds-projects/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/wv/w46jcg9n1wg9gv31ph4v6nvr0000gn/T/ipykernel_967/2450017810.py", line 10, in <module>
    plt.show()
  File "/Users/daianeklein/miniconda3/envs/ds-projects/lib/python3.9/site-packages/matplotlib/pyplot.py", line 378, in show
    return _backend_mod.show(*args, **kwargs)
  File "/Users/daianeklein/miniconda3/envs/ds-projects/lib/python3.9/site-packages/matplotlib_inline/backend_inline.py", line 41, in show
    display(
  File "/Users/daianek

TypeError: object of type 'NoneType' has no len()

In [None]:
# raw data
plt.figure(figsize = (12, 9))
plt.scatter(df_segmentation['Age'], df_segmentation['Income'])
plt.xlabel('Age')
plt.ylabel('Income')
plt.title('Visualization of raw data')

## Standardization

In [None]:
scaler = StandardScaler()
segmentation_std = scaler.fit_transform(df_segmentation)

## Hierarquical Clustering

In [None]:
hier_clust = linkage(segmentation_std, method = 'ward')

In [None]:
plt.figure(figsize = (12,9))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Observations')
plt.ylabel('Distance')
dendrogram(hier_clust,
           truncate_mode = 'level',
           p = 5,
           show_leaf_counts = False,
           no_labels = True)
plt.show()

## K-means Clustering

In [None]:
wcss = []

for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(segmentation_std)
    wcss.append(kmeans.inertia_)

In [None]:
wcss

In [None]:
plt.figure(figsize = (10,8))
plt.plot(range(1, 11), wcss, marker = 'o', linestyle = '--')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('K-means Clustering')
plt.show()

In [None]:
kmeans = KMeans(n_clusters = 4, init = 'k-means++', random_state = 42)
kmeans.fit(segmentation_std)


### Results

In [None]:
kmeans.labels_

In [None]:
set(kmeans.labels_)

In [None]:
df_segm_kmeans = df_segmentation.copy()
df_segm_kmeans['Segment K-means'] = kmeans.labels_

In [None]:
df_segm_analysis = df_segm_kmeans.groupby(['Segment K-means']).mean()
df_segm_analysis

In [None]:
df_segm_analysis['N Obs'] = df_segm_kmeans[['Segment K-means','Sex']].groupby(['Segment K-means']).count()
df_segm_analysis['Prop Obs'] = df_segm_analysis['N Obs'] / df_segm_analysis['N Obs'].sum()

In [None]:
df_segm_analysis

In [None]:
# rename cluster
df_segm_analysis.rename({0:'well-off',
                         1:'fewer-opportunities',
                         2:'standard',
                         3:'career focused'})

In [None]:
df_segm_kmeans['Labels'] = df_segm_kmeans['Segment K-means'].map({0:'well-off', 
                                                                  1:'fewer opportunities',
                                                                  2:'standard', 
                                                                  3:'career focused'})

In [None]:
x_axis = df_segm_kmeans['Age']
y_axis = df_segm_kmeans['Income']
plt.figure(figsize = (10, 8))
sns.scatterplot(x = x_axis, y = y_axis, hue = df_segm_kmeans['Labels'], palette = ['g', 'r', 'c', 'm'])
plt.title('Segmentation K-means')
plt.show()

# PCA

In [None]:
pca = PCA()

In [None]:
pca.fit(segmentation_std)

In [None]:
pca.explained_variance_ratio_

In [None]:
plt.figure(figsize = (12,9))
plt.plot(pca.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--')
plt.title('Explained Variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance');

In [None]:
pca = PCA(n_components = 3)

In [None]:
pca.fit(segmentation_std)

## PCA RESULTS

In [None]:
pca.components_

In [None]:
df_pca_comp = pd.DataFrame(data = pca.components_,
                           columns = df_segmentation.columns.values,
                           index = ['Component 1', 'Component 2', 'Component 3'])
df_pca_comp

In [None]:
sns.heatmap(df_pca_comp,
            vmin = -1, 
            vmax = 1,
            cmap = 'RdBu',
            annot = True)
plt.yticks([0, 1, 2], 
           ['Component 1', 'Component 2', 'Component 3'],
           rotation = 45,
           fontsize = 9);

In [None]:
pca.transform(segmentation_std)

In [None]:
scores_pca = pca.transform(segmentation_std)

## K-Means clustering with PCA

In [None]:
wcss = []
for i in range(1,11):
    kmeans_pca = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans_pca.fit(scores_pca)
    wcss.append(kmeans_pca.inertia_)

In [None]:
plt.figure(figsize = (10,8))
plt.plot(range(1, 11), wcss, marker = 'o', linestyle = '--')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('K-means with PCA Clustering')
plt.show()

In [None]:
kmeans_pca = KMeans(n_clusters = 4, init = 'k-means++', random_state = 42)

In [None]:
kmeans_pca.fit(scores_pca)

## K-Means clustering with PCA

In [None]:
df_segm_pca_kmeans = pd.concat([df_segmentation.reset_index(drop = True), pd.DataFrame(scores_pca)], axis = 1)
df_segm_pca_kmeans.columns.values[-3: ] = ['Component 1', 'Component 2', 'Component 3']
df_segm_pca_kmeans['Segment K-means PCA'] = kmeans_pca.labels_

In [None]:
df_segm_pca_kmeans

In [None]:
df_segm_pca_kmeans_freq = df_segm_pca_kmeans.groupby(['Segment K-means PCA']).mean()
df_segm_pca_kmeans_freq

In [None]:
df_segm_pca_kmeans_freq['N Obs'] = df_segm_pca_kmeans[['Segment K-means PCA','Sex']].groupby(['Segment K-means PCA']).count()
df_segm_pca_kmeans_freq['Prop Obs'] = df_segm_pca_kmeans_freq['N Obs'] / df_segm_pca_kmeans_freq['N Obs'].sum()
df_segm_pca_kmeans_freq = df_segm_pca_kmeans_freq.rename({0:'standard', 
                                                          1:'career focused',
                                                          2:'fewer opportunities', 
                                                          3:'well-off'})
df_segm_pca_kmeans_freq

In [None]:
df_segm_pca_kmeans['Legend'] = df_segm_pca_kmeans['Segment K-means PCA'].map({0:'standard', 
                                                          1:'career focused',
                                                          2:'fewer opportunities', 
                                                          3:'well-off'})

In [None]:
x_axis = df_segm_pca_kmeans['Component 2']
y_axis = df_segm_pca_kmeans['Component 1']
plt.figure(figsize = (10, 8))
sns.scatterplot(x = x_axis, y= y_axis, hue = df_segm_pca_kmeans['Legend'], palette = ['g', 'r', 'c', 'm'])
plt.title('Clusters by PCA Components')
plt.show()

In [None]:
x_axis_1 = df_segm_pca_kmeans['Component 3']
y_axis_1 = df_segm_pca_kmeans['Component 1']
plt.figure(figsize = (12, 9))
sns.scatterplot(x = x_axis_1, y = y_axis_1, hue = df_segm_pca_kmeans['Legend'], palette = ['g', 'r', 'c', 'm'])
plt.title('Clusters by PCA Components' )
plt.show()

In [None]:
x_axis_1 = df_segm_pca_kmeans['Component 3']
y_axis_1 = df_segm_pca_kmeans['Component 2']
plt.figure(figsize = (12, 9))
sns.scatterplot(x = x_axis_1, y = y_axis_1, hue = df_segm_pca_kmeans['Legend'], palette = ['g', 'r', 'c', 'm'])
plt.title('Clusters by PCA Components' )
plt.show()

# DATA EXPORT

In [None]:
pickle.dump(scaler, open('scaler.pickle', 'wb'))

In [None]:
pickle.dump(pca, open('pca.pickle', 'wb'))

In [None]:
pickle.dump(kmeans_pca, open('kmeans_pca.pickle', 'wb'))