In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Content**

**Customer segmentation is one of those important aspects that a business has to carefully consider before formulating products or services for its customer base. Pitching the right message to the right customer at the right time has been the objective for all banks. Banks look at customer segmentation to gain insight into how to decide on specific offers, improve customer service, understand customer behaviour, and more. The success or failure of a marketing campaign depends on how customers are segmented. Based on the customer segmentation, banks target product recommendations such as saving plans, loans, wealth management, etc. at specific customer groups.**

**OBJECTIVE:** 
**To segment and analyze bank customers using the KMeans model, so as to understand the kind of clients a bank has; this can then be used to develop profitable products that generate more revenue for the bank.
This is a small sample dataset that summarizes the usage behavior of nearly 1000 active credit card holders during the last 6 months.
The unit for the income is thousands of dollars.**

In [None]:
# Importation of more libraries for formatting and visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# pandas and scikit-learn usage/deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

In [None]:
# Load the customer segmentation CSV from the Kaggle input directory into a DataFrame.
bank_data = pd.read_csv('/kaggle/input/customer-segmentation/customer_segmentation.csv')

**DATA CLEANING, FORMATTING AND INSPECTION**

In [None]:
# (number of rows, number of columns) of the freshly loaded data
bank_data.shape

In [None]:
# first five rows of the raw data
bank_data.head()

In [None]:
# Drop the 'Unnamed: 0' column: it is not informative and is very similar to the customer Id.
# The frame is re-assigned (no inplace=True) and errors='ignore' is used so that the cell is
# idempotent: re-running it after the column is already gone does not raise a KeyError.
bank_data = bank_data.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# first five rows again, to confirm the 'Unnamed: 0' column is gone
bank_data.head()

In [None]:
# Detailed check on the data: column dtypes and non-null counts per column
bank_data.info()

**From the snapshot of the data above, we can see that there are some missing values in the Defaulted column. We can also note that we only have numerical variables.**

In [None]:
# Brief statistical summary (count, mean, std, min, quartiles, max) of the numerical attributes
bank_data.describe()

**One of the noticeable columns is Income: there is some skewness in this column, which we shall need to address at some point. We can also see that the customers are mostly young to middle-aged, with an average age of about 35 years.**

In [None]:
# Fill the missing values in the Defaulted column with the column median.
# median() must be *called*: passing the bound method itself (as the previous version did)
# hands fillna a method object instead of a number, so the gaps are not filled with the median.
bank_data['Defaulted'] = bank_data['Defaulted'].fillna(bank_data['Defaulted'].median())

In [None]:
# True if any missing value is still left in the Defaulted column, False otherwise
bank_data['Defaulted'].isna().any()

**EXPLORATORY DATA ANALYSIS.**

**Our segmentation is going to focus mainly on some numerical variables of interest.
Let's visualize these numerical variables.**

In [None]:
# Numerical columns explored in the histograms below
num_features = [
    'Age',
    'Edu',
    'Years Employed',
    'Income',
    'Card Debt',
    'DebtIncomeRatio',
]

In [None]:
# Sub-frame holding only the numerical features selected in num_features
num_data = bank_data[num_features]

In [None]:
# Histograms (30 bins each) of the selected numerical features, drawn on one 12x12 figure
num_data.hist(
    figsize=(12,12),
    bins=30,
    color='blue',
)
plt.show()

**Looking at the histograms, we can see that some columns are skewed.**

In [None]:
# Histogram (30 bins) of the Other Debt variable on its own 6x4 figure
other_debt = bank_data['Other Debt']
other_debt.hist(figsize=(6,4), bins=30, color='blue')
plt.show()

**The plot above shows that the Other Debt variable is skewed to the right. We will remove the skewness from the variable before clustering.**

In [None]:
# Feature engineering:
# Total_Debt is the element-wise sum of the Card Debt and Other Debt columns.
card_debt = bank_data['Card Debt']
bank_data['Total_Debt'] = card_debt.add(bank_data['Other Debt'])

In [None]:
# Snapshot of the data, now including the new Total_Debt column
bank_data.head()

In [None]:
# Skewness of every numeric column. numeric_only is passed explicitly so the result does not
# depend on the pandas version's default handling of non-numeric columns.
bank_data.skew(numeric_only=True)

**From the output above, we can see that some columns are more skewed than others. We are going to reduce the skewness in the Total Debt, Income and Age columns, since they are the features that we are going to use for clustering.**

**Our goal is to segment our customers into meaningful clusters.
We are going to make use of the variables Age, Income and Total Debt.
We shall use the KMeans clustering model to perform the segmentation.
For better performance of the model, we shall need to scale and standardize the data in the columns of interest.**

In [None]:
# Natural-log transform of the three clustering features; each result is stored
# in a new '<column>_log' column next to the original.
# NOTE(review): np.log yields -inf for a value of 0 - confirm none of these columns contains zeros.
for source_col in ('Total_Debt', 'Income', 'Age'):
    bank_data[f'{source_col}_log'] = np.log(bank_data[source_col])

In [None]:
# Snapshot (first two rows) of the data including the new *_log columns
bank_data.head(2)

**CLUSTER GENERATION**

In [None]:
# Log-transformed columns used as clustering input
cluster_cols = ['Age_log','Total_Debt_log','Income_log']
# Take an explicit copy: cluster_data gets extra label columns later on, and writing
# to a plain column slice of bank_data would be chained assignment (SettingWithCopyWarning).
cluster_data = bank_data[cluster_cols].copy()

In [None]:
# Standardizing our data to create a mean-centered version of the clustering columns
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
# The fitted scaler is kept: the cluster-analysis cell calls scaler.inverse_transform
# to map the cluster centres back out of the scaled space.
cluster_scaled = scaler.fit_transform(cluster_data)

In [None]:
#Cluster generation
# For K = 3 and K = 4: fit KMeans on the scaled data, record centres / labels / mean
# silhouette score, and draw one silhouette plot per K.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

X = cluster_scaled

# K -> {'cluster_center', 'silhouette_score', 'labels'}; read again by the cluster-analysis cells.
cluster_centers = dict()

for n_clusters_K in range(3,5):
    fig, ax1 = plt.subplots(figsize=(8,5))

    # x-axis window for the silhouette coefficients
    ax1.set_xlim([-0.1, 1])
    # room for every sample plus a 10-row gap around each cluster band
    ax1.set_ylim([0, len(X) + (n_clusters_K + 1) * 10])

    # n_init is pinned explicitly: its default differs between scikit-learn releases,
    # which would otherwise make the clustering depend on the installed version.
    cluster_obj = KMeans(n_clusters=n_clusters_K, n_init=10, random_state=10)
    cluster_labels = cluster_obj.fit_predict(X)

    silhouette_avg = silhouette_score(X, cluster_labels)
    cluster_centers[n_clusters_K] = {
        'cluster_center': cluster_obj.cluster_centers_,
        'silhouette_score': silhouette_avg,
        'labels': cluster_labels,
    }

    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    y_lower = 10
    for i in range(n_clusters_K):
        # silhouette values of the samples in cluster i, in ascending order
        ith_cluster_silhouette_values = np.sort(sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters_K)

        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # label the band with its cluster id, vertically centred on the band
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10  # 10-row gap before the next cluster's band

    # The title names K so the two figures can be told apart.
    ax1.set_title(f"The silhouette plot for K = {n_clusters_K} clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # dashed red line: mean silhouette score over all samples for this K
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.show()
        

In [None]:
# Recomputing cluster labels and the mean silhouette score for K = 3 .. 14.
# NOTE(review): this sweep uses random_state=1 while the silhouette-plot cell uses
# random_state=10, so the scores for K = 3 and 4 may differ slightly between the two.
Values_of_K = range(3,15)
results =[]
for k in Values_of_K:
    # n_init is pinned explicitly: its default differs between scikit-learn releases.
    cluster_obj2 = KMeans(n_clusters=k, n_init=10, random_state=1)
    cluster_labels2 = cluster_obj2.fit_predict(X)
    silhouette_avg2 = silhouette_score(X, cluster_labels2)
    results.append([k,silhouette_avg2])

# one row per K, ready for display and plotting
result = pd.DataFrame(results, columns=["n_clusters_K", "silhouette_score"])

In [None]:
# silhouette score for the first ten values of K in the sweep
result.head(10)

In [None]:
# Line plot of the mean silhouette score against the number of clusters K
plt.figure(figsize = (10,8))
plt.plot(result.n_clusters_K,result.silhouette_score,marker = 'o')
plt.title('Plot that shows the variation of silhouette_score with various values of k')
plt.xlabel('Values of Clusters K')
plt.ylabel('Values of silhouette_score')
# Render explicitly, like the other plotting cells, so the cell output is the figure
# rather than the Text repr returned by plt.ylabel.
plt.show()

**From the silhouette plots above, we can see that the 3-cluster solution has a higher silhouette_score than the 4-cluster solution.
Secondly, the 3-cluster solution generally does not have overlaps between its clusters, unlike the 4-cluster solution, which has overlaps between its segments. Therefore we will consider the 3-cluster solution.**

**CLUSTER ANALYSIS**

In [None]:
# The centres are printed after undoing both the scaling and the log transform, so the
# columns are labelled with the original feature names (without the '_log' suffix)
# instead of the names of the log-transformed columns.
original_cols = [col[:-len('_log')] if col.endswith('_log') else col for col in cluster_cols]

for i in range(3,5):
    print(f"for a {i} numbered  cluster")
    # cluster centres live in the standardized log space: undo the scaling first ...
    original_data = scaler.inverse_transform(cluster_centers[i]['cluster_center'])
    # ... then undo the natural log with exp
    print(pd.DataFrame(np.exp(original_data),columns = original_cols))
    print("Silhouette score for cluster {} is {:.3f}".format(i,cluster_centers[i]['silhouette_score']))
    print()

**When we look at the results of the clustering process, we can infer some insights**

**Let's consider the 3-cluster solution.**

**We can see 3 clusters with a noticeable difference in age group between clusters 0 and 2, and only a slight difference in age group between clusters 0 and 1.**

**Generally older people tend to have a higher total debt probably because of high income that they have and the reverse is true with the younger population**

**We can also see the same trend being replicated in 4 clustered configuration.**

In [None]:
# Attach the 3-cluster and 4-cluster labels to the clustering frame.
# assign() builds a new frame instead of writing into cluster_data in place, which avoids
# chained assignment on a frame that was sliced out of bank_data.
for k_value in (3, 4):
    labels = cluster_centers[k_value]['labels']
    cluster_data = cluster_data.assign(**{f'{k_value}cluster_labels': labels})

In [None]:
# first five rows of the clustering frame with the two label columns attached
cluster_data.head()

In [None]:
import plotly as py
import plotly.io as pio
pio.renderers.default='notebook'
import plotly.graph_objs as go
py.offline.init_notebook_mode()

# Box plot of Age_log per cluster for the 3-cluster solution.
x_data = ['Cluster 1','Cluster 2','Cluster 3']  # display names; the stored label ids are 0, 1, 2
cutoff_quantile = 70 # Only values strictly below this percentile of each cluster are plotted, to keep extreme outliers from adding noise
field_to_plot = 'Age_log'
label_col = '3cluster_labels'

# One array of plotted values per cluster, in label-id order.
y_data = []
for cluster_id in range(len(x_data)):
    cluster_values = cluster_data.loc[cluster_data[label_col] == cluster_id, field_to_plot].values
    y_data.append(cluster_values[cluster_values < np.percentile(cluster_values, cutoff_quantile)])

colors = ['red','blue','green']

traces = [
    go.Box(
        y=yd,
        name=xd,
        boxpoints=False,
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(size=2),
        line=dict(width=1),
    )
    for xd, yd, cls in zip(x_data, y_data, colors)
]

# No fixed dtick on the y-axis: the plotted values are natural logs, so a tick step of 50
# is far larger than the data range; Plotly is left to choose the tick spacing.
layout = go.Layout(
    title=(f'Difference in {field_to_plot} from cluster to cluster'),
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        gridcolor='black',
        gridwidth=0.1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='white',
    plot_bgcolor='white',
    showlegend=False
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)

**As seen previously, clusters 2 and 3 have the highest average age, and hence contain the clients earning the most income as well as having the most total debt.
The differences among the other clusters are minimal, though they follow the same trend.**

In [None]:
import plotly as py
import plotly.io as pio
pio.renderers.default='notebook'
import plotly.graph_objs as go
py.offline.init_notebook_mode()

# Box plot of Total_Debt_log per cluster for the 3-cluster solution.
x_data = ['Cluster 1','Cluster 2','Cluster 3']  # display names; the stored label ids are 0, 1, 2
cutoff_quantile = 70 # Only values strictly below this percentile of each cluster are plotted, to keep extreme outliers from adding noise
field_to_plot = 'Total_Debt_log'
label_col = '3cluster_labels'

# One array of plotted values per cluster, in label-id order.
y_data = []
for cluster_id in range(len(x_data)):
    cluster_values = cluster_data.loc[cluster_data[label_col] == cluster_id, field_to_plot].values
    y_data.append(cluster_values[cluster_values < np.percentile(cluster_values, cutoff_quantile)])

colors = ['red','blue','green']

traces = [
    go.Box(
        y=yd,
        name=xd,
        boxpoints=False,
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(size=2),
        line=dict(width=1),
    )
    for xd, yd, cls in zip(x_data, y_data, colors)
]

# No fixed dtick on the y-axis: the plotted values are natural logs, so a tick step of 50
# is far larger than the data range; Plotly is left to choose the tick spacing.
layout = go.Layout(
    title=(f'Difference in {field_to_plot} from cluster to cluster'),
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        gridcolor='black',
        gridwidth=0.1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='white',
    plot_bgcolor='white',
    showlegend=False
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)

In [None]:
import plotly as py
import plotly.io as pio
pio.renderers.default='notebook'
import plotly.graph_objs as go
py.offline.init_notebook_mode()

# Box plot of Age_log per cluster for the 4-cluster solution.
x_data = ['Cluster 1','Cluster 2','Cluster 3','Cluster 4']  # display names; the stored label ids are 0, 1, 2, 3
cutoff_quantile = 70 # Only values strictly below this percentile of each cluster are plotted, to keep extreme outliers from adding noise
field_to_plot = 'Age_log'
label_col = '4cluster_labels'

# One array of plotted values per cluster, in label-id order.
y_data = []
for cluster_id in range(len(x_data)):
    cluster_values = cluster_data.loc[cluster_data[label_col] == cluster_id, field_to_plot].values
    y_data.append(cluster_values[cluster_values < np.percentile(cluster_values, cutoff_quantile)])

colors = ['red','blue','green','cyan']

traces = [
    go.Box(
        y=yd,
        name=xd,
        boxpoints=False,
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(size=2),
        line=dict(width=1),
    )
    for xd, yd, cls in zip(x_data, y_data, colors)
]

# No fixed dtick on the y-axis: the plotted values are natural logs, so a tick step of 50
# is far larger than the data range; Plotly is left to choose the tick spacing.
layout = go.Layout(
    title=(f'Difference in {field_to_plot} from cluster to cluster'),
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        gridcolor='black',
        gridwidth=0.1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='white',
    plot_bgcolor='white',
    showlegend=False
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)


**As seen previously, clusters 1 and 4 have the highest average age, and hence contain the clients earning the most income as well as having the most total debt.
The differences among the other clusters are minimal, though they follow the same trend.**

In [None]:
import plotly as py
import plotly.io as pio
pio.renderers.default='notebook'
import plotly.graph_objs as go
py.offline.init_notebook_mode()

# Box plot of Total_Debt_log per cluster for the 4-cluster solution.
x_data = ['Cluster 1','Cluster 2','Cluster 3','Cluster 4']  # display names; the stored label ids are 0, 1, 2, 3
cutoff_quantile = 70 # Only values strictly below this percentile of each cluster are plotted, to keep extreme outliers from adding noise
field_to_plot = 'Total_Debt_log'
label_col = '4cluster_labels'

# One array of plotted values per cluster, in label-id order.
y_data = []
for cluster_id in range(len(x_data)):
    cluster_values = cluster_data.loc[cluster_data[label_col] == cluster_id, field_to_plot].values
    y_data.append(cluster_values[cluster_values < np.percentile(cluster_values, cutoff_quantile)])

colors = ['red','blue','green','cyan']

traces = [
    go.Box(
        y=yd,
        name=xd,
        boxpoints=False,
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(size=2),
        line=dict(width=1),
    )
    for xd, yd, cls in zip(x_data, y_data, colors)
]

# No fixed dtick on the y-axis: the plotted values are natural logs, so a tick step of 50
# is far larger than the data range; Plotly is left to choose the tick spacing.
layout = go.Layout(
    title=(f'Difference in {field_to_plot} from cluster to cluster'),
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        gridcolor='black',
        gridwidth=0.1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='white',
    plot_bgcolor='white',
    showlegend=False
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)


**As seen previously, clusters 1 and 3 have the highest median Total Debt, which likely corresponds to the older clients with a higher income range (in thousands of dollars). The median Total Debt of clusters 2 and 4 is similar, so these two could be merged into one cluster.**

All feedback is welcome. You can also upvote this notebook. Cheers.