In [52]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [53]:
pip install scikit-learn-extra

In [54]:
pip install pyclustering


# Importing necessary libraries

In [55]:
from sklearn.cluster import Birch
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import KernelPCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns


# Data preprocessing
The objective is to first get a broad understanding of the data in front of us with regard to its statistical properties and features.

In [56]:
# Read the tab-delimited marketing campaign file and preview the first rows.
df = pd.read_csv(
    '/kaggle/input/customer-personality-analysis/marketing_campaign.csv',
    delimiter="\t",
)
df.head()

In [57]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [58]:
# Display summary statistics of the frame's columns.
df.describe()

From the dataset description, it is evident that the following two columns do not carry any significant information for further analysis, and therefore they can be dropped.

In [59]:
# Remove the two columns judged uninformative, then show what is left.
df.drop(columns=['Z_CostContact', 'Z_Revenue'], inplace=True)
df.columns

# Handling null values
From the above information, it is clear that only the income column has some null values, and since their number is not very high, we can replace them by the median income value

In [60]:
# Impute missing incomes with the column median, then show the total number
# of nulls remaining in the frame.
income_median = df['Income'].median()
df['Income'] = df['Income'].fillna(income_median)
df.isnull().sum().sum()

# Feature engineering
* A column for age is prepared using the date of birth of each customer
* The food expenditure is combined for meat and fish products to form a non-veg column and the remaining are combined as a different column
* The marital status column is encoded to numerical value
* The number of kids and teens are combined to form the total number of children for a family

In [61]:
# Age is measured against the fixed reference year 2022.
df['Age'] = 2022 - df['Year_Birth']

# Collapse the product spend columns into two aggregates:
# meat + fish, and wine + sweets + fruit.
df['Non_Veg_Amt'] = df['MntMeatProducts'] + df['MntFishProducts']
df['Others_Amt'] = df['MntWines'] + df['MntSweetProducts'] + df['MntFruits']

# The source columns are no longer needed once the aggregates exist.
df.drop(
    columns=[
        'MntMeatProducts', 'MntFishProducts', 'MntWines',
        'MntSweetProducts', 'MntFruits', 'Year_Birth',
    ],
    inplace=True,
)

In [62]:
# Frequency of each distinct value in the Education column.
df['Education'].value_counts()

In [63]:
# Frequency of each distinct value in the Marital_Status column.
df['Marital_Status'].value_counts() 

In [64]:
# Number of adults implied by each marital status label.
adults_by_status = {
    'Married': 2, 'Together': 2, 'Single': 1, 'Divorced': 1,
    'Widow': 1, 'Alone': 1, 'Absurd': 1, 'YOLO': 1,
}

# Children = kids + teens; family size = children + adults in the household.
df['Children'] = df['Kidhome'] + df['Teenhome']
df['Family_Size'] = df['Children'] + df['Marital_Status'].replace(adults_by_status)
df.drop(columns=['Kidhome', 'Teenhome'], inplace=True)


In [65]:
# Encode marital status numerically: 2 for couples, 1 for everyone else.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `df['Marital_Status']`: that form mutates an
# intermediate object and does not update `df` under pandas Copy-on-Write.
df['Marital_Status'] = df['Marital_Status'].replace({
    'Married': 2, 'Together': 2, 'Single': 1, 'Divorced': 1,
    'Widow': 1, 'Alone': 1, 'Absurd': 1, 'YOLO': 1,
})
df['Marital_Status'].value_counts()

# Exploratory data analysis
The objective here is to gain a holistic understanding of the customer base in the market through specific queries. Although the dataset has several features, certain features, such as complaints and acceptance of an ad campaign, are of particular significance for this customer analysis study.

# How many people accepted the offer in each campaign?
Through this question, we can understand how successful each of the ad campaigns has been with regard to customer acceptance.

In [66]:
camp_lst = ['AcceptedCmp1', 'AcceptedCmp2','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5','Response']

# For each campaign column, count the rows whose flag equals 1.
val_lst = []
for elm in camp_lst:
    cond = df[elm] == 1
    accepted = df.loc[cond, elm]
    val_lst.append(accepted.count())

# Bar chart: one bar per campaign column.
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(camp_lst, val_lst, width=0.4, color='lightgreen')
ax.grid(zorder=0, axis="y")
ax.set_xlabel("Campaign rounds", fontsize=18, labelpad=15)
plt.xticks(rotation=90)
ax.set_ylabel("No. of customers", fontsize=18, labelpad=15)
ax.set_title("No. of customers accepting the campaign")

plt.show()

# How many single and married customers accepted the offer in each campaign?

In [67]:
# Count, per campaign column, how many accepting customers have
# Marital_Status 1 (plotted as "Single") versus 2 (plotted as "Married").
# The unused cond1/cond2/cond3 masks and the discarded `.shape[0]` expression
# that used to precede this code were dead and have been removed.
sing_lst = []
marr_lst = []
c1 = df['Marital_Status'] == 1
c2 = df['Marital_Status'] == 2
for elm in camp_lst:
    c3 = df[elm] == 1
    sing_lst.append(int((c1 & c3).sum()))
    marr_lst.append(int((c2 & c3).sum()))

# Grouped bars: the two groups sit side by side around each campaign tick.
X_axis = np.arange(len(camp_lst))
fig = plt.figure(figsize=(10, 10))
plt.bar(X_axis - 0.2, sing_lst, 0.4, color='lightblue', label='Single')
plt.bar(X_axis + 0.2, marr_lst, 0.4, label='Married')
plt.grid(zorder=0, axis="y")
plt.xticks(X_axis, camp_lst)
# The x axis enumerates campaigns (the customer groups are in the legend),
# so it is labelled "Campaign" rather than "Customer group".
plt.xlabel("Campaign", fontsize=18, labelpad=15)
plt.ylabel("Number of customers", fontsize=18, labelpad=15)
plt.title("Number of customers accepting offer in each campaign", fontsize=18)
plt.xticks(rotation=90)
plt.legend()
plt.show()
    
    

# How many customers who have complained at least once in the last two years have accepted the campaign offer?

In [68]:
# Frequency of each distinct value in the Complain column.
df['Complain'].value_counts()

In [69]:
# Split the customers accepting each campaign by their Complain flag (1 vs 0).
complained = df['Complain'] == 1
not_complained = df['Complain'] == 0
compln_yes = [df.loc[complained & (df[camp] == 1)].shape[0] for camp in camp_lst]
compln_no = [df.loc[not_complained & (df[camp] == 1)].shape[0] for camp in camp_lst]

# Grouped bars, one pair per campaign column.
X_axis = np.arange(len(camp_lst))
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(X_axis - 0.2, compln_yes, 0.4, label='Complained in two years')
ax.bar(X_axis + 0.2, compln_no, 0.4, label='Not complained')
ax.grid(zorder=0, axis="y")
plt.xticks(X_axis, camp_lst)
ax.set_xlabel("Campaign", fontsize=18, labelpad=15)
ax.set_ylabel("Number of customers", fontsize=18, labelpad=15)
ax.set_title("Number of customers accepting offer in each campaign")
plt.xticks(rotation=90)
ax.legend()
plt.show()

# What is the total expenditure in each food category of customers accepting campaign offer?

In [70]:
# Total spend per category among the customers who accepted each campaign.
NonVeg, Other, Gold = [], [], []
for camp in camp_lst:
    accepted = df[camp] == 1
    NonVeg.append(df.loc[accepted, 'Non_Veg_Amt'].sum())
    Other.append(df.loc[accepted, 'Others_Amt'].sum())
    Gold.append(df.loc[accepted, 'MntGoldProds'].sum())

# Three bars per campaign, each offset by one bar width.
X_axis = np.arange(len(camp_lst))
width = 0.25
fig, ax = plt.subplots(figsize=(10, 10))
bar1 = ax.bar(X_axis, NonVeg, width, color='violet')
bar2 = ax.bar(X_axis + width, Other, width, color='skyblue')
bar3 = ax.bar(X_axis + width * 2, Gold, width, color='yellow')
plt.xticks(X_axis, camp_lst)
ax.grid(zorder=0, axis="y")
ax.set_xlabel("Campaign", fontsize=18, labelpad=15)
ax.set_ylabel("Amount spent by customers", fontsize=18, labelpad=15)
ax.set_title("Amount spent in each food category of people accepting campaign offers")
plt.xticks(rotation=90)
ax.legend((bar1, bar2, bar3), ('Non veg amount', "Others", 'Gold amount'))
plt.show()

In [71]:
# Education levels of the customers who have complained.
# The mask is built here explicitly: on a top-to-bottom run the global `cond`
# at this point is still the last campaign mask left over from an earlier
# loop, not the complaint filter.
df.loc[df['Complain'] == 1, 'Education'].value_counts()

In [72]:
# Pie chart of education level among customers who have complained.
cond = df['Complain'] == 1
x = df.loc[cond, 'Education'].value_counts()

# Labels come from the counted categories themselves so each slice is always
# named after the value it represents, whatever order value_counts() returns.
labels = list(x.index)
# Pull out the first two slices (value_counts() sorts descending, so these are
# the largest); the list is sized from the data instead of a fixed 4-tuple.
explode = [0.1 if pos < 2 else 0 for pos in range(len(x))]

fig, ax = plt.subplots()
ax.pie(x, labels=labels, explode=explode, radius=2, textprops={'fontsize': 16}, autopct='%1.1f%%')
plt.suptitle("Education level of customers who have complained", size=20, y=1.5)
plt.show()

In [73]:
# Pie chart of the number of children among customers who have complained.
# The complaint mask is rebuilt here so the cell does not depend on whatever
# value `cond` happens to hold from another cell.
cond = df['Complain'] == 1
x = df.loc[cond, 'Children'].value_counts()

# Labels come from the counted values themselves so each slice is always named
# after the value it represents, whatever order value_counts() returns.
labels = [str(n_children) for n_children in x.index]
# Pull out the first two slices (value_counts() sorts descending, so these are
# the largest); the list is sized from the data instead of a fixed 4-tuple.
explode = [0.1 if pos < 2 else 0 for pos in range(len(x))]

fig, ax = plt.subplots()
ax.pie(x, labels=labels, explode=explode, radius=2, textprops={'fontsize': 16}, autopct='%1.1f%%')
plt.suptitle("Distribution of no. of children amonf customers who have complained", size=20, y=1.5)
plt.show()

# Customers who have accepted offers in multiple ad campaigns

# How many customers who have purchased with deals have accepted the campaign offers?

In [74]:
# Per campaign column, count the accepting customers who made at least one
# purchase with a deal.
val = []
c2 = df['NumDealsPurchases'] != 0
for elm in camp_lst:
    cond = df[elm] == 1
    # Combine both masks on the full frame. The previous `df[cond][c2]` applied
    # a full-length mask to an already filtered frame, which makes pandas
    # reindex the mask and emit a warning.
    val.append(int((cond & c2).sum()))
  

In [75]:
# Bar chart of the per-campaign counts held in `val`.
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(camp_lst, val, width=0.4, color='limegreen')
ax.grid(zorder=0, axis="y")
ax.set_xlabel("Campaign rounds", fontsize=18, labelpad=15)
plt.xticks(rotation=90)
ax.set_ylabel("No. of customers", fontsize=18, labelpad=15)
ax.set_title("No. of customers buying with deals accepting the campaign")

plt.show()

In [76]:
# Drop the enrolment date and the ID column.
df.drop(columns=['Dt_Customer', 'ID'], inplace=True)

# One-hot encode Education, then discard the 'Basic' indicator column.
df = pd.get_dummies(df, columns=['Education'])
df.drop(columns='Education_Basic', inplace=True)

# Handling outliers
From the boxplots below, it is evident that there are outliers for several numerical features. Since our dataset is not very large, we replace the outlier values using the interquartile-range (IQR) method.

In [77]:
# One boxplot per numeric feature, to inspect its spread.
cols = ['Age','Income', 'MntGoldProds', 'Non_Veg_Amt', 'Others_Amt']
for column in cols:
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.boxplot(df[column], ax=ax)
    ax.set_title(column, fontsize=20)
    plt.show()

In [78]:
# Cap outliers with the IQR rule: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are replaced by the nearest limit.
cols = ['Age','Income', 'MntGoldProds', 'Non_Veg_Amt', 'Others_Amt']
for elm in cols:
    q1 = df[elm].quantile(0.25)
    q3 = df[elm].quantile(0.75)
    iqr = q3 - q1
    up_lim = q3 + 1.5 * iqr
    low_lim = q1 - 1.5 * iqr
    # clip() applies both limits (the lower limit used to be computed but never
    # applied). Assigning the whole column also avoids writing a float limit
    # into an integer column through .loc.
    df[elm] = df[elm].clip(lower=low_lim, upper=up_lim)

# Re-draw the boxplots to check the capped distributions.
for fea in cols:
    plt.figure(figsize=(8,8))
    sns.boxplot(df[fea])
    plt.title(fea, fontsize=20)
    plt.show()
    

# Feature scaling
Now that we have done an exhaustive analysis and preprocessing of the data, we can move on to model implementation. However, we first have to scale our features appropriately so that no feature dominates purely because of its magnitude.

In [79]:
# Fit the scaler on the full frame, then transform that same frame.
scaler = StandardScaler().fit(df)
df_sc = scaler.transform(df)

# Model implementation for cluster analysis
* Since the dataset has a large number of features, we apply PCA and perform the clustering analysis in two dimensions
* Perform elbow method to determine the ideal number of clusters
* We experiment with multiple algorithms to gain a broad overview of the performance


In [80]:
# Reduce the scaled features to two KernelPCA components and report how well
# the learned inverse transform reconstructs the scaled input.
# NOTE(review): no `kernel` argument is passed, so scikit-learn's default
# linear kernel applies and `gamma` is ignored (it only affects the rbf, poly
# and sigmoid kernels) — confirm whether kernel="rbf" was intended.
# random_state seeds the iterative eigensolvers (arpack/randomized) that
# eigen_solver='auto' may select, so the projection is reproducible.
pca = KernelPCA(n_components=2, gamma=0.0433, fit_inverse_transform=True, random_state=42)
reduced = pca.fit_transform(df_sc)
df_pca = pd.DataFrame(reduced)
inv_trns = pca.inverse_transform(reduced)

from sklearn.metrics import mean_squared_error
print('mean_squared_error of original dataset and inverse transformed dataset reduced by kernel PCA {:.2e}'.format(mean_squared_error(df_sc, inv_trns)))

In [81]:
# Within-cluster sum of squares (KMeans inertia) for 1 to 10 clusters.
wcss = [
    KMeans(n_clusters=n_clusters, init='k-means++', random_state=42).fit(df_pca).inertia_
    for n_clusters in range(1, 11)
]

In [82]:
# Elbow curve: WCSS against the number of clusters.
fig, ax = plt.subplots()
ax.plot(range(1, 11), wcss)
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

From the above figure, it seems that the optimal number of clusters is three, but four seems equally plausible. As such, we can test our model performance for both three and four clusters.

# K-Means

In [83]:
# Cluster sizes produced by KMeans for k = 3 and k = 4.
df_dum = df_pca.copy()
for k in range(3,5):
    # Seeded so the cluster assignment is reproducible between runs.
    kmeans = KMeans(n_clusters=k, random_state=42)
    # Fit on the untouched PCA components. Fitting on df_dum would feed the
    # labels stored by the previous iteration back in as an extra feature
    # (under a tuple column name next to the integer ones).
    df_dum['Cluster Number',k] = kmeans.fit_predict(df_pca)
    val_coun = df_dum['Cluster Number',k].value_counts()
    print('Cluster distribution with ',k, 'clusters')
    print(val_coun)

# K-Medoids

In [84]:
# Cluster sizes produced by KMedoids for k = 3 and k = 4.
df_dum = df_pca.copy()
for k in range(3,5):
    kmeds = KMedoids(n_clusters=k)
    # Fit on the untouched PCA components. Fitting on df_dum would feed the
    # labels stored by the previous iteration back in as an extra feature
    # (under a tuple column name next to the integer ones).
    df_dum['Cluster Number',k] = kmeds.fit_predict(df_pca)
    val_coun = df_dum['Cluster Number',k].value_counts()
    print('Cluster distribution with ',k, 'clusters')
    print(val_coun)

# BIRCH

In [85]:
# Cluster sizes produced by BIRCH for k = 3 and k = 4.
df_dum = df_pca.copy()
for k in range(3,5):
    birch = Birch(n_clusters=k)
    # Fit on the untouched PCA components. Fitting on df_dum would feed the
    # labels stored by the previous iteration back in as an extra feature
    # (under a tuple column name next to the integer ones).
    df_dum['Cluster Number',k] = birch.fit_predict(df_pca)
    val_coun = df_dum['Cluster Number',k].value_counts()
    print('Cluster distribution with ',k, 'clusters')
    print(val_coun)

# Agglomerative Clustering

In [86]:
# Cluster sizes produced by agglomerative clustering for k = 3 and k = 4.
df_dum = df_pca.copy()
for k in range(3,5):
    clr = AgglomerativeClustering(n_clusters=k)
    # Fit on the untouched PCA components. Fitting on df_dum would feed the
    # labels stored by the previous iteration back in as an extra feature
    # (under a tuple column name next to the integer ones).
    df_dum['Cluster Number',k] = clr.fit_predict(df_pca)
    val_coun = df_dum['Cluster Number',k].value_counts()
    print('Cluster distribution with ',k, 'clusters')
    print(val_coun)


# Model evaluation
* We consider the silhouette score as the performance metric and attempt to obtain the best combination of hyperparameter values 
* The cluster distribution for a range of models is obtained in a 2D plot

In [87]:
# Grid search over distance metric, linkage and cluster count for
# agglomerative clustering, keeping the combination with the highest
# silhouette score on the 2-D projection.
import inspect

# Silhouette scores lie in [-1, 1]; starting from -inf guarantees the first
# evaluated combination is recorded, so the "best" variables are never None
# when they are printed below.
max_s = -np.inf
max_s_n_clusters = None
affinity = ['euclidean', 'l1','l2', 'cosine','manhattan']
linkage = ['complete','average','single']
best_aff = None
best_l = None

# scikit-learn 1.2 renamed AgglomerativeClustering's `affinity` argument to
# `metric` and 1.4 removed the old name; use whichever the installed version
# accepts.
metric_kw = (
    'metric'
    if 'metric' in inspect.signature(AgglomerativeClustering).parameters
    else 'affinity'
)

for aff in affinity:
    for l in linkage:
        for i in range(2, 8):
            hierarchical_cl = AgglomerativeClustering(n_clusters=i, linkage=l, **{metric_kw: aff})
            ypred = hierarchical_cl.fit_predict(reduced)
            sil = silhouette_score(reduced, ypred)
            if sil > max_s:
                max_s = sil
                max_s_n_clusters = np.unique(ypred)
                best_aff = aff
                best_l = l

print('Maximal silhoutte {:.3f}'.format(max_s))
print('Optimal number of clusters', len(max_s_n_clusters))
print('Optimal affinity', best_aff)
print('Optimal linkage', best_l)

# Visualization of cluster formation

In [90]:
# Assign four-cluster labels from each algorithm to the PCA-projected points.
# Every model is fitted on df_pca, so the label columns added to df_pred never
# become input features.
df_pred = df_pca.copy()

df_pred['Cluster Number (AC)'] = AgglomerativeClustering(n_clusters=4).fit_predict(df_pca)
# KMeans starts from random centroids; the seed (the same value the elbow
# analysis uses) keeps the labels, and hence the plots, reproducible.
df_pred['Cluster Number (KM)'] = KMeans(n_clusters=4, random_state=42).fit_predict(df_pca)
df_pred['Cluster Number (KMed)'] = KMedoids(n_clusters = 4).fit_predict(df_pca)
df_pred['Cluster Number (BRC)'] = Birch(n_clusters = 4).fit_predict(df_pca)


In [91]:
# Scatter of the two PCA components, coloured by the KMeans labels.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, label="bl2")
ax.scatter(df_pca[0], df_pca[1], c=df_pred['Cluster Number (KM)'], s=40, marker='o')
ax.set_title("Cluster distribution in KMeans clustering ")
plt.show()

In [94]:
# Scatter of the two PCA components, coloured by the KMedoids labels.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, label="bl2")
ax.scatter(df_pca[0], df_pca[1], c=df_pred['Cluster Number (KMed)'], s=40, marker='o')
ax.set_title("Cluster distribution in KMedoids clustering ")
plt.show()

In [95]:
# Scatter of the two PCA components, coloured by the BIRCH labels.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, label="bl2")
ax.scatter(df_pca[0], df_pca[1], c=df_pred['Cluster Number (BRC)'], s=40, marker='o')
ax.set_title("Cluster distribution in BIRCH clustering ")
plt.show()

In [96]:
# Scatter of the two PCA components, coloured by the agglomerative labels.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, label="bl2")
ax.scatter(df_pca[0], df_pca[1], c=df_pred['Cluster Number (AC)'], s=40, marker='o')
ax.set_title("Cluster distribution in Agglomerative clustering ")
plt.show()