In [36]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

### **Customer Segmentation**

This project aims to segment customers into different groups using unsupervised learning algorithms. These algorithms include the following:
- Kmeans
- DBSCAN
- Agglomerative Clustering/Hierarchical clustering

For this approach, agglomerative clustering is used instead of the others.

In [37]:
# Load the raw customer table; the relative path is resolved from the kernel's working directory.
data = pd.read_csv("../data/segmentation_data.csv")

Kindly refer to the segmentation data legend for the interpretation of the numeric values.

In [38]:
data.sample(n=5,random_state=42)

Unnamed: 0,ID,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
1860,100001861,1,1,43,1,48632,0,0
353,100000354,0,0,28,0,141847,1,1
1333,100001334,1,1,48,2,116235,0,0
905,100000906,0,0,20,0,116582,2,2
1289,100001290,0,0,49,1,118571,2,2


The ID column is not needed and will be dropped; this will in turn save memory.

In [39]:
data.columns

Index(['ID', 'Sex', 'Marital status', 'Age', 'Education', 'Income',
       'Occupation', 'Settlement size'],
      dtype='object')

In [40]:
# Rebind instead of dropping in place so the cell can be re-run safely:
# errors='ignore' turns the drop into a no-op once 'ID' is already gone,
# whereas the in-place drop raised KeyError on a second execution.
data = data.drop(columns='ID', errors='ignore')
data.columns

Index(['Sex', 'Marital status', 'Age', 'Education', 'Income', 'Occupation',
       'Settlement size'],
      dtype='object')

In [41]:
data.shape

(2000, 7)

As shown by the data shape above, there are 2,000 data points with 7 columns (ID column dropped).

### **Preprocessing & EDA**

**Rename columns**

In [42]:
# Work on an independent copy: a plain assignment only adds a second name for
# the same DataFrame, so the renaming and dtype casts applied to `data_prep`
# below would also modify `data`.
data_prep = data.copy()

In [43]:
# Normalise the headers to snake_case: lower-case, spaces replaced by underscores.
data_prep.columns = [
    header.lower().replace(" ", "_") for header in data_prep.columns
]
data_prep.columns

Index(['sex', 'marital_status', 'age', 'education', 'income', 'occupation',
       'settlement_size'],
      dtype='object')

**Check for NaN values & duplicated values**

In [44]:
data_prep.isna().sum()

sex                0
marital_status     0
age                0
education          0
income             0
occupation         0
settlement_size    0
dtype: int64

In [45]:
data_prep.duplicated().sum()

0

**Check memory usage and descriptive statistics**


| Data-Type | Precision |
| ----------- | ----------- |
| float16 | 3 |
| float32 | 6 |
| float64 | 15 |
| float128 | 18 |

--------------------------------- 


|Data type |min|max|
| ----------- | ----------- |----------- |
|int8|-128|127|
|int16|-32768|32767|
|int32|-2147483648|2147483647|
|int64|-9223372036854775808|9223372036854775807|

In [46]:
data_prep.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   sex              2000 non-null   int64
 1   marital_status   2000 non-null   int64
 2   age              2000 non-null   int64
 3   education        2000 non-null   int64
 4   income           2000 non-null   int64
 5   occupation       2000 non-null   int64
 6   settlement_size  2000 non-null   int64
dtypes: int64(7)
memory usage: 109.5 KB


occupation, education, settlement_size, marital_status and sex should all be categorical data points

In [50]:
# Coded, low-cardinality columns that should be stored as pandas categoricals.
cat = [
    'sex',
    'marital_status',
    'education',
    'occupation',
    'settlement_size',
]

# CategoricalDtype() with no arguments is the dtype the 'category' alias
# stands for: categories are inferred from the data and left unordered.
for col in cat:
    data_prep[col] = data_prep[col].astype(pd.CategoricalDtype())

In [51]:
data_prep.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   sex              2000 non-null   category
 1   marital_status   2000 non-null   category
 2   age              2000 non-null   int64   
 3   education        2000 non-null   category
 4   income           2000 non-null   int64   
 5   occupation       2000 non-null   category
 6   settlement_size  2000 non-null   category
dtypes: category(5), int64(2)
memory usage: 41.8 KB


Memory size has dropped from 109.5 KB to 41.8 KB

In [62]:
# Downcast the two numeric columns to the narrowest integer widths used here:
# age -> int8, income -> int32.
for col, target_dtype in (('age', np.int8), ('income', np.int32)):
    data_prep[col] = data_prep[col].astype(target_dtype)

In [64]:
data_prep.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   sex              2000 non-null   category
 1   marital_status   2000 non-null   category
 2   age              2000 non-null   int8    
 3   education        2000 non-null   category
 4   income           2000 non-null   int32   
 5   occupation       2000 non-null   category
 6   settlement_size  2000 non-null   category
dtypes: category(5), int32(1), int8(1)
memory usage: 20.4 KB


Memory size has dropped from 41.8 KB to 20.4 KB

In [65]:
data_prep.describe(include=np.number)

Unnamed: 0,age,income
count,2000.0,2000.0
mean,35.909,120954.419
std,11.719402,38108.824679
min,18.0,35832.0
25%,27.0,97663.25
50%,33.0,115548.5
75%,42.0,138072.25
max,76.0,309364.0


**Observations**
- Mean age is 36, with a mean salary of 120,954
- There is little deviation in age (standard deviation ≈ 11.7 years), meaning low variance in the data.
- Median age is 33 years old
- Oldest person is 76 years old.

In [66]:
data_prep.describe(include='category')

Unnamed: 0,sex,marital_status,education,occupation,settlement_size
count,2000,2000,2000,2000,2000
unique,2,2,4,3,3
top,0,0,1,1,0
freq,1086,1007,1386,1113,989


**Observations**
- There are more males than females in this dataset
- There are more single people than non-single people
- There are more high school students than the rest.
- There are more skilled employees/officials than the rest
- More people come from small cities than mid-sized and big cities

In [68]:
# NOTE: this binds a second name to the same DataFrame object; no copy is made,
# so any in-place change made through `eda` is also visible through `data_prep`.
eda = data_prep

In [74]:
# Select the categorical columns once, then report their names and how many
# there are. 'category' is the documented selector for pandas categoricals.
categorical_columns = eda.select_dtypes(include='category').columns
print(categorical_columns)
print(len(categorical_columns))

Index(['sex', 'marital_status', 'education', 'occupation', 'settlement_size'], dtype='object')
5


In [73]:
# Every column that is not categorical is treated as a numeric feature.
non_categorical = eda.select_dtypes(exclude='category')
num_list = list(non_categorical.columns)
num_list

['age', 'income']

In [90]:
def range_cal(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` must expose ``.max()`` and ``.min()``.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [91]:
def summary_stats(group: str, column, df=None):
    """Summarise ``column`` within each level of ``group``.

    Parameters
    ----------
    group : str
        Name of the column to group by.
    column : str
        Name of the column to summarise.
    df : pandas.DataFrame, optional
        Frame to summarise. When omitted, the notebook-level ``eda`` frame
        is used, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame or None
        One row per group. For a categorical ``column`` the only statistic is
        ``count_total_<column>``; otherwise the columns are ``total_``,
        ``average_``, ``deviation_``, ``range_`` and ``skewness_level_``
        followed by the column name. ``None`` is returned, after printing the
        available columns, when a ``KeyError`` is raised (for example an
        unknown ``group`` or ``column``).
    """
    frame = eda if df is None else df
    try:
        # Test the dtype *instance* with isinstance: comparing it to the
        # CategoricalDtype class itself (`dtype != pd.CategoricalDtype`) does
        # not identify categorical columns, so they were sent down the
        # numeric path.
        if isinstance(frame[column].dtype, pd.CategoricalDtype):
            aggregations = [
                (f"count_total_{column}", 'count'),
            ]
        else:
            aggregations = [
                (f"total_{column}", 'sum'),
                (f'average_{column}', 'mean'),
                (f'deviation_{column}', 'std'),
                (f"range_{column}", range_cal),
                (f"skewness_level_{column}", "skew"),
            ]
        return frame.groupby(group)[column].agg(aggregations).reset_index()
    except KeyError:
        # Best-effort behaviour kept from the original: report the valid
        # column names instead of raising, and make the None result explicit.
        print(f"This is the list of keys: {frame.columns}")
        return None

**Looking into sex**

In [92]:
cats = ['sex', 'marital_status', 'education', 'occupation', 'settlement_size']
sex = summary_stats(column=num_list[0],group=cats[0])
sex

Unnamed: 0,sex,total_age,average_age,deviation_age,range_age,skewness_level_age
0,0,41132.0,37.87477,11.549013,57,0.856399
1,1,30686.0,33.573304,11.495586,58,1.310032


In [None]:
# `cats` is defined in the preceding cell (age summary by sex); the identical
# re-definition that used to sit here was redundant and has been removed.
sex = summary_stats(column=num_list[1],group=cats[0])
sex

Unnamed: 0,sex,total_income,average_income,deviation_income,range_income,skewness_level_income
0,0,138763895,127775.225599,39821.354629,243563,1.062436
1,1,103144943,112850.047046,34266.333929,273532,1.37879


**Observations**

**Looking into marital_status**

In [93]:
marital_status = summary_stats(column=num_list[0],group=cats[1])
marital_status

Unnamed: 0,marital_status,total_age,average_age,deviation_age,range_age,skewness_level_age
0,0,38658.0,38.389275,11.326854,56,0.851805
1,1,33160.0,33.393756,11.579278,58,1.331932


In [None]:
marital_status = summary_stats(column=num_list[1],group=cats[1])
marital_status

Unnamed: 0,marital_status,total_income,average_income,deviation_income,range_income,skewness_level_income
0,0,124602386,123736.232373,39370.686617,250487,1.132365
1,1,117306452,118133.3857,36589.29515,273532,1.258713


**Observations**

**Looking into education**

In [94]:
education = summary_stats(column=num_list[0],group=cats[2])
education

Unnamed: 0,education,total_age,average_age,deviation_age,range_age,skewness_level_age
0,0,7866.0,27.407666,3.235134,13,-0.219549
1,1,46211.0,33.34127,8.387785,40,0.518733
2,2,15689.0,53.914089,8.965615,37,0.144253
3,3,2052.0,57.0,17.381435,51,-0.821601


In [None]:
education = summary_stats(column=num_list[1],group=cats[2])
education

Unnamed: 0,education,total_income,average_income,deviation_income,range_income,skewness_level_income
0,0,33024577,115068.212544,40058.777044,211514,0.988842
1,1,161285660,116367.720058,32636.570352,272659,0.950019
2,2,42313141,145405.982818,48501.606672,257482,1.181768
3,3,5285460,146818.333333,37635.665122,144877,-0.303424


**Observations**

**Looking into occupation**

In [95]:
occupation = summary_stats(column=num_list[0],group=cats[3])
occupation

Unnamed: 0,occupation,total_age,average_age,deviation_age,range_age,skewness_level_age
0,0,22387.0,35.366509,11.342131,55,0.972772
1,1,39032.0,35.069182,11.424119,53,1.019542
2,2,10399.0,40.940945,12.686259,56,1.007765


In [None]:
occupation = summary_stats(column=num_list[1],group=cats[3])
occupation

Unnamed: 0,occupation,total_income,average_income,deviation_income,range_income,skewness_level_income
0,0,57499968,90837.232227,23943.549449,145171,0.630009
1,1,139653089,125474.473495,24745.006378,185942,1.488324
2,2,44755781,176203.862205,43903.518363,218864,0.882357


**Observations**

**Looking into settlement_size**

In [96]:
settlement_size = summary_stats(column=num_list[0],group=cats[4])
settlement_size

Unnamed: 0,settlement_size,total_age,average_age,deviation_age,range_age,skewness_level_age
0,0,33920.0,34.29727,10.709856,56,1.113176
1,1,20443.0,37.579044,12.853087,58,0.884697
2,2,17455.0,37.376874,11.939554,52,0.870938


In [101]:
settlement_size = summary_stats(column=num_list[1],group=cats[4])
settlement_size

Unnamed: 0,settlement_size,total_income,average_income,deviation_income,range_income,skewness_level_income
0,0,99950108,101061.787664,26505.919898,183487,0.547405
1,1,74762107,137430.34375,34579.105619,226093,1.473514
2,2,67196623,143889.985011,40781.263863,239877,1.42699


**Observations**