# Client segmentation  | Cristiane Carneiro

Validating and describing differences across clusters

## Import libraries

In [31]:
# data
# ==============================================================================
import numpy as np
import pandas as pd

# charts
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d

# pre-processing
# ==============================================================================
from sklearn.preprocessing import StandardScaler

# modelling
# ==============================================================================
#from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# statistics
# ==============================================================================
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

#display
# ==============================================================================
pd.set_option('display.max_columns', None)

## Import dataset

In [10]:
# Load the cleaned client feature table (one row per client_id).
original_data = pd.read_csv(filepath_or_buffer='../data/data_clean_women.csv')

In [11]:
# Load the cluster assignment of each client (client_id -> cluster).
labels = pd.read_csv(filepath_or_buffer='../data/clusters_women.csv')

In [12]:
# Preview the first two rows of the loaded feature table.
original_data.head(2)

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum
0,76154,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,4,0,1,2,0,2,4,1,0,0,5
1,292032,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,6,3,1,5,2,1,3,0,2,10,0,12


In [13]:
# Preview the first two rows of the cluster labels (client_id, cluster).
labels.head(2)

Unnamed: 0,client_id,cluster
0,76154,2
1,292032,2


## Transform dataset

In [14]:
# Attach each client's cluster label to its feature row, matching on client_id.
# Default (inner) join: only clients present in both tables are kept.
data = original_data.merge(labels, on='client_id')
data.head()

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,cluster
0,76154,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,4,0,1,2,0,2,4,1,0,0,5,2
1,292032,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,6,3,1,5,2,1,3,0,2,10,0,12,2
2,513675,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,12,2,1,1,33,2,8,18,3,5,17,6,7,8,30,51,1
3,514314,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,7,0,12,9,14,8,3,24,10,4,14,19,0,2,34,55,1
4,517104,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,2,0,2,2


In [15]:
# Add proportion columns (count / total_sum) to capture each client's relative preference for a given season, product category, and color.

In [21]:
# Show every column name of `data`; the column-group lists defined below are taken from these names.
data.columns

Index(['client_id', 'residence_country 1', 'residence_country 2',
       'residence_country 3', 'residence_country 4', 'residence_country 5',
       'residence_country 6', 'residence_country 7',
       'residence_other region 1', 'residence_other region 2',
       'residence_other region 3', 'residence_other region 4',
       'residence_other region 5', 'residence_other region 6',
       'residence_other region 7', 'residence_other region 8',
       'generation_gen silent/boomers', 'generation_gen x',
       'generation_gen z/alpha', 'generation_millennials',
       'category_children clothes', 'category_handbags',
       'category_men clothes', 'category_men shoes', 'category_other',
       'category_soft', 'category_women clothes', 'category_women shoes',
       'color_cool_jewel', 'color_neutral_pastel', 'color_other',
       'color_vivid_metallic', 'color_warm_earth', 'season_autumn',
       'season_spring', 'season_summer', 'season_winter', 'total_sum',
       'cluster', 'category

In [16]:
# Names of the per-product-category columns of `data`, built from their shared prefix.
category_columns = [
    f'category_{product}'
    for product in (
        'children clothes',
        'handbags',
        'men clothes',
        'men shoes',
        'other',
        'soft',
        'women clothes',
        'women shoes',
    )
]

In [22]:
# Names of the per-color-family columns of `data`, built from their shared prefix.
color_columns = [
    f'color_{family}'
    for family in (
        'cool_jewel',
        'neutral_pastel',
        'other',
        'vivid_metallic',
        'warm_earth',
    )
]

In [18]:
# Names of the per-season columns of `data`, built from their shared prefix.
season_columns = [
    f'season_{season}'
    for season in ('autumn', 'spring', 'summer', 'winter')
]

In [19]:
# For every product-category column, add a '<column>_prop' column holding
# that column divided by the client's total_sum.
for col in category_columns:
    share = data[col].div(data['total_sum'])
    data[f'{col}_prop'] = share

In [23]:
# For every color column, add a '<column>_prop' column holding that column
# divided by the client's total_sum.
for col in color_columns:
    share = data[col].div(data['total_sum'])
    data[f'{col}_prop'] = share

In [24]:
# For every season column, add a '<column>_prop' column holding that column
# divided by the client's total_sum.
for col in season_columns:
    share = data[col].div(data['total_sum'])
    data[f'{col}_prop'] = share

In [25]:
# Preview the first rows of `data` to check the newly added '*_prop' columns.
data.head()

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,cluster,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_jewel_prop,color_neutral_pastel_prop,color_other_prop,color_vivid_metallic_prop,color_warm_earth_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop
0,76154,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,4,0,1,2,0,2,4,1,0,0,5,2,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.8,0.0,0.2,0.4,0.0,0.4,0.8,0.2,0.0,0.0
1,292032,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,6,3,1,5,2,1,3,0,2,10,0,12,2,0.0,0.166667,0.083333,0.0,0.0,0.0,0.5,0.25,0.083333,0.416667,0.166667,0.083333,0.25,0.0,0.166667,0.833333,0.0
2,513675,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,12,2,1,1,33,2,8,18,3,5,17,6,7,8,30,51,1,0.0,0.0,0.235294,0.039216,0.019608,0.019608,0.647059,0.039216,0.156863,0.352941,0.058824,0.098039,0.333333,0.117647,0.137255,0.156863,0.588235
3,514314,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,7,0,12,9,14,8,3,24,10,4,14,19,0,2,34,55,1,0.0,0.090909,0.127273,0.0,0.218182,0.163636,0.254545,0.145455,0.054545,0.436364,0.181818,0.072727,0.254545,0.345455,0.0,0.036364,0.618182
4,517104,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,2,0,2,2,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,1.0,0.0


## Averages by cluster

In [26]:
# Columns left out of the per-cluster averages: the client identifier and the
# cluster label itself (the label is the group-by key below).
columns_to_exclude = ['client_id', 'cluster']

In [27]:
# Every column of `data` except the excluded ones, in their original order.
columns_to_average = []
for name in data.columns:
    if name in columns_to_exclude:
        continue
    columns_to_average.append(name)

In [28]:
# Mean of every selected column within each cluster; reset_index() turns the
# cluster label from the index back into an ordinary column.
cluster_means = data.groupby('cluster')[columns_to_average].mean()
averages = cluster_means.reset_index()

In [29]:
# Display the per-cluster mean of every averaged column (one row per cluster).
averages

Unnamed: 0,cluster,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_jewel_prop,color_neutral_pastel_prop,color_other_prop,color_vivid_metallic_prop,color_warm_earth_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop
0,0,0.484733,0.083588,0.085878,0.025573,0.032061,0.041221,0.057252,0.029771,0.002672,0.013359,0.066031,0.004962,0.041603,0.023282,0.008015,0.14542,0.442366,0.00687,0.405344,0.994275,2.062977,2.965267,1.291603,6.936641,1.126336,7.590458,3.985115,3.038168,9.662977,5.572901,1.509542,7.169084,6.726336,6.874427,6.375954,6.975954,26.952672,0.033561,0.079695,0.109785,0.049574,0.259825,0.041904,0.274112,0.151544,0.11192,0.35883,0.210344,0.056433,0.262474,0.249243,0.253561,0.238356,0.25884
1,1,0.438475,0.07799,0.063258,0.033795,0.046794,0.060659,0.045061,0.031196,0.004333,0.016464,0.078856,0.0026,0.039861,0.042461,0.018198,0.147314,0.457539,0.007799,0.387348,2.254766,2.886482,6.830156,2.344887,11.325823,2.674177,17.481802,7.519064,6.714038,17.993934,10.192374,3.159445,15.257366,12.838821,13.553726,13.169844,13.754766,53.317158,0.040142,0.054257,0.129444,0.045266,0.211101,0.050315,0.327581,0.141893,0.126212,0.337452,0.190978,0.059597,0.285761,0.241245,0.254062,0.247762,0.256931
2,2,0.5961,0.069964,0.035786,0.030961,0.03659,0.015279,0.058303,0.020708,0.000402,0.006232,0.04624,0.003217,0.047246,0.026136,0.006836,0.112988,0.420788,0.00764,0.458585,0.230599,0.719743,0.895054,0.418577,1.949538,0.315641,2.648774,1.362083,0.935464,3.237435,1.74045,0.481102,2.145557,2.311821,2.097708,2.120627,2.009851,8.540008,0.024001,0.094602,0.098298,0.047786,0.224949,0.035608,0.316405,0.158351,0.10649,0.383612,0.208274,0.057939,0.243684,0.280859,0.245694,0.249594,0.223853


## ANOVA

In [30]:
# One-way ANOVA per variable: test whether its mean differs between clusters.
#
# 'client_id' is an identifier and 'cluster' is the grouping variable itself,
# so both are excluded by name. Testing 'cluster' against its own groups is
# meaningless and would put it in the "significant" list.
anova_columns = [col for col in data.columns if col not in ('client_id', 'cluster')]

# Take the cluster labels from the data instead of hard-coding them, so no
# cluster is silently left out if the segmentation is re-run with another k.
cluster_labels = sorted(data['cluster'].dropna().unique())

p_values = {}

for col in anova_columns:
    # One sample per cluster: the values of `col` for the clients in that cluster.
    clusters = [data.loc[data['cluster'] == label, col] for label in cluster_labels]
    f_stat, p_value = f_oneway(*clusters)
    p_values[col] = p_value

# Display all p-values for variables
print("All p-values for variables:")
for var, p_val in p_values.items():
    print(f"Variable: {var}, p-value: {p_val}")

# Collect variables with p-value < 0.05 into a list
significant_variables = [var for var, p_val in p_values.items() if p_val < 0.05]

# Display variables with p-value < 0.05
print("\nVariables with p-value < 0.05:")
print(significant_variables)

All p-values for variables:
Variable: residence_country 1, p-value: 4.8362453266254477e-32
Variable: residence_country 2, p-value: 0.09326052072778222
Variable: residence_country 3, p-value: 2.00794207949858e-19
Variable: residence_country 4, p-value: 0.28756340210250797
Variable: residence_country 5, p-value: 0.08484379507471992
Variable: residence_country 6, p-value: 5.949330615581672e-20
Variable: residence_country 7, p-value: 0.2053453468982172
Variable: residence_other region 1, p-value: 0.017665453544437
Variable: residence_other region 2, p-value: 0.0028113509974029127
Variable: residence_other region 3, p-value: 0.0004627574365942142
Variable: residence_other region 4, p-value: 3.4659987351744296e-06
Variable: residence_other region 5, p-value: 0.39811329042533894
Variable: residence_other region 6, p-value: 0.37200323475052544
Variable: residence_other region 7, p-value: 0.0027715549790692192
Variable: residence_other region 8, p-value: 0.0008072323282469208
Variable: generati

## Tukey HSD (Honestly Significant Difference) 

In [39]:
# Variables whose ANOVA p-value was below 0.05, pinned here as the input of the
# Tukey HSD post-hoc test. 'cluster' is deliberately NOT in this list: it is the
# grouping variable of the test, so comparing it between its own groups is
# meaningless.
significant_variables = [
    # residence
    'residence_country 1', 'residence_country 3', 'residence_country 6',
    'residence_other region 1', 'residence_other region 2',
    'residence_other region 3', 'residence_other region 4',
    'residence_other region 7', 'residence_other region 8',
    # generation
    'generation_gen silent/boomers', 'generation_gen x', 'generation_millennials',
    # product category
    'category_children clothes', 'category_handbags', 'category_men clothes',
    'category_men shoes', 'category_other', 'category_soft',
    'category_women clothes', 'category_women shoes',
    # color
    'color_cool_jewel', 'color_neutral_pastel', 'color_other',
    'color_vivid_metallic', 'color_warm_earth',
    # season
    'season_autumn', 'season_spring', 'season_summer', 'season_winter',
    # overall total
    'total_sum',
    # proportions
    'category_children clothes_prop', 'category_handbags_prop',
    'category_men clothes_prop', 'category_other_prop', 'category_soft_prop',
    'category_women clothes_prop', 'category_women shoes_prop',
    'color_cool_jewel_prop', 'color_neutral_pastel_prop', 'color_other_prop',
    'color_warm_earth_prop',
    'season_autumn_prop', 'season_winter_prop',
]

In [43]:
# Post-hoc test: for each listed variable, run Tukey HSD with the variable as
# the response and the cluster label as the grouping factor, then print the
# resulting pairwise comparison table.
for var in significant_variables:
    header = f"Tukey HSD results for variable '{var}':"
    tukey_results = pairwise_tukeyhsd(endog=data[var], groups=data['cluster'])

    print(header, tukey_results, sep="\n")

Tukey HSD results for variable 'residence_country 1':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
     0      1  -0.0463 0.022 -0.0872 -0.0053   True
     0      2   0.1114   0.0  0.0834  0.1393   True
     1      2   0.1576   0.0  0.1198  0.1955   True
---------------------------------------------------
Tukey HSD results for variable 'residence_country 3':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     0      1  -0.0226 0.0127 -0.0413 -0.0039   True
     0      2  -0.0501    0.0 -0.0629 -0.0373   True
     1      2  -0.0275 0.0006 -0.0448 -0.0102   True
----------------------------------------------------
Tukey HSD results for variable 'residence_country 6':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reje

Tukey HSD results for variable 'category_women clothes':
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj  lower    upper   reject
-----------------------------------------------------
     0      1   9.8913   0.0   9.3502  10.4325   True
     0      2  -4.9417   0.0  -5.3114  -4.5719   True
     1      2  -14.833   0.0 -15.3335 -14.3326   True
-----------------------------------------------------
Tukey HSD results for variable 'category_women shoes':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
     0      1   3.5339   0.0  3.2868  3.7811   True
     0      2   -2.623   0.0 -2.7919 -2.4542   True
     1      2   -6.157   0.0 -6.3856 -5.9284   True
---------------------------------------------------
Tukey HSD results for variable 'color_cool_jewel':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper

Tukey HSD results for variable 'category_women clothes_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1   0.0535    0.0  0.0296 0.0773   True
     0      2   0.0423    0.0   0.026 0.0586   True
     1      2  -0.0112 0.4601 -0.0332 0.0109  False
---------------------------------------------------
Tukey HSD results for variable 'category_women shoes_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1  -0.0097  0.229 -0.0234 0.0041  False
     0      2   0.0068 0.2079 -0.0026 0.0162  False
     1      2   0.0165 0.0071  0.0037 0.0292   True
---------------------------------------------------
Tukey HSD results for variable 'color_cool_jewel_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   up