# Client segmentation  | Cristiane Carneiro

Validating and describing differences across clusters

## Import libraries

In [23]:
# data
# ==============================================================================
import numpy as np
import pandas as pd

# charts
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d

# pre-processing 
# ==============================================================================
from sklearn.preprocessing import StandardScaler

#from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# warnings
# ==============================================================================
import warnings
# Silence every warning category for the rest of the session so cell outputs
# stay compact.
# NOTE(review): this presumably also hides numerical warnings raised by the
# statistical tests further down — consider narrowing the filter.
warnings.filterwarnings('ignore')

#display
# ==============================================================================
# Render all columns of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

#stats
# ==============================================================================
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Import dataset

In [2]:
# Load the cleaned client feature table from disk.
original_data = pd.read_csv(filepath_or_buffer='../data/data_clean_men.csv')

In [3]:
# Load the cluster assignment file (one cluster label per client_id).
labels = pd.read_csv(filepath_or_buffer='../data/clusters_men.csv')

In [4]:
# Peek at the first two rows of the feature table.
original_data.head(n=2)

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum
0,124710,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,3,1,0,2,4,2,6,1,0,3,0,2,0,10,12
1,382440,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2,0,0,1,0,0,3,1,1,0,0,0,4,1,5


In [5]:
# Peek at the first two rows of the cluster assignments.
labels.head(n=2)

Unnamed: 0,client_id,cluster
0,124710,0
1,382440,0


## Transform dataset

In [6]:
# Attach each client's cluster label to its feature row. The default inner
# join keeps only the client_ids that appear in both tables.
data = original_data.merge(labels, on='client_id')
data.head(n=5)

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,cluster
0,124710,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,3,1,0,2,4,2,6,1,0,3,0,2,0,10,12,0
1,382440,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2,0,0,1,0,0,3,1,1,0,0,0,4,1,5,0
2,1311915,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,4,3,2,0,0,2,3,6,1,2,1,7,1,3,2,13,0
3,1958756,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,2,6,6,25,5,1,10,1,25,14,2,13,7,20,16,12,55,2
4,1958794,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,10,4,5,1,1,3,3,8,5,2,9,9,9,6,3,27,1


In [7]:
# Add proportion columns to capture each client's relative preference for a given season, product category, and color.

In [8]:
# Display the column labels of the merged table (features plus `cluster`).
data.columns

Index(['client_id', 'residence_country 1', 'residence_country 2',
       'residence_country 3', 'residence_country 4', 'residence_country 5',
       'residence_country 6', 'residence_country 7',
       'residence_other region 1', 'residence_other region 2',
       'residence_other region 3', 'residence_other region 4',
       'residence_other region 5', 'residence_other region 6',
       'residence_other region 7', 'residence_other region 8',
       'generation_gen silent/boomers', 'generation_gen x',
       'generation_gen z/alpha', 'generation_millennials',
       'category_children clothes', 'category_handbags',
       'category_men clothes', 'category_men shoes', 'category_other',
       'category_soft', 'category_women clothes', 'category_women shoes',
       'color_cool_jewel', 'color_neutral_pastel', 'color_other',
       'color_vivid_metallic', 'color_warm_earth', 'season_autumn',
       'season_spring', 'season_summer', 'season_winter', 'total_sum',
       'cluster'],
      dt

In [9]:
# Purchase-count columns, one per product category; every name carries the
# shared "category_" prefix.
category_columns = [
    f'category_{category}'
    for category in (
        'children clothes',
        'handbags',
        'men clothes',
        'men shoes',
        'other',
        'soft',
        'women clothes',
        'women shoes',
    )
]

In [10]:
# Purchase-count columns, one per color family; every name carries the shared
# "color_" prefix.
color_columns = [
    f'color_{family}'
    for family in (
        'cool_jewel',
        'neutral_pastel',
        'other',
        'vivid_metallic',
        'warm_earth',
    )
]

In [11]:
# Purchase-count columns, one per season; every name carries the shared
# "season_" prefix.
season_columns = [
    f'season_{season}'
    for season in ('autumn', 'spring', 'summer', 'winter')
]

In [12]:
# Share of each product category in the client's total purchases, stored in a
# new "<column>_prop" column.
for category_col in category_columns:
    data[category_col + '_prop'] = data[category_col].div(data['total_sum'])

In [13]:
# Share of each color family in the client's total purchases, stored in a new
# "<column>_prop" column.
for color_col in color_columns:
    data[color_col + '_prop'] = data[color_col].div(data['total_sum'])

In [14]:
# Share of each season in the client's total purchases, stored in a new
# "<column>_prop" column.
for season_col in season_columns:
    data[season_col + '_prop'] = data[season_col].div(data['total_sum'])

In [15]:
# Check the first five rows now that the proportion columns exist.
data.head(n=5)

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,cluster,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_jewel_prop,color_neutral_pastel_prop,color_other_prop,color_vivid_metallic_prop,color_warm_earth_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop
0,124710,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,3,1,0,2,4,2,6,1,0,3,0,2,0,10,12,0,0.0,0.0,0.166667,0.25,0.083333,0.0,0.166667,0.333333,0.166667,0.5,0.083333,0.0,0.25,0.0,0.166667,0.0,0.833333
1,382440,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2,0,0,1,0,0,3,1,1,0,0,0,4,1,5,0,0.0,0.0,0.4,0.4,0.0,0.0,0.2,0.0,0.0,0.6,0.2,0.2,0.0,0.0,0.0,0.8,0.2
2,1311915,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,4,3,2,0,0,2,3,6,1,2,1,7,1,3,2,13,0,0.0,0.153846,0.307692,0.230769,0.153846,0.0,0.0,0.153846,0.230769,0.461538,0.076923,0.153846,0.076923,0.538462,0.076923,0.230769,0.153846
3,1958756,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,2,6,6,25,5,1,10,1,25,14,2,13,7,20,16,12,55,2,0.0,0.036364,0.109091,0.109091,0.454545,0.090909,0.018182,0.181818,0.018182,0.454545,0.254545,0.036364,0.236364,0.127273,0.363636,0.290909,0.218182
4,1958794,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,10,4,5,1,1,3,3,8,5,2,9,9,9,6,3,27,1,0.0,0.111111,0.37037,0.148148,0.185185,0.037037,0.037037,0.111111,0.111111,0.296296,0.185185,0.074074,0.333333,0.333333,0.333333,0.222222,0.111111


## Averages by cluster

In [16]:
# Columns that must not be averaged per cluster.
columns_to_exclude = [
    'client_id',  # row identifier
    'cluster',    # grouping key
]

In [17]:
# Keep every column of `data`, in its original order, except the excluded ones.
columns_to_average = list(
    filter(lambda name: name not in columns_to_exclude, data.columns)
)

In [18]:
# Mean of every selected column within each cluster; `cluster` is turned back
# into a regular column at the end.
averages = (
    data
    .groupby('cluster')[columns_to_average]
    .agg('mean')
    .reset_index()
)

In [19]:
# Display the per-cluster means (one row per cluster).
averages

Unnamed: 0,cluster,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_jewel_prop,color_neutral_pastel_prop,color_other_prop,color_vivid_metallic_prop,color_warm_earth_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop
0,0,0.60663,0.1,0.064088,0.022652,0.014365,0.01105,0.030387,0.019337,0.0,0.007735,0.052486,0.012155,0.035912,0.01989,0.003315,0.099448,0.421547,0.005525,0.473481,0.180663,0.748619,2.247514,1.139779,2.066851,0.264088,0.930387,0.823757,0.960221,3.274586,1.810497,0.360773,1.99558,2.174033,2.080663,2.110497,2.036464,8.401657,0.017112,0.106488,0.244649,0.133937,0.241139,0.030165,0.124714,0.101797,0.105191,0.395448,0.2213,0.044012,0.234049,0.271149,0.246726,0.248045,0.23408
1,1,0.479869,0.11099,0.14037,0.025027,0.015234,0.009793,0.032644,0.03482,0.001088,0.009793,0.067465,0.01197,0.025027,0.027203,0.008705,0.117519,0.431991,0.006529,0.443961,0.64309,1.786725,7.408052,3.479869,6.352557,0.963003,3.023939,2.866159,3.126224,9.683351,5.726877,1.180631,6.806311,6.408052,6.586507,6.492927,7.035909,26.523395,0.022745,0.069462,0.273611,0.133301,0.243988,0.036972,0.10947,0.110451,0.116673,0.364544,0.219364,0.045089,0.25433,0.241763,0.247948,0.242628,0.26766
2,2,0.284337,0.125301,0.175904,0.021687,0.026506,0.053012,0.040964,0.036145,0.004819,0.021687,0.106024,0.014458,0.028916,0.040964,0.019277,0.156627,0.433735,0.016867,0.392771,2.062651,2.262651,17.677108,6.337349,12.527711,2.959036,4.944578,4.139759,7.250602,17.891566,11.050602,2.518072,14.2,13.301205,12.746988,12.612048,14.250602,52.910843,0.040787,0.042951,0.333596,0.119939,0.237595,0.056328,0.090532,0.078272,0.137966,0.336583,0.209447,0.048133,0.267871,0.253841,0.240446,0.237084,0.268628


## ANOVA

In [24]:
# One-way ANOVA per variable: test whether its mean differs between clusters.
p_values = {}

# The client identifier and the cluster label itself are not variables to
# test: comparing the label against its own groups is trivially "significant".
columns_to_test = [col for col in data.columns if col not in ('client_id', 'cluster')]

# Use the cluster labels actually present in the data (0, 1 and 2 here)
# instead of a hard-coded range.
cluster_labels = sorted(data['cluster'].unique())

for col in columns_to_test:
    # One sample of the variable per cluster.
    clusters = [data.loc[data['cluster'] == label, col] for label in cluster_labels]
    f_stat, p_value = f_oneway(*clusters)
    p_values[col] = p_value

# Display all p-values for variables
print("All p-values for variables:")
for var, p_val in p_values.items():
    print(f"Variable: {var}, p-value: {p_val}")

# Collect variables with p-value < 0.05 into a list
significant_variables = [var for var, p_val in p_values.items() if p_val < 0.05]

# Display variables with p-value < 0.05
print("\nVariables with p-value < 0.05:")
print(significant_variables)

All p-values for variables:
Variable: residence_country 1, p-value: 1.3810500342931922e-34
Variable: residence_country 2, p-value: 0.2811989009624169
Variable: residence_country 3, p-value: 9.928562364927457e-16
Variable: residence_country 4, p-value: 0.9044189929708314
Variable: residence_country 5, p-value: 0.20225721478370642
Variable: residence_country 6, p-value: 1.355373278434967e-09
Variable: residence_country 7, p-value: 0.5477269622710568
Variable: residence_other region 1, p-value: 0.02170088141648391
Variable: residence_other region 2, p-value: 0.01614406041081235
Variable: residence_other region 3, p-value: 0.03795763732395779
Variable: residence_other region 4, p-value: 0.0002654892796120802
Variable: residence_other region 5, p-value: 0.9203127108652209
Variable: residence_other region 6, p-value: 0.2903633049996571
Variable: residence_other region 7, p-value: 0.03866692224087804
Variable: residence_other region 8, p-value: 0.0015504515498720244
Variable: generation_gen s

## Tukey HSD (Honestly Significant Difference) 

In [25]:
# Variables whose ANOVA p-value was below 0.05, hard-coded for the Tukey HSD
# post-hoc comparison. 'cluster' is deliberately left out: it is the grouping
# label, so comparing it across its own groups carries no information.
significant_variables = [
    # residence
    'residence_country 1',
    'residence_country 3',
    'residence_country 6',
    'residence_other region 1',
    'residence_other region 2',
    'residence_other region 3',
    'residence_other region 4',
    'residence_other region 7',
    'residence_other region 8',
    # generation
    'generation_gen silent/boomers',
    'generation_gen z/alpha',
    'generation_millennials',
    # purchase counts by category
    'category_children clothes',
    'category_handbags',
    'category_men clothes',
    'category_men shoes',
    'category_other',
    'category_soft',
    'category_women clothes',
    'category_women shoes',
    # purchase counts by color
    'color_cool_jewel',
    'color_neutral_pastel',
    'color_other',
    'color_vivid_metallic',
    'color_warm_earth',
    # purchase counts by season
    'season_autumn',
    'season_spring',
    'season_summer',
    'season_winter',
    # overall volume
    'total_sum',
    # proportions
    'category_children clothes_prop',
    'category_handbags_prop',
    'category_men clothes_prop',
    'category_soft_prop',
    'category_women clothes_prop',
    'category_women shoes_prop',
    'color_cool_jewel_prop',
    'color_neutral_pastel_prop',
    'color_warm_earth_prop',
    'season_autumn_prop',
    'season_winter_prop',
]

In [26]:
# Post-hoc pairwise comparison of cluster means (Tukey HSD) for each variable
# in `significant_variables`; the summary table is printed under a header.
for var in significant_variables:
    tukey_results = pairwise_tukeyhsd(endog=data[var], groups=data['cluster'])
    print(f"Tukey HSD results for variable '{var}':", tukey_results, sep='\n')

Tukey HSD results for variable 'residence_country 1':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
     0      1  -0.1268   0.0  -0.173 -0.0805   True
     0      2  -0.3223   0.0 -0.3845 -0.2601   True
     1      2  -0.1955   0.0 -0.2631  -0.128   True
---------------------------------------------------
Tukey HSD results for variable 'residence_country 3':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1   0.0763    0.0  0.0479 0.1046   True
     0      2   0.1118    0.0  0.0737 0.1499   True
     1      2   0.0355 0.1091 -0.0058 0.0769  False
---------------------------------------------------
Tukey HSD results for variable 'residence_country 6':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
------

Tukey HSD results for variable 'category_women clothes':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj lower  upper  reject
-------------------------------------------------
     0      1   2.0936   0.0  1.713 2.4741   True
     0      2   4.0142   0.0 3.5029 4.5255   True
     1      2   1.9206   0.0  1.365 2.4763   True
-------------------------------------------------
Tukey HSD results for variable 'category_women shoes':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj lower  upper  reject
-------------------------------------------------
     0      1   2.0424   0.0 1.8059 2.2789   True
     0      2    3.316   0.0 2.9982 3.6338   True
     1      2   1.2736   0.0 0.9282  1.619   True
-------------------------------------------------
Tukey HSD results for variable 'color_cool_jewel':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj lower  upper  reject
-------------------------------

Tukey HSD results for variable 'category_women shoes_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     0      1   0.0087  0.272 -0.0045  0.0218  False
     0      2  -0.0235 0.0052 -0.0412 -0.0058   True
     1      2  -0.0322 0.0003 -0.0514 -0.0129   True
----------------------------------------------------
Tukey HSD results for variable 'color_cool_jewel_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1   0.0115 0.0572 -0.0003 0.0232  False
     0      2   0.0328    0.0   0.017 0.0486   True
     1      2   0.0213 0.0102  0.0041 0.0385   True
---------------------------------------------------
Tukey HSD results for variable 'color_neutral_pastel_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower