# Brand Co. Client Segmentation

Validating differences across clusters through ANOVA and Tukey HSD

## Import libraries

In [23]:
# data
# ==============================================================================
import numpy as np
import pandas as pd

# charts
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d

# pre-processing 
# ==============================================================================
from sklearn.preprocessing import StandardScaler

#from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

#display
# ==============================================================================
pd.set_option('display.max_columns', None)

#stats
# ==============================================================================
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Import dataset

In [24]:
# Load the cleaned client feature table (path relative to the notebook).
original_data = pd.read_csv(filepath_or_buffer='../data/clean/data_clean_men.csv')

In [25]:
# Load the cluster assignment of each client (path relative to the notebook).
labels = pd.read_csv(filepath_or_buffer='../data/clusters/clusters_men.csv')

In [26]:
# Preview the first two rows of the feature table.
original_data.head(2)

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum
0,124710,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,3,1,0,2,4,2,6,1,0,3,0,2,0,10,12
1,382440,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2,0,0,1,0,0,3,1,1,0,0,0,4,1,5


In [27]:
# Preview the first two rows of the cluster labels.
labels.head(2)

Unnamed: 0,client_id,cluster
0,124710,2
1,382440,2


## Transform dataset

In [28]:
# Attach each client's cluster label to its feature row, matching on
# 'client_id' (default inner join: only clients present in both tables remain).
data = original_data.merge(labels, on='client_id')
data.head()

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,cluster
0,124710,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,3,1,0,2,4,2,6,1,0,3,0,2,0,10,12,2
1,382440,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2,0,0,1,0,0,3,1,1,0,0,0,4,1,5,2
2,1311915,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,4,3,2,0,0,2,3,6,1,2,1,7,1,3,2,13,2
3,1958756,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,2,6,6,25,5,1,10,1,25,14,2,13,7,20,16,12,55,1
4,1958794,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,10,4,5,1,1,3,3,8,5,2,9,9,9,6,3,27,0


In [29]:
# Add proportion columns to capture each client's relative preference for a given season, product category, and color.

In [30]:
# Inspect the column names of the merged frame.
data.columns

Index(['client_id', 'residence_country 1', 'residence_country 2',
       'residence_country 3', 'residence_country 4', 'residence_country 5',
       'residence_country 6', 'residence_country 7',
       'residence_other region 1', 'residence_other region 2',
       'residence_other region 3', 'residence_other region 4',
       'residence_other region 5', 'residence_other region 6',
       'residence_other region 7', 'residence_other region 8',
       'generation_gen silent/boomers', 'generation_gen x',
       'generation_gen z/alpha', 'generation_millennials',
       'category_children clothes', 'category_handbags',
       'category_men clothes', 'category_men shoes', 'category_other',
       'category_soft', 'category_women clothes', 'category_women shoes',
       'color_cool_jewel', 'color_neutral_pastel', 'color_other',
       'color_vivid_metallic', 'color_warm_earth', 'season_autumn',
       'season_spring', 'season_summer', 'season_winter', 'total_sum',
       'cluster'],
      dtype='object')

In [31]:
# Product-category columns; each one is later divided by 'total_sum'.
category_columns = [
    'category_children clothes',
    'category_handbags',
    'category_men clothes',
    'category_men shoes',
    'category_other',
    'category_soft',
    'category_women clothes',
    'category_women shoes',
]

In [32]:
# Color-group columns; each one is later divided by 'total_sum'.
color_columns = [
    'color_cool_jewel',
    'color_neutral_pastel',
    'color_other',
    'color_vivid_metallic',
    'color_warm_earth',
]

In [33]:
# Season columns; each one is later divided by 'total_sum'.
season_columns = [
    'season_autumn',
    'season_spring',
    'season_summer',
    'season_winter',
]

In [34]:
# Divide every category column by the client's 'total_sum' and store the
# result next to it as '<column>_prop'.
client_totals = data['total_sum']
for category in category_columns:
    data[category + '_prop'] = data[category].div(client_totals)

In [35]:
# Divide every color column by the client's 'total_sum' and store the
# result next to it as '<column>_prop'.
client_totals = data['total_sum']
for color in color_columns:
    data[color + '_prop'] = data[color].div(client_totals)

In [36]:
# Divide every season column by the client's 'total_sum' and store the
# result next to it as '<column>_prop'.
client_totals = data['total_sum']
for season in season_columns:
    data[season + '_prop'] = data[season].div(client_totals)

In [37]:
# Preview the frame with the new '_prop' columns appended.
data.head()

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,cluster,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_jewel_prop,color_neutral_pastel_prop,color_other_prop,color_vivid_metallic_prop,color_warm_earth_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop
0,124710,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,3,1,0,2,4,2,6,1,0,3,0,2,0,10,12,2,0.0,0.0,0.166667,0.25,0.083333,0.0,0.166667,0.333333,0.166667,0.5,0.083333,0.0,0.25,0.0,0.166667,0.0,0.833333
1,382440,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2,0,0,1,0,0,3,1,1,0,0,0,4,1,5,2,0.0,0.0,0.4,0.4,0.0,0.0,0.2,0.0,0.0,0.6,0.2,0.2,0.0,0.0,0.0,0.8,0.2
2,1311915,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,4,3,2,0,0,2,3,6,1,2,1,7,1,3,2,13,2,0.0,0.153846,0.307692,0.230769,0.153846,0.0,0.0,0.153846,0.230769,0.461538,0.076923,0.153846,0.076923,0.538462,0.076923,0.230769,0.153846
3,1958756,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,2,6,6,25,5,1,10,1,25,14,2,13,7,20,16,12,55,1,0.0,0.036364,0.109091,0.109091,0.454545,0.090909,0.018182,0.181818,0.018182,0.454545,0.254545,0.036364,0.236364,0.127273,0.363636,0.290909,0.218182
4,1958794,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,10,4,5,1,1,3,3,8,5,2,9,9,9,6,3,27,0,0.0,0.111111,0.37037,0.148148,0.185185,0.037037,0.037037,0.111111,0.111111,0.296296,0.185185,0.074074,0.333333,0.333333,0.333333,0.222222,0.111111


## Averages by cluster

In [38]:
# Columns left out of the per-cluster averages: the client identifier and
# the grouping column itself.
columns_to_exclude = [
    'client_id',
    'cluster',
]

In [39]:
# Every column of `data` except the excluded ones, in their original order.
excluded_names = set(columns_to_exclude)
columns_to_average = [name for name in data.columns if name not in excluded_names]

In [40]:
# Mean of every selected column within each cluster; `reset_index` turns the
# 'cluster' group key back into a regular column.
grouped_by_cluster = data.groupby('cluster')
cluster_means = grouped_by_cluster[columns_to_average].mean()
averages = cluster_means.reset_index()

In [41]:
# Display the per-cluster means.
averages

Unnamed: 0,cluster,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_jewel_prop,color_neutral_pastel_prop,color_other_prop,color_vivid_metallic_prop,color_warm_earth_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop
0,0,0.480916,0.109051,0.140676,0.026172,0.015267,0.009815,0.031625,0.034896,0.001091,0.009815,0.068702,0.011996,0.025082,0.026172,0.008724,0.116685,0.435115,0.006543,0.441658,0.642312,1.785169,7.399128,3.452563,6.34024,0.954198,2.984733,2.818975,3.112323,9.615049,5.7012,1.166848,6.781897,6.36205,6.549618,6.451472,7.014177,26.377317,0.022738,0.069747,0.275156,0.132978,0.244548,0.036774,0.10866,0.109399,0.116964,0.363607,0.219934,0.044915,0.254581,0.240988,0.247761,0.242455,0.268796
1,1,0.28436,0.127962,0.175355,0.021327,0.026066,0.052133,0.042654,0.035545,0.004739,0.021327,0.104265,0.014218,0.028436,0.042654,0.018957,0.158768,0.42891,0.016588,0.395735,2.033175,2.251185,17.559242,6.329384,12.419431,2.933649,4.969194,4.191943,7.21564,17.827014,11.021327,2.521327,14.101896,13.227488,12.684834,12.57109,14.203791,52.687204,0.040232,0.042897,0.332507,0.120435,0.236187,0.055998,0.091703,0.080041,0.137875,0.33686,0.209899,0.048472,0.266894,0.253341,0.240263,0.237357,0.269039
2,2,0.607202,0.100277,0.063712,0.022161,0.014404,0.01108,0.030471,0.019391,0.0,0.007756,0.052078,0.012188,0.036011,0.019945,0.003324,0.099169,0.421053,0.00554,0.474238,0.181163,0.747368,2.225485,1.13795,2.062604,0.26482,0.931302,0.824931,0.953463,3.274792,1.798892,0.359557,1.98892,2.176177,2.077562,2.105263,2.01662,8.375623,0.017159,0.106561,0.243805,0.134039,0.241201,0.030249,0.124967,0.102019,0.10495,0.396053,0.220948,0.044006,0.234042,0.271694,0.246889,0.248106,0.23331


## ANOVA

In [42]:
# One-way ANOVA, run separately for every feature column.
# H0: the feature has the same mean in every cluster
# H1: at least one cluster mean is different

# 'client_id' is an identifier and 'cluster' is the grouping variable itself
# (zero variance inside each group), so neither is a feature to be tested.
non_feature_columns = ('client_id', 'cluster')
feature_columns = [col for col in data.columns if col not in non_feature_columns]

# Take the labels that are actually present instead of hard-coding 0, 1, 2.
cluster_labels = sorted(data['cluster'].dropna().unique())

p_values = {}

for col in feature_columns:
    # One sample per cluster holding the values of the current feature.
    clusters = [data.loc[data['cluster'] == label, col] for label in cluster_labels]
    f_stat, p_value = f_oneway(*clusters)
    p_values[col] = p_value

# Display all p-values for variables
print("All p-values for variables:")
for var, p_val in p_values.items():
    print(f"Variable: {var}, p-value: {p_val}")

# Collect variables with p-value < 0.05 into a list
# NOTE(review): every feature is tested at the same 0.05 threshold with no
# multiple-comparison correction; consider whether one is needed.
significant_variables = [var for var, p_val in p_values.items() if p_val < 0.05]

# Display variables with p-value < 0.05
print("\nVariables with p-value < 0.05:")
print(significant_variables)

All p-values for variables:
Variable: residence_country 1, p-value: 5.385473997573791e-35
Variable: residence_country 2, p-value: 0.24204245246197206
Variable: residence_country 3, p-value: 6.797384467541896e-16
Variable: residence_country 4, p-value: 0.7757085040938272
Variable: residence_country 5, p-value: 0.22454751014141985
Variable: residence_country 6, p-value: 2.4398511900468533e-09
Variable: residence_country 7, p-value: 0.4396059699978627
Variable: residence_other region 1, p-value: 0.023802074506970035
Variable: residence_other region 2, p-value: 0.01753679651671083
Variable: residence_other region 3, p-value: 0.0435048762222981
Variable: residence_other region 4, p-value: 0.0003215760609359097
Variable: residence_other region 5, p-value: 0.9358679343297513
Variable: residence_other region 6, p-value: 0.28124332999861046
Variable: residence_other region 7, p-value: 0.02482320543085901
Variable: residence_other region 8, p-value: 0.0018372226597588549
Variable: generation_gen

## Tukey HSD (Honestly Significant Difference) 

In [43]:
# Variables carried into the post-hoc Tukey HSD test.
# NOTE(review): presumably a snapshot of the ANOVA cell's output - confirm it
# is still in sync with the computed list. 'cluster' is deliberately absent:
# it is the grouping variable, not a feature, so testing it against itself
# is meaningless.
significant_variables = [
    'residence_country 1',
    'residence_country 3',
    'residence_country 6',
    'residence_other region 1',
    'residence_other region 2',
    'residence_other region 3',
    'residence_other region 4',
    'residence_other region 7',
    'residence_other region 8',
    'generation_gen silent/boomers',
    'generation_millennials',
    'category_children clothes',
    'category_handbags',
    'category_men clothes',
    'category_men shoes',
    'category_other',
    'category_soft',
    'category_women clothes',
    'category_women shoes',
    'color_cool_jewel',
    'color_neutral_pastel',
    'color_other',
    'color_vivid_metallic',
    'color_warm_earth',
    'season_autumn',
    'season_spring',
    'season_summer',
    'season_winter',
    'total_sum',
    'category_children clothes_prop',
    'category_handbags_prop',
    'category_men clothes_prop',
    'category_soft_prop',
    'category_women clothes_prop',
    'category_women shoes_prop',
    'color_cool_jewel_prop',
    'color_neutral_pastel_prop',
    'color_warm_earth_prop',
    'season_autumn_prop',
    'season_winter_prop',
]

In [44]:
# Post-hoc test: for every variable kept after the ANOVA, find which pairs
# of clusters differ. Tukey HSD compares all cluster pairs for one variable
# while controlling the family-wise error rate (default alpha = 0.05).

for var in significant_variables:
    # The grouping column has no variance inside its own groups, so a
    # pairwise comparison of 'cluster' by 'cluster' is degenerate; skip it.
    if var == 'cluster':
        continue

    tukey_results = pairwise_tukeyhsd(endog=data[var], groups=data['cluster'])

    print(f"Tukey HSD results for variable '{var}':")
    print(tukey_results)

Tukey HSD results for variable 'residence_country 1':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
     0      1  -0.1966   0.0 -0.2637 -0.1294   True
     0      2   0.1263   0.0    0.08  0.1726   True
     1      2   0.3228   0.0  0.2611  0.3846   True
---------------------------------------------------
Tukey HSD results for variable 'residence_country 3':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     0      1   0.0347 0.1183 -0.0065  0.0758  False
     0      2   -0.077    0.0 -0.1053 -0.0486   True
     1      2  -0.1116    0.0 -0.1495 -0.0738   True
----------------------------------------------------
Tukey HSD results for variable 'residence_country 6':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reje

Tukey HSD results for variable 'category_women shoes':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
     0      1    1.373   0.0  1.0298  1.7161   True
     0      2   -1.994   0.0 -2.2306 -1.7575   True
     1      2   -3.367   0.0 -3.6824 -3.0516   True
---------------------------------------------------
Tukey HSD results for variable 'color_cool_jewel':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
     0      1   4.1033   0.0  3.7983  4.4083   True
     0      2  -2.1589   0.0 -2.3691 -1.9486   True
     1      2  -6.2622   0.0 -6.5426 -5.9818   True
---------------------------------------------------
Tukey HSD results for variable 'color_neutral_pastel':
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj  lower    upper   reject
---

Tukey HSD results for variable 'color_cool_jewel_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     0      1   0.0209 0.0114  0.0038   0.038   True
     0      2   -0.012 0.0441 -0.0238 -0.0002   True
     1      2  -0.0329    0.0 -0.0486 -0.0172   True
----------------------------------------------------
Tukey HSD results for variable 'color_neutral_pastel_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1  -0.0267 0.0627 -0.0546 0.0011  False
     0      2   0.0324 0.0002  0.0133 0.0516   True
     1      2   0.0592    0.0  0.0336 0.0848   True
---------------------------------------------------
Tukey HSD results for variable 'color_warm_earth_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   u