# Brand Co. Client Segmentation

Validating differences across clusters through ANOVA and Tukey HSD

## Import libraries

In [44]:
# data
# ==============================================================================
import numpy as np
import pandas as pd

# charts
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d

# pre-processing 
# ==============================================================================
from sklearn.preprocessing import StandardScaler

#from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

#display
# ==============================================================================
pd.set_option('display.max_columns', None)

#stats
# ==============================================================================
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Import dataset

In [45]:
# Load the cleaned client feature table, one row per client_id
# (the file name suggests this is the women's segment -- TODO confirm).
original_data = pd.read_csv('../data/clean/data_clean_women.csv')

In [46]:
# Load the cluster label of each client_id
# (presumably written by an earlier clustering notebook -- TODO confirm).
labels = pd.read_csv('../data/clusters/clusters_women.csv')

In [47]:
original_data.head(2)

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum
0,76154,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,4,0,1,2,0,2,4,1,0,0,5
1,292032,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,6,3,1,5,2,1,3,0,2,10,0,12


In [48]:
labels.head(2)

Unnamed: 0,client_id,cluster
0,76154,1
1,292032,1


## Transform dataset

In [49]:
# Attach each client's cluster label to its feature row. The join is the
# default inner join on client_id, spelled out here for readability.
data = original_data.merge(labels, how='inner', on='client_id')
data.head()

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,cluster
0,76154,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,4,0,1,2,0,2,4,1,0,0,5,1
1,292032,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,6,3,1,5,2,1,3,0,2,10,0,12,1
2,513675,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,12,2,1,1,33,2,8,18,3,5,17,6,7,8,30,51,0
3,514314,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,7,0,12,9,14,8,3,24,10,4,14,19,0,2,34,55,0
4,517104,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,2,0,2,1


In [50]:
# Add proportion columns that capture each client's relative preference for a given season, product category, and color.

In [51]:
data.columns

Index(['client_id', 'residence_country 1', 'residence_country 2',
       'residence_country 3', 'residence_country 4', 'residence_country 5',
       'residence_country 6', 'residence_country 7',
       'residence_other region 1', 'residence_other region 2',
       'residence_other region 3', 'residence_other region 4',
       'residence_other region 5', 'residence_other region 6',
       'residence_other region 7', 'residence_other region 8',
       'generation_gen silent/boomers', 'generation_gen x',
       'generation_gen z/alpha', 'generation_millennials',
       'category_children clothes', 'category_handbags',
       'category_men clothes', 'category_men shoes', 'category_other',
       'category_soft', 'category_women clothes', 'category_women shoes',
       'color_cool_jewel', 'color_neutral_pastel', 'color_other',
       'color_vivid_metallic', 'color_warm_earth', 'season_autumn',
       'season_spring', 'season_summer', 'season_winter', 'total_sum',
       'cluster'],
      dt

In [52]:
# Purchase-count columns, one per product category.
category_columns = [
    'category_children clothes',
    'category_handbags',
    'category_men clothes',
    'category_men shoes',
    'category_other',
    'category_soft',
    'category_women clothes',
    'category_women shoes',
]

In [53]:
# Purchase-count columns, one per color family.
color_columns = [
    'color_cool_jewel',
    'color_neutral_pastel',
    'color_other',
    'color_vivid_metallic',
    'color_warm_earth',
]

In [54]:
# Purchase-count columns, one per season.
season_columns = [
    'season_autumn',
    'season_spring',
    'season_summer',
    'season_winter',
]

In [55]:
# Express every category count as a fraction of the client's total_sum.
for col in category_columns:
    data[f'{col}_prop'] = data[col].div(data['total_sum'])

In [56]:
# Express every color count as a fraction of the client's total_sum.
for col in color_columns:
    data[f'{col}_prop'] = data[col].div(data['total_sum'])

In [57]:
# Express every season count as a fraction of the client's total_sum.
for col in season_columns:
    data[f'{col}_prop'] = data[col].div(data['total_sum'])

In [58]:
data.head()

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,cluster,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_jewel_prop,color_neutral_pastel_prop,color_other_prop,color_vivid_metallic_prop,color_warm_earth_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop
0,76154,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,4,0,1,2,0,2,4,1,0,0,5,1,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.8,0.0,0.2,0.4,0.0,0.4,0.8,0.2,0.0,0.0
1,292032,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,6,3,1,5,2,1,3,0,2,10,0,12,1,0.0,0.166667,0.083333,0.0,0.0,0.0,0.5,0.25,0.083333,0.416667,0.166667,0.083333,0.25,0.0,0.166667,0.833333,0.0
2,513675,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,12,2,1,1,33,2,8,18,3,5,17,6,7,8,30,51,0,0.0,0.0,0.235294,0.039216,0.019608,0.019608,0.647059,0.039216,0.156863,0.352941,0.058824,0.098039,0.333333,0.117647,0.137255,0.156863,0.588235
3,514314,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,7,0,12,9,14,8,3,24,10,4,14,19,0,2,34,55,0,0.0,0.090909,0.127273,0.0,0.218182,0.163636,0.254545,0.145455,0.054545,0.436364,0.181818,0.072727,0.254545,0.345455,0.0,0.036364,0.618182
4,517104,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,2,0,2,1,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,1.0,0.0


## Averages by cluster

In [59]:
# Columns that must not be averaged: client_id is an identifier and
# cluster is the key the averages are grouped by.
columns_to_exclude = ['client_id', 'cluster']

In [60]:
# Every remaining column of data (including the *_prop columns added above) is averaged.
columns_to_average = [col for col in data.columns if col not in columns_to_exclude]

In [61]:
# Mean of each selected column per cluster; reset_index() turns 'cluster'
# from the group index back into a regular column.
averages = data.groupby('cluster')[columns_to_average].mean().reset_index()

In [62]:
averages

Unnamed: 0,cluster,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_jewel_prop,color_neutral_pastel_prop,color_other_prop,color_vivid_metallic_prop,color_warm_earth_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop
0,0,0.438961,0.077922,0.063203,0.033766,0.045887,0.060606,0.045022,0.031169,0.004329,0.01645,0.079654,0.002597,0.039827,0.042424,0.018182,0.147186,0.458009,0.007792,0.387013,2.252814,2.879654,6.830303,2.34026,11.316017,2.689177,17.481385,7.509957,6.705628,17.984416,10.193074,3.157576,15.258874,12.8329,13.539394,13.199134,13.728139,53.299567,0.040108,0.054114,0.129496,0.045173,0.21095,0.05074,0.327697,0.141723,0.126046,0.337369,0.191086,0.059571,0.285927,0.241178,0.253806,0.248648,0.256368
1,1,0.59634,0.069791,0.035599,0.030973,0.036605,0.015286,0.058327,0.020716,0.000402,0.006235,0.046259,0.003218,0.047265,0.026146,0.006838,0.112832,0.420756,0.007643,0.458769,0.230692,0.719831,0.895213,0.416533,1.948713,0.315768,2.648833,1.361424,0.934634,3.236726,1.739541,0.481094,2.145012,2.311142,2.097144,2.121078,2.007643,8.537007,0.02401,0.094629,0.098326,0.047659,0.224942,0.035622,0.316473,0.158339,0.106459,0.383639,0.208255,0.057951,0.243696,0.280873,0.24571,0.24967,0.223747
2,2,0.484166,0.083937,0.086227,0.025563,0.03243,0.041206,0.05723,0.02976,0.002671,0.013354,0.065624,0.00496,0.041587,0.023274,0.008012,0.145746,0.442198,0.006868,0.405189,0.993895,2.064479,2.961847,1.296452,6.937047,1.118275,7.582984,3.987028,3.040443,9.660435,5.56963,1.508966,7.162533,6.724533,6.87562,6.356353,6.985502,26.942007,0.033548,0.079729,0.109693,0.049856,0.259898,0.041682,0.273943,0.151651,0.112043,0.358843,0.210337,0.056422,0.262355,0.249273,0.253638,0.237825,0.259264


## ANOVA

In [63]:
# One-way ANOVA of every feature across the clusters.
# H0: the mean of the variable is identical in every cluster
# H1: at least one cluster mean differs

p_values = {}

# Test only real features. 'client_id' is an identifier, and 'cluster' is the
# grouping label itself: testing it against its own groups is meaningless and
# would always flag it as "significant".
anova_columns = [col for col in data.columns if col not in ('client_id', 'cluster')]

# Read the labels from the data instead of hard-coding range(0, 3), so the
# test keeps working if the clustering is re-run with a different number of
# clusters or a different labelling.
cluster_labels = sorted(data['cluster'].unique())

for col in anova_columns:
    # One sample per cluster: the values of this variable for its members.
    clusters = [data.loc[data['cluster'] == label, col] for label in cluster_labels]
    p_values[col] = f_oneway(*clusters).pvalue

# Display all p-values for variables
print("All p-values for variables:")
for var, p_val in p_values.items():
    print(f"Variable: {var}, p-value: {p_val}")

# Collect variables with p-value < 0.05 into a list
significant_variables = [var for var, p_val in p_values.items() if p_val < 0.05]

# Display variables with p-value < 0.05
print("\nVariables with p-value < 0.05:")
print(significant_variables)

All p-values for variables:
Variable: residence_country 1, p-value: 3.262299292829481e-32
Variable: residence_country 2, p-value: 0.0782113145671382
Variable: residence_country 3, p-value: 8.152783568359068e-20
Variable: residence_country 4, p-value: 0.2869503934259266
Variable: residence_country 5, p-value: 0.12749058018898146
Variable: residence_country 6, p-value: 6.49591724845738e-20
Variable: residence_country 7, p-value: 0.20247173968786114
Variable: residence_other region 1, p-value: 0.01803473378862362
Variable: residence_other region 2, p-value: 0.002833957978373587
Variable: residence_other region 3, p-value: 0.0004717907826847701
Variable: residence_other region 4, p-value: 2.8575152112795306e-06
Variable: residence_other region 5, p-value: 0.3986958410703587
Variable: residence_other region 6, p-value: 0.3670225831577265
Variable: residence_other region 7, p-value: 0.002822790639714544
Variable: residence_other region 8, p-value: 0.0008215098213827125
Variable: generation_g

## Tukey HSD (Honestly Significant Difference) 

In [64]:
# Variables whose ANOVA p-value was below 0.05, pinned here so the post-hoc
# analysis does not depend on re-running the ANOVA cell.
# 'cluster' is deliberately absent: it is the grouping label, not a feature,
# so a pairwise comparison of it across clusters carries no information.
significant_variables = [
    # residence
    'residence_country 1',
    'residence_country 3',
    'residence_country 6',
    'residence_other region 1',
    'residence_other region 2',
    'residence_other region 3',
    'residence_other region 4',
    'residence_other region 7',
    'residence_other region 8',
    # generation
    'generation_gen silent/boomers',
    'generation_gen x',
    'generation_millennials',
    # raw counts
    'category_children clothes',
    'category_handbags',
    'category_men clothes',
    'category_men shoes',
    'category_other',
    'category_soft',
    'category_women clothes',
    'category_women shoes',
    'color_cool_jewel',
    'color_neutral_pastel',
    'color_other',
    'color_vivid_metallic',
    'color_warm_earth',
    'season_autumn',
    'season_spring',
    'season_summer',
    'season_winter',
    'total_sum',
    # proportions
    'category_children clothes_prop',
    'category_handbags_prop',
    'category_men clothes_prop',
    'category_other_prop',
    'category_soft_prop',
    'category_women clothes_prop',
    'category_women shoes_prop',
    'color_cool_jewel_prop',
    'color_neutral_pastel_prop',
    'color_other_prop',
    'color_warm_earth_prop',
    'season_autumn_prop',
    'season_winter_prop',
]

In [65]:
# Post-hoc step: for each variable, which pairs of clusters differ?

for var in significant_variables:
    pairwise_comparison = pairwise_tukeyhsd(endog=data[var], groups=data['cluster'])

    print(f"Tukey HSD results for variable '{var}':")
    print(pairwise_comparison)

Tukey HSD results for variable 'residence_country 1':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
     0      1   0.1574   0.0  0.1195  0.1952   True
     0      2   0.0452 0.026  0.0043  0.0861   True
     1      2  -0.1122   0.0 -0.1401 -0.0842   True
---------------------------------------------------
Tukey HSD results for variable 'residence_country 3':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     0      1  -0.0276 0.0005 -0.0449 -0.0103   True
     0      2    0.023 0.0109  0.0043  0.0417   True
     1      2   0.0506    0.0  0.0379  0.0634   True
----------------------------------------------------
Tukey HSD results for variable 'residence_country 6':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reje

Tukey HSD results for variable 'category_women clothes':
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj  lower    upper   reject
-----------------------------------------------------
     0      1 -14.8326   0.0 -15.3328 -14.3323   True
     0      2  -9.8984   0.0 -10.4393  -9.3575   True
     1      2   4.9342   0.0   4.5644   5.3039   True
-----------------------------------------------------
Tukey HSD results for variable 'category_women shoes':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
     0      1  -6.1485   0.0 -6.3771 -5.9199   True
     0      2  -3.5229   0.0 -3.7701 -3.2758   True
     1      2   2.6256   0.0  2.4567  2.7945   True
---------------------------------------------------
Tukey HSD results for variable 'color_cool_jewel':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper

Tukey HSD results for variable 'category_women clothes_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     0      1  -0.0112 0.4568 -0.0333  0.0108  False
     0      2  -0.0538    0.0 -0.0776 -0.0299   True
     1      2  -0.0425    0.0 -0.0588 -0.0262   True
----------------------------------------------------
Tukey HSD results for variable 'category_women shoes_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1   0.0166 0.0064  0.0039 0.0294   True
     0      2   0.0099   0.21 -0.0039 0.0237  False
     1      2  -0.0067 0.2194 -0.0161 0.0027  False
---------------------------------------------------
Tukey HSD results for variable 'color_cool_jewel_prop':
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   low