# Final project | Cristiane Carneiro

Validating and describing differences across clusters

## Import libraries

In [7]:
# data
# ==============================================================================
import numpy as np
import pandas as pd

# charts
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d


# pre-processing 
# ==============================================================================
from sklearn.preprocessing import StandardScaler

#from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# clusters quality 
# ==============================================================================
from ds_utils.unsupervised import plot_cluster_cardinality, plot_cluster_magnitude, plot_magnitude_vs_cardinality
from scipy.spatial.distance import euclidean
from scipy.stats import f_oneway

# warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

#display
# ==============================================================================
pd.set_option('display.max_columns', None)

## Import dataset

In [8]:
original_data = pd.read_csv('../data/data_clean_women.csv')

In [9]:
labels = pd.read_csv('../data/clusters_women')

In [10]:
original_data.head(2)

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum
0,76154,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,4,0,1,2,0,2,4,1,0,0,5
1,292032,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,6,3,1,5,2,1,3,0,2,10,0,12


In [11]:
labels.head(2)

Unnamed: 0,client_id,cluster
0,76154,1
1,292032,1


In [12]:
data = pd.merge(original_data, labels, on='client_id')
data.head()

Unnamed: 0,client_id,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum,cluster
0,76154,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,4,0,1,2,0,2,4,1,0,0,5,1
1,292032,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,6,3,1,5,2,1,3,0,2,10,0,12,1
2,513675,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,12,2,1,1,33,2,8,18,3,5,17,6,7,8,30,51,2
3,514314,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,7,0,12,9,14,8,3,24,10,4,14,19,0,2,34,55,2
4,517104,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,2,0,2,1


In [13]:
columns_to_exclude = ['client_id', 'cluster']

In [14]:
columns_to_average = [col for col in data.columns if col not in columns_to_exclude]

In [15]:
averages = data.groupby('cluster')[columns_to_average].mean().reset_index()

In [16]:
averages

Unnamed: 0,cluster,residence_country 1,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other region 1,residence_other region 2,residence_other region 3,residence_other region 4,residence_other region 5,residence_other region 6,residence_other region 7,residence_other region 8,generation_gen silent/boomers,generation_gen x,generation_gen z/alpha,generation_millennials,category_children clothes,category_handbags,category_men clothes,category_men shoes,category_other,category_soft,category_women clothes,category_women shoes,color_cool_jewel,color_neutral_pastel,color_other,color_vivid_metallic,color_warm_earth,season_autumn,season_spring,season_summer,season_winter,total_sum
0,0,0.467635,0.079921,0.095773,0.028402,0.035667,0.043593,0.054822,0.033025,0.001982,0.016513,0.063408,0.004624,0.037649,0.030383,0.006605,0.148613,0.451123,0.006605,0.393659,1.389036,2.38111,4.291942,1.726552,8.691546,1.646631,10.599736,5.145971,4.241744,12.540291,7.136724,2.021136,9.932629,8.990092,9.347424,8.401585,9.133421,35.872523
1,1,0.601018,0.070992,0.031043,0.032061,0.036132,0.012723,0.058524,0.019847,0.000254,0.005852,0.047328,0.003053,0.047837,0.026463,0.00687,0.110178,0.423664,0.007888,0.45827,0.181679,0.660814,0.675064,0.355471,1.580662,0.257506,1.931807,1.083715,0.716794,2.530534,1.445547,0.404071,1.629771,1.838931,1.65827,1.658524,1.570992,6.726718
2,2,0.440154,0.068211,0.055341,0.033462,0.055341,0.057915,0.039897,0.033462,0.003861,0.016731,0.082368,0.002574,0.043758,0.045045,0.021879,0.138996,0.460746,0.009009,0.391248,2.462033,2.980695,7.054054,2.178893,12.097812,2.859717,20.661519,7.837838,7.2574,19.79408,10.782497,3.431145,16.867439,13.870013,14.634492,14.352638,15.275418,58.132561
3,3,0.52592,0.080728,0.065691,0.025722,0.032054,0.036803,0.058567,0.024931,0.00277,0.009497,0.058567,0.004353,0.043926,0.021765,0.008706,0.137317,0.42501,0.006727,0.430946,0.642659,1.502968,2.165018,0.976652,4.815987,0.743965,5.358924,3.068065,2.169767,7.073209,4.018995,1.060546,4.951721,4.876533,4.766917,4.772853,4.857934,19.274238


In [18]:
p_values = {}

for col in data.columns[1:]:  # Exclude the 'client_id' column
    clusters = [data[data['cluster'] == i][col] for i in range(0, 3)]  # Assuming clusters are labeled 1, 2, 3
    f_stat, p_value = f_oneway(*clusters)
    p_values[col] = p_value

# Display all p-values for variables
print("All p-values for variables:")
for var, p_val in p_values.items():
    print(f"Variable: {var}, p-value: {p_val}")

# Collect variables with p-value < 0.05 into a list
significant_variables = [var for var, p_val in p_values.items() if p_val < 0.05]

# Display variables with p-value < 0.05
print("\nVariables with p-value < 0.05:")
print(significant_variables)

All p-values for variables:
Variable: residence_country 1, p-value: 3.74205443403428e-27
Variable: residence_country 2, p-value: 0.45634931791716893
Variable: residence_country 3, p-value: 5.222556931464643e-22
Variable: residence_country 4, p-value: 0.736093517037182
Variable: residence_country 5, p-value: 0.03190500130180278
Variable: residence_country 6, p-value: 1.2094237439272589e-17
Variable: residence_country 7, p-value: 0.11550009430199634
Variable: residence_other region 1, p-value: 0.004858893097467426
Variable: residence_other region 2, p-value: 0.012185520464606206
Variable: residence_other region 3, p-value: 0.0001847083643645271
Variable: residence_other region 4, p-value: 0.00015904801363469297
Variable: residence_other region 5, p-value: 0.6156753113623902
Variable: residence_other region 6, p-value: 0.26283471244811873
Variable: residence_other region 7, p-value: 0.02030915820002974
Variable: residence_other region 8, p-value: 0.00012329882204459904
Variable: generatio