# Final project | Cristiane Carneiro

Validating and describing differences across clusters

## Import libraries

In [1]:
# data
# ==============================================================================
import numpy as np
import pandas as pd

# charts
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d


# pre-processing 
# ==============================================================================
from sklearn.preprocessing import StandardScaler

#from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# clusters quality 
# ==============================================================================
from ds_utils.unsupervised import plot_cluster_cardinality, plot_cluster_magnitude, plot_magnitude_vs_cardinality
from scipy.spatial.distance import euclidean
from scipy.stats import f_oneway

# warnings
# ==============================================================================
import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

# display
# ==============================================================================
# Render all columns of wide DataFrames instead of truncating them.
pd.set_option('display.max_columns', None)

## Import datasets

In [2]:
# Load the client feature table from data_men.csv.
original_data = pd.read_csv(filepath_or_buffer='../data/data_men.csv')

In [3]:
# Load the cluster assignment for each client_id.
labels = pd.read_csv(filepath_or_buffer='../data/clusters_men')

In [4]:
# Preview the first two feature rows.
original_data.iloc[:2]

Unnamed: 0,client_id,mailing,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other africa,residence_other asia,residence_other central america,residence_other europe,residence_other middle east,residence_other north america,residence_other oceania,residence_other south america,generation_gen alpha,generation_gen x,generation_gen z,generation_old millennials,generation_silent,generation_young millennials,bestchannel_mostly store,bestchannel_mostly web,bestchannel_store,total_sum,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_prop,color_earth_prop,color_jewel_prop,color_metallic_prop,color_neutral_prop,color_other_prop,color_pastel_prop,color_vivid_prop,color_warm_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop
0,124710,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,12,0.0,0.0,0.166667,0.25,0.083333,0.0,0.166667,0.333333,0.166667,0.0,0.0,0.0,0.5,0.083333,0.0,0.0,0.25,0.0,0.166667,0.0,0.833333
1,382440,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,5,0.0,0.0,0.4,0.4,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.6,0.2,0.0,0.0,0.0,0.0,0.0,0.8,0.2


In [5]:
# Preview the first two cluster assignments.
labels.iloc[:2]

Unnamed: 0,client_id,cluster
0,124710,0
1,382440,0


In [6]:
# Attach each client's cluster label to its feature row
# (default inner join on the shared 'client_id' key).
data = original_data.merge(labels, on='client_id')
data.head(5)

Unnamed: 0,client_id,mailing,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other africa,residence_other asia,residence_other central america,residence_other europe,residence_other middle east,residence_other north america,residence_other oceania,residence_other south america,generation_gen alpha,generation_gen x,generation_gen z,generation_old millennials,generation_silent,generation_young millennials,bestchannel_mostly store,bestchannel_mostly web,bestchannel_store,total_sum,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_prop,color_earth_prop,color_jewel_prop,color_metallic_prop,color_neutral_prop,color_other_prop,color_pastel_prop,color_vivid_prop,color_warm_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop,cluster
0,124710,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,12,0.0,0.0,0.166667,0.25,0.083333,0.0,0.166667,0.333333,0.166667,0.0,0.0,0.0,0.5,0.083333,0.0,0.0,0.25,0.0,0.166667,0.0,0.833333,0
1,382440,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,5,0.0,0.0,0.4,0.4,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.6,0.2,0.0,0.0,0.0,0.0,0.0,0.8,0.2,0
2,1311915,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,13,0.0,0.153846,0.307692,0.230769,0.153846,0.0,0.0,0.153846,0.230769,0.0,0.0,0.153846,0.461538,0.076923,0.0,0.0,0.076923,0.538462,0.076923,0.230769,0.153846,0
3,1958756,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,55,0.0,0.036364,0.109091,0.109091,0.454545,0.090909,0.018182,0.181818,0.018182,0.018182,0.0,0.018182,0.527273,0.254545,0.0,0.0,0.163636,0.127273,0.363636,0.290909,0.218182,1
4,1958794,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,27,0.0,0.111111,0.37037,0.148148,0.185185,0.037037,0.037037,0.111111,0.111111,0.0,0.0,0.074074,0.296296,0.185185,0.0,0.0,0.333333,0.333333,0.333333,0.222222,0.111111,2


In [7]:
# Identifier and label columns; these are not features and are left out of
# the per-cluster averages.
columns_to_exclude = [
    'client_id',
    'cluster',
]

In [8]:
# Keep every column of `data` that is not in the exclusion list, preserving
# the original column order.
columns_to_average = data.columns[~data.columns.isin(columns_to_exclude)].tolist()

In [9]:
# Mean of every feature within each cluster; 'cluster' is turned back into a
# regular column so the result has one row per cluster.
averages = (
    data
    .groupby('cluster')[columns_to_average]
    .agg('mean')
    .reset_index()
)

In [10]:
# Display the per-cluster feature means (one row per cluster).
averages

Unnamed: 0,cluster,mailing,residence_country 2,residence_country 3,residence_country 4,residence_country 5,residence_country 6,residence_country 7,residence_other africa,residence_other asia,residence_other central america,residence_other europe,residence_other middle east,residence_other north america,residence_other oceania,residence_other south america,generation_gen alpha,generation_gen x,generation_gen z,generation_old millennials,generation_silent,generation_young millennials,bestchannel_mostly store,bestchannel_mostly web,bestchannel_store,total_sum,category_children clothes_prop,category_handbags_prop,category_men clothes_prop,category_men shoes_prop,category_other_prop,category_soft_prop,category_women clothes_prop,category_women shoes_prop,color_cool_prop,color_earth_prop,color_jewel_prop,color_metallic_prop,color_neutral_prop,color_other_prop,color_pastel_prop,color_vivid_prop,color_warm_prop,season_autumn_prop,season_spring_prop,season_summer_prop,season_winter_prop
0,0,0.922228,0.100386,0.062328,0.022614,0.014341,0.011031,0.030336,0.012135,0.020408,0.0,0.052951,0.034749,0.02096,0.003309,0.00717,0.0,0.422504,0.004413,0.39989,0.002758,0.072256,0.036404,0.000552,0.963045,8.380585,0.015328,0.108821,0.235585,0.136359,0.243476,0.030566,0.124271,0.105594,0.099096,0.003052,0.004231,0.01945,0.409312,0.223927,0.011059,0.009747,0.220128,0.268355,0.248053,0.248126,0.235466
1,1,0.966746,0.123515,0.163895,0.021378,0.028504,0.049881,0.04038,0.014252,0.042755,0.004751,0.109264,0.028504,0.038005,0.021378,0.021378,0.004751,0.432304,0.011876,0.327791,0.004751,0.068884,0.068884,0.0,0.931116,52.857482,0.040608,0.045254,0.330949,0.120536,0.232497,0.051317,0.098582,0.080257,0.130167,0.004925,0.003179,0.012871,0.368245,0.206618,0.013552,0.006026,0.254417,0.253309,0.241338,0.240271,0.265082
2,2,0.963736,0.110989,0.149451,0.025275,0.014286,0.010989,0.032967,0.012088,0.025275,0.001099,0.064835,0.027473,0.030769,0.007692,0.010989,0.001099,0.430769,0.007692,0.361538,0.003297,0.083516,0.054945,0.0,0.943956,26.475824,0.02628,0.063801,0.292594,0.128285,0.241743,0.038386,0.106702,0.102209,0.117539,0.005762,0.002807,0.014094,0.381656,0.215498,0.012652,0.007054,0.242939,0.249714,0.242628,0.242822,0.264835


In [11]:
# One-way ANOVA per feature: is the feature's mean the same in every cluster?
#
# NOTE: one test is run per feature and no multiple-comparison correction is
# applied, so some of the variables flagged below may be false positives.
ALPHA = 0.05  # significance threshold used to flag a variable

p_values = {}

# Test only the feature columns. The 'client_id' identifier and the 'cluster'
# label are both excluded: running the ANOVA on 'cluster' itself compares
# constant groups, which is degenerate and would always be flagged as
# significant.
for col in columns_to_average:
    # One sample per cluster label actually present in the data (no
    # hard-coded label range).
    clusters = [values for _, values in data.groupby('cluster')[col]]
    _, p_value = f_oneway(*clusters)
    p_values[col] = p_value

# Display all p-values for variables
print("All p-values for variables:")
for var, p_val in p_values.items():
    print(f"Variable: {var}, p-value: {p_val}")

# Collect variables with p-value below the threshold into a list
significant_variables = [var for var, p_val in p_values.items() if p_val < ALPHA]

# Display variables with p-value below the threshold
print(f"\nVariables with p-value < {ALPHA}:")
print(significant_variables)

All p-values for variables:
Variable: mailing, p-value: 4.26036686691053e-06
Variable: residence_country 2, p-value: 0.33563591768348705
Variable: residence_country 3, p-value: 1.9011602140026698e-16
Variable: residence_country 4, p-value: 0.8774465358429038
Variable: residence_country 5, p-value: 0.10055666557228939
Variable: residence_country 6, p-value: 2.963396862316512e-08
Variable: residence_country 7, p-value: 0.5745616240408469
Variable: residence_other africa, p-value: 0.9345545097844945
Variable: residence_other asia, p-value: 0.02922981810774876
Variable: residence_other central america, p-value: 0.01725438328312699
Variable: residence_other europe, p-value: 0.00011408288086662001
Variable: residence_other middle east, p-value: 0.5452487208970085
Variable: residence_other north america, p-value: 0.08147054739872762
Variable: residence_other oceania, p-value: 0.00030851977802912587
Variable: residence_other south america, p-value: 0.03127457927735483
Variable: generation_gen 