In [49]:
# Library ----------------------------------------------------------------
library(dplyr)
library(tidyr)
library(ggplot2)

# 1. Load data and clustering results ---------------------------
# Per-cluster table read from CSV; it is reshaped with `cluster` as the id
# column for the plots further down.
cluster_means <- read.csv(
  file = "../../../_PrivateFolder_datagotchi_federal_2025/data/clustering/qc2022/03_pilot1_clusters_desc.csv"
)

# Pilot 1 (2022) data; its `cluster_name` column is tabulated further down.
df_pilot1_2022 <- readRDS(
  file = "../../../_PrivateFolder_datagotchi_federal_2025/data/clustering/qc2022/03_pilot1_2022.rds"
)

# Presumably the fitted k-means object (going by the file name) -- it is not
# used in the visible part of this script.
kmeans_result <- readRDS(
  file = "../../../_PrivateFolder_datagotchi_federal_2025/data/clustering/qc2022/03_pilot1_kmeans.rds"
)


In [50]:
# Reshape the per-cluster table to long format: one row per
# (cluster, variable) pair, with the value stored in `mean`.
df_long <- pivot_longer(
  cluster_means,
  cols = !cluster,
  names_to = "variable",
  values_to = "mean"
)

# Dodged bar chart: variables on the x axis, one bar per cluster.
ggplot(
  data = df_long,
  mapping = aes(x = variable, y = mean, fill = as.factor(cluster))
) +
  geom_col(position = "dodge") +
  theme_minimal() +
  # Vertical x labels so the variable names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Comparaison des moyennes par cluster",
    x = "Variable",
    y = "Moyenne",
    fill = "Cluster"
  )

# No `plot` argument is given, so ggsave() writes the most recent plot,
# i.e. the bar chart built just above.
ggsave(
  filename = "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_barplot.pdf",
  width = 16,
  height = 12
)


In [51]:
# Heatmap of the per-cluster means: one tile per (cluster, variable) pair.
# `cluster` is mapped through as.factor(), as in the bar chart, so the x axis
# is discrete (one column and one tick per cluster) even if the CSV stores
# the cluster ids as numbers.
# NOTE(review): scale_fill_gradient2() centres the white midpoint at 0 by
# default; if every mean is non-negative the blue half of the scale is never
# used -- confirm whether a different `midpoint` is wanted.
ggplot(df_long, aes(x = as.factor(cluster), y = variable, fill = mean)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
  theme_minimal() +
  labs(
    title = "Heatmap des moyennes par cluster",
    x = "Cluster",
    y = "Variable",
    fill = "Moyenne"
  )

# No `plot` argument is given, so ggsave() writes the most recent plot,
# i.e. the heatmap built just above.
ggsave(
  "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_heatmap.pdf",
  width = 16,
  height = 12
)

In [52]:
# Bar chart of the means without the SES variables

# Drop every column whose name starts with "ses_" plus the listed
# socio-demographic columns; the remaining columns keep their order.
df_filtered <- select(
  cluster_means,
  !c(
    matches("^ses_"),
    female, male, educ, age, langFr, langEn, immigrant
  )
)

# Long format: one row per (cluster, variable) pair, value in `mean`.
df_long_filtered <- pivot_longer(
  df_filtered,
  cols = !cluster,
  names_to = "variable",
  values_to = "mean"
)

# Dodged bar chart: variables on the x axis, one bar per cluster.
ggplot(
  data = df_long_filtered,
  mapping = aes(x = variable, y = mean, fill = as.factor(cluster))
) +
  geom_col(position = "dodge") +
  theme_minimal() +
  # Vertical x labels so the variable names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Comparaison des moyennes par cluster (sans les ses_)",
    x = "Variable",
    y = "Moyenne",
    fill = "Cluster"
  )

# No `plot` argument is given, so ggsave() writes the most recent plot,
# i.e. the filtered bar chart built just above.
ggsave(
  filename = "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_barplotFiltered.pdf",
  width = 16,
  height = 12
)

In [53]:
# Heatmap without the SES variables

# One tile per (cluster, variable) pair of the filtered long table.
# The title carries the same "(sans les ses_)" suffix as the filtered bar
# chart so this PDF can be told apart from the unfiltered heatmap.
# `cluster` is mapped through as.factor(), as in the bar charts, so the x axis
# is discrete even if the CSV stores the cluster ids as numbers.
# NOTE(review): scale_fill_gradient2() centres the white midpoint at 0 by
# default; if every mean is non-negative the blue half of the scale is never
# used -- confirm whether a different `midpoint` is wanted.
ggplot(
  df_long_filtered,
  aes(x = as.factor(cluster), y = variable, fill = mean)
) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
  theme_minimal() +
  labs(
    title = "Heatmap des moyennes par cluster (sans les ses_)",
    x = "Cluster",
    y = "Variable",
    fill = "Moyenne"
  )

# No `plot` argument is given, so ggsave() writes the most recent plot,
# i.e. the filtered heatmap built just above.
ggsave(
  "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_heatmapFiltered.pdf",
  width = 16,
  height = 12
)

In [None]:
# Mean of selected SES variables per cluster -- value coding used to read them
# Age per cluster (0 = 34 and under, 0.5 = 35-54, 1 = 55+)

# Income per cluster (0.14 = less than 30k, 0.28 = 31-60k, 0.43 = 61-90k,
# 0.57 = 91-110k, 0.71 = 111-150k, 0.86 = 151-200k, 1 = 200k+)

# Education (0 = BHS, 0.5 = college, 1 = university)


In [54]:
# Number of people in each cluster (counts per value of `cluster_name`)

table(df_pilot1_2022[["cluster_name"]])



 Charlie  Jacques Jean-Guy    Julie 
     139      165      389      502 
   Karim  Mélanie 
     124      181 

In [45]:
# Issues ("enjeux") data, pilot 1, qc_2022
# NOTE(review): this file is read from `_SharedFolder_datagotchi_federal_2024`
# and a `qc_2022` directory, whereas the other inputs in this script come from
# `_PrivateFolder_datagotchi_federal_2025` and `qc2022` -- confirm the path.

df_pilot1_2022_enjeux <- read.csv(
  file = "../../../_SharedFolder_datagotchi_federal_2024/data/clustering/qc_2022/pilote-1-quebec-prov-2022.csv"
)