In [21]:
# Library ----------------------------------------------------------------
library(dplyr)
library(tidyr)
library(ggplot2)

# 1. Load Data and Clustering Results ---------------------------
# Every input below sits in the same private clustering folder, so the
# paths are assembled from one shared root.
clustering_dir <- file.path(
  "../../../_PrivateFolder_datagotchi_federal_2025",
  "data", "clustering", "qc2022"
)

# Saved k-means result for pilot 1.
kmeans_result <- readRDS(
  file = file.path(clustering_dir, "03_pilot1_kmeans.rds")
)

# Pilot 1 data used for the clustering (carries `cluster_name`, used below).
df_pilot1_2022 <- readRDS(
  file = file.path(clustering_dir, "03_pilot1_2022.rds")
)

# Cluster description table (has a `cluster` column plus one column per
# variable; reshaped and plotted below).
cluster_means <- read.csv(
  file = file.path(clustering_dir, "03_pilot1_clusters_desc.csv")
)

# Cleaned pilot 1 data that still contains the `issue_*` columns.
df_pilot1_2022_with_issues <- readRDS(
  file = file.path(clustering_dir, "datagotchiQuebec2022_pilote1Clean.rds")
)


In [22]:
# Reshape the cluster description table to long format: one row per
# (cluster, variable) pair, with the value stored in `mean`.
df_long <- cluster_means %>%
  pivot_longer(cols = -cluster, names_to = "variable", values_to = "mean")

# Dodged bar chart comparing each variable's value across clusters.
# geom_col() is the idiomatic form of geom_bar(stat = "identity").
cluster_means_barplot <- ggplot(
  df_long,
  aes(x = variable, y = mean, fill = as.factor(cluster))
) +
  geom_col(position = "dodge") +
  theme_minimal() +
  # Rotate the variable names so they stay readable on the x axis.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Comparaison des moyennes par cluster",
    x = "Variable",
    y = "Moyenne",
    fill = "Cluster"
  )

# Display the figure.
cluster_means_barplot

# Pass the plot explicitly: without `plot =`, ggsave() writes whatever
# last_plot() happens to be, which can drift from the figure built here.
ggsave(
  "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_barplot.pdf",
  plot = cluster_means_barplot,
  width = 16,
  height = 12
)


In [23]:
# Heatmap of the long-format cluster table: one tile per
# (cluster, variable) pair, coloured by the value in `mean`.
cluster_means_heatmap <- ggplot(
  df_long,
  aes(x = cluster, y = variable, fill = mean)
) +
  geom_tile() +
  # NOTE(review): the diverging palette keeps its default midpoint (0). If
  # every value is non-negative the blue end is never reached -- confirm
  # whether a different midpoint is intended.
  scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
  theme_minimal() +
  labs(
    title = "Heatmap des moyennes par cluster",
    x = "Cluster",
    y = "Variable",
    fill = "Moyenne"
  )

# Display the figure.
cluster_means_heatmap

# Pass the plot explicitly: without `plot =`, ggsave() writes whatever
# last_plot() happens to be, which can drift from the figure built here.
ggsave(
  "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_heatmap.pdf",
  plot = cluster_means_heatmap,
  width = 16,
  height = 12
)

In [24]:
# Bar plot of the cluster values without the SES variables

# Drop every `ses_*` column plus the listed socio-demographic columns.
df_filtered <- cluster_means %>%
  select(
    -matches("^ses_"),
    -female, -male, -educ, -age, -langFr, -langEn, -immigrant
  )

# Long format: one row per (cluster, variable) pair, value in `mean`.
df_long_filtered <- df_filtered %>%
  pivot_longer(cols = -cluster, names_to = "variable", values_to = "mean")

# Dodged bar chart; geom_col() is the idiomatic form of
# geom_bar(stat = "identity").
cluster_means_barplot_filtered <- ggplot(
  df_long_filtered,
  aes(x = variable, y = mean, fill = as.factor(cluster))
) +
  geom_col(position = "dodge") +
  theme_minimal() +
  # Rotate the variable names so they stay readable on the x axis.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Comparaison des moyennes par cluster (sans les ses_)",
    x = "Variable",
    y = "Moyenne",
    fill = "Cluster"
  )

# Display the figure.
cluster_means_barplot_filtered

# Pass the plot explicitly: without `plot =`, ggsave() writes whatever
# last_plot() happens to be, which can drift from the figure built here.
ggsave(
  "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_barplotFiltered.pdf",
  plot = cluster_means_barplot_filtered,
  width = 16,
  height = 12
)

In [25]:
# Heatmap without the SES variables

# One tile per (cluster, variable) pair of the filtered long table,
# coloured by the value in `mean`.
cluster_means_heatmap_filtered <- ggplot(
  df_long_filtered,
  aes(x = cluster, y = variable, fill = mean)
) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
  theme_minimal() +
  labs(
    # The original reused the unfiltered heatmap's title, making the two
    # PDFs indistinguishable; the suffix matches the filtered bar plot.
    title = "Heatmap des moyennes par cluster (sans les ses_)",
    x = "Cluster",
    y = "Variable",
    fill = "Moyenne"
  )

# Display the figure.
cluster_means_heatmap_filtered

# Pass the plot explicitly: without `plot =`, ggsave() writes whatever
# last_plot() happens to be, which can drift from the figure built here.
ggsave(
  "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_heatmapFiltered.pdf",
  plot = cluster_means_heatmap_filtered,
  width = 16,
  height = 12
)

In [26]:
# Moyenne de certaines SES par cluster
# Age par cluster ( 0 = 34 et moins, 0.5 = 35-54, 1 = 55+)

# Revenu par cluster (0.14 = moins de 30k, 0.28 = 31-60k, 0.43 = 61-90k, 0.57 = 91-110k,
# 0.71 = 111-150k, 0.86 = 151-200k, 1 = 200k+)

# Education ( 0 = BHS, 0.5 = college, 1 = univ)


NULL

In [27]:
# How many people fall in each cluster
# (`[[` extracts the column by its exact name).
table(df_pilot1_2022[["cluster_name"]])



 Charlie  Jacques Jean-Guy    Julie 
     139      165      389      502 
   Karim  Mélanie 
     124      181 

In [28]:
# Drop every variable except `id` and the issue columns (`issue_*`)

Df_pilot1_issues <- select(
  df_pilot1_2022_with_issues,
  id,
  starts_with("issue_")
)

In [29]:
# Attach the issue columns to the pilot data used earlier, matching on `id`.
# The join flags are merge()'s defaults, spelled out: only ids present in
# both tables are kept, and the result is sorted by `id`.

Df_validation <- merge(
  x = df_pilot1_2022,
  y = Df_pilot1_issues,
  by = "id",
  all = FALSE,
  sort = TRUE
)

In [34]:
# Validation on the issues
#
# For each issue column, draw a histogram of the responses faceted by
# cluster, display it, and save it as a PDF. The six plots differ only in
# the column plotted and the output file name, so they are driven from one
# table instead of six copy-pasted stanzas.

# Names: issue columns in Df_validation. Values: suffix of the output file
# name (kept explicit because "systemicRacism" does not follow the column
# name).
validation_issues <- c(
  issue_systemicRacismExistQc = "systemicRacism",
  issue_proImmigration = "proImmigration",
  issue_reduceDeficit = "reduceDeficit",
  issue_proPrivateHealth = "proPrivateHealth",
  issue_protectFrench = "protectFrench",
  issue_proQcIndependance = "proQcIndependance"
)

validation_graph_dir <-
  "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022"

for (issue_col in names(validation_issues)) {
  # `.data[[issue_col]]` selects the column named by the string. It is
  # resolved when the plot is drawn, so the plot is printed and saved
  # inside the same iteration, before `issue_col` changes.
  issue_plot <- ggplot(
    Df_validation,
    aes(x = .data[[issue_col]], fill = factor(cluster_name))
  ) +
    geom_histogram(bins = 9) +
    facet_wrap(~ cluster_name) +
    labs(
      x = issue_col,
      y = "Count",
      fill = "Cluster",
      title = paste("Distribution of", issue_col, "by Cluster")
    ) +
    theme_minimal()

  # Plots are not auto-printed inside a loop.
  print(issue_plot)

  # Pass the plot explicitly rather than relying on last_plot().
  ggsave(
    file.path(
      validation_graph_dir,
      paste0("validationCluster_", validation_issues[[issue_col]], ".pdf")
    ),
    plot = issue_plot,
    width = 16,
    height = 12
  )
}


In [35]:
# Validation, continued
#
# Same figure as for the previous issues: for each issue column, a
# histogram of the responses faceted by cluster, displayed and saved as a
# PDF. Driven from a vector of column names instead of six copy-pasted
# stanzas.

more_validation_issues <- c(
  "issue_proTeachersReligiousSymbols",
  "issue_proStrictEnviroRules",
  "issue_QcTooPolCorrect",
  "issue_abortionMoreAccessibleQc",
  "issue_begin3eLienImportant",
  "issue_respectGHGReductionImportant"
)

more_validation_graph_dir <-
  "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022"

for (more_issue_col in more_validation_issues) {
  # `.data[[more_issue_col]]` selects the column named by the string. It is
  # resolved when the plot is drawn, so the plot is printed and saved
  # inside the same iteration, before `more_issue_col` changes.
  more_issue_plot <- ggplot(
    Df_validation,
    aes(x = .data[[more_issue_col]], fill = factor(cluster_name))
  ) +
    geom_histogram(bins = 9) +
    facet_wrap(~ cluster_name) +
    labs(
      x = more_issue_col,
      y = "Count",
      fill = "Cluster",
      title = paste("Distribution of", more_issue_col, "by Cluster")
    ) +
    theme_minimal()

  # Plots are not auto-printed inside a loop.
  print(more_issue_plot)

  # Output files are named after the column without its "issue_" prefix.
  # Pass the plot explicitly rather than relying on last_plot().
  ggsave(
    file.path(
      more_validation_graph_dir,
      paste0(
        "validationCluster_",
        sub("^issue_", "", more_issue_col),
        ".pdf"
      )
    ),
    plot = more_issue_plot,
    width = 16,
    height = 12
  )
}
