In [12]:
# Library ----------------------------------------------------------------
library(dplyr)
library(tidyr)
library(ggplot2)
library(sondr)
library(clessnize)

# 1. Load data and clustering results ------------------------------------

# k-means result saved as RDS
kmeans_result <- readRDS(
  file = "../../../_PrivateFolder_datagotchi_federal_2025/data/clustering/qc2022/03_pilot1_kmeans.rds"
)

# Per-cluster description table (CSV); reshaped and plotted further down
cluster_means <- read.csv(
  file = "../../../_PrivateFolder_datagotchi_federal_2025/data/clustering/qc2022/03_pilot1_clusters_desc.csv"
)

# Pilot 1 data (the `cluster_name` column is read from it further down)
df_pilot1_2022 <- readRDS(
  file = "../../../_PrivateFolder_datagotchi_federal_2025/data/clustering/qc2022/03_pilot1_2022.rds"
)

# Cleaned pilot 1 data from which the issue_* columns are taken later on
df_pilot1_2022_with_issues <- readRDS(
  file = "../../../_PrivateFolder_datagotchi_federal_2025/data/clustering/qc2022/datagotchiQuebec2022_pilote1Clean.rds"
)


In [13]:
 
# Cluster visualization ---------------------------------------------------

# Long format: every column except `cluster` becomes a (variable, mean) pair
df_long <- pivot_longer(
  cluster_means,
  cols = !cluster,
  names_to = "variable",
  values_to = "mean"
)

# Dodged bar chart of each variable's mean, one bar per cluster
ggplot(df_long) +
  aes(x = variable, y = mean, fill = as.factor(cluster)) +
  geom_col(position = "dodge") +
  labs(
    title = "Comparaison des moyennes par cluster",
    x = "Variable",
    y = "Moyenne",
    fill = "Cluster"
  ) +
  theme_minimal() +
  # Vertical x labels so the variable names do not overlap
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Save the plot built just above as a PDF
ggsave(
  filename = "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_barplot.pdf",
  plot = last_plot(),
  width = 16,
  height = 12
)


In [14]:
# Heatmap of the cluster means: one tile per (cluster, variable) pair,
# filled on a diverging blue / white / red scale
ggplot(data = df_long, mapping = aes(x = cluster, y = variable)) +
  geom_tile(aes(fill = mean)) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
  labs(
    title = "Heatmap des moyennes par cluster",
    x = "Cluster",
    y = "Variable",
    fill = "Moyenne"
  ) +
  theme_minimal()

# Save the plot built just above as a PDF
ggsave(
  filename = "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_heatmap.pdf",
  plot = last_plot(),
  width = 16,
  height = 12
)

In [15]:
# Bar chart of the means without the SES variables

# Drop every column whose name starts with "ses_", plus the other columns
# listed explicitly below
df_filtered <- select(
  cluster_means,
  !c(matches("^ses_"), female, male, educ, age, langFr, langEn, immigrant)
)

# Long format: every remaining column except `cluster` becomes a
# (variable, mean) pair
df_long_filtered <- pivot_longer(
  df_filtered,
  cols = !cluster,
  names_to = "variable",
  values_to = "mean"
)

# Dodged bar chart of each remaining variable's mean, one bar per cluster
ggplot(df_long_filtered) +
  aes(x = variable, y = mean, fill = as.factor(cluster)) +
  geom_col(position = "dodge") +
  labs(
    title = "Comparaison des moyennes par cluster (sans les ses_)",
    x = "Variable",
    y = "Moyenne",
    fill = "Cluster"
  ) +
  theme_minimal() +
  # Vertical x labels so the variable names do not overlap
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Save the plot built just above as a PDF
ggsave(
  filename = "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_barplotFiltered.pdf",
  plot = last_plot(),
  width = 16,
  height = 12
)

In [16]:
# Heatmap without the SES variables

# Same heatmap as for the full table, but drawn from `df_long_filtered`.
# The title carries the "(sans les ses_)" suffix, like the filtered bar chart,
# so this PDF can be told apart from the unfiltered heatmap.
ggplot(df_long_filtered, aes(x = cluster, y = variable, fill = mean)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
  theme_minimal() +
  labs(
    title = "Heatmap des moyennes par cluster (sans les ses_)",
    x = "Cluster",
    y = "Variable",
    fill = "Moyenne"
  )

# Save the plot built just above as a PDF
ggsave(
  "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022/clusterMeans_heatmapFiltered.pdf",
  width = 16,
  height = 12
)

In [21]:
# Frequency table: number of respondents per cluster name
table_clusters <- table(df_pilot1_2022$cluster_name)

# Share of each cluster, expressed as a percentage of all counted respondents
proportions_clusters <- (table_clusters / sum(table_clusters)) * 100

# Display both results
table_clusters        # counts per cluster
proportions_clusters  # percentages per cluster



  Charlie   Jacques  Jean-Guy     Julie     Karim   Mélanie 
 9.266667 11.000000 25.933333 33.466667  8.266667 12.066667 

In [None]:
# Keep only the id and the issue_* columns
Df_pilot1_issues <- select(
  df_pilot1_2022_with_issues,
  id,
  starts_with("issue_")
)

# Attach the issue columns to the pilot data used so far, matching rows on id
Df_validation <- merge(x = df_pilot1_2022, y = Df_pilot1_issues, by = "id")

In [None]:
# Plot, for one issue variable, the distribution of answers within each
# cluster (one facet per cluster) and save it as a PDF. Written as a function
# so that the 12 issue graphs do not have to be built one by one.
#
# Arguments:
#   data            Data frame holding `cluster_name` and the issue column.
#   issue_variable  Issue column, captured with ensym(); the driver loop
#                   injects it with `!!sym(name)`.
#   output_path     Directory in which the PDF is written.
#
# Returns the value of ggsave().
generate_graph <- function(data, issue_variable, output_path) {

  # Capture the column as a symbol and compute its text label once
  issue_sym <- ensym(issue_variable)
  issue_label <- as_label(issue_sym)

  # Count respondents per (cluster, answer), then divide by the cluster total
  # to get the share of each answer within its cluster
  proportions_by_cluster <- data |>
    group_by(cluster_name, !!issue_sym) |>
    summarise(nombre_de_repondants = n(), .groups = "drop") |>
    group_by(cluster_name) |>
    mutate(
      nombre_total_de_repondants = sum(nombre_de_repondants),
      proportion = nombre_de_repondants / nombre_total_de_repondants
    ) |>
    ungroup()

  # One panel per cluster, bars coloured by cluster
  graph <- ggplot(
    proportions_by_cluster,
    aes(x = !!issue_sym, y = proportion, fill = factor(cluster_name))
  ) +
    geom_col() +
    facet_wrap(~ cluster_name) +
    labs(
      x = issue_label,  # the variable name is shown as the axis label
      y = "Proportion",
      fill = "Cluster",
      title = paste("Distribution of", issue_label, "by Cluster")
    ) +
    theme_clean_light()

  # Save the graph
  ggsave(
    filename = file.path(
      output_path,
      paste0("validationCluster_", issue_label, ".pdf")
    ),
    plot = graph,
    width = 16,
    height = 12
  )
}

In [None]:
# Prepare the variables for the loop

# Issue variables to plot
variables_issues <- c(
  "issue_systemicRacismExistQc",
  "issue_proImmigration",
  "issue_reduceDeficit",
  "issue_proPrivateHealth",
  "issue_protectFrench",
  "issue_proQcIndependance",
  "issue_proTeachersReligiousSymbols",
  "issue_proStrictEnviroRules",
  "issue_QcTooPolCorrect",
  "issue_abortionMoreAccessibleQc",
  "issue_begin3eLienImportant",
  "issue_respectGHGReductionImportant"
)

# Directory in which the graphs are saved
output_path <- "../../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/qc_2022"

# Generate one graph per issue. The loop sits after the definitions of
# `variables_issues` and `output_path` so the cells run top to bottom;
# invisible() keeps the list returned by lapply() from being printed.
invisible(lapply(variables_issues, function(issue) {
  generate_graph(Df_validation, !!sym(issue), output_path)
}))

In [None]:
 
# Overall distribution per issue, for comparison with the per-cluster graphs ----

# Plot, for one issue variable, the share of each answer among all rows of
# `data` and save it as a PDF.
#
# Arguments:
#   data            Data frame holding the issue column.
#   issue_variable  Issue column, captured with ensym(); the driver loop
#                   injects it with `!!sym(name)`.
#   output_path     Directory in which the PDF is written.
#
# Returns the value of ggsave().
generate_graph_all <- function(data, issue_variable, output_path) {

  # Capture the column as a symbol and compute its text label once
  issue_sym <- ensym(issue_variable)
  issue_label <- as_label(issue_sym)

  # Count respondents per answer, then divide by the overall total
  overall_proportions <- data |>
    group_by(!!issue_sym) |>
    summarise(nombre_de_repondants = n(), .groups = "drop") |>
    mutate(
      nombre_total_de_repondants = sum(nombre_de_repondants),
      proportion = nombre_de_repondants / nombre_total_de_repondants
    )

  # One bar per answer; the fill repeats the x axis, so the legend is hidden
  graph <- ggplot(
    overall_proportions,
    aes(x = !!issue_sym, y = proportion, fill = factor(!!issue_sym))
  ) +
    geom_col(show.legend = FALSE) +
    labs(
      x = issue_label,  # the variable name is shown as the axis label
      y = "Proportion",
      title = paste("Distribution of", issue_label, "among all respondents")
    ) +
    theme_clean_light()

  # Save the graph
  ggsave(
    filename = file.path(
      output_path,
      paste0("distribution_all_", issue_label, ".pdf")
    ),
    plot = graph,
    width = 16,
    height = 12
  )
}

# Generate one graph per issue, using `variables_issues`, `output_path` and
# `Df_validation` defined in earlier cells; invisible() keeps the list
# returned by lapply() from being printed.
invisible(lapply(variables_issues, function(issue) {
  generate_graph_all(Df_validation, !!sym(issue), output_path)
}))