In [10]:
# Library
library(ggplot2)
library(dplyr)

# Read the two inputs: the cleaned pilot survey data and the data frame that
# was used for clustering.
# NOTE(review): the first file lives under a 2024 shared folder and the second
# under a 2025 private folder -- confirm both point at the intended pilot wave.
dfLifestyle <- readRDS(
  file = "../../_SharedFolder_datagotchi_federal_2024/data/pilote/dataClean/datagotchi2025_canada_pilotClustering_20250310.rds"
)
dfUsedForClustering <- readRDS(
  file = "../../_PrivateFolder_datagotchi_federal_2025/data/clustering/can2025/03_pilot_2025.rds"
)

In [19]:
# Combine cluster assignments with the lifestyle / SES / DV answers, matching
# respondents on `id`.

# Columns kept from dfLifestyle: respondent id, lifestyle items, attitude and
# vote choice (dv_*), and socio-economic status (ses_*).
dfLifestyle_selected <- select(
  dfLifestyle,
  id,
  lifestyle_exercise,
  lifestyle_goFishingFreq_bin,
  lifestyle_goHuntingFreq_bin,
  lifestyle_goMuseumsFreq_bin,
  lifestyle_motorizedActFreq_bin,
  lifestyle_unmotorizedActFreq_bin,
  lifestyle_volunteeringFreq_bin,
  lifestyle_typeTransport,
  lifestyle_consClothes,
  lifestyle_eatMeatFreq,
  lifestyle_fridgeVegetalMilk,
  lifestyle_fridgeTofuTempeh,
  lifestyle_fridgeOrganicVeggies,
  lifestyle_consCoffee,
  lifestyle_ownPet,
  lifestyle_favAlcool,
  lifestyle_hasTattoos,
  dv_attitudeLeftvsRight,
  ses_gender,
  ses_age_4Cat,
  ses_educ_3Cat,
  ses_income3Cat,
  ses_dwelling,
  dv_voteChoice
)

# Columns kept from dfUsedForClustering: respondent id, the cluster name, and
# every other column whose name starts with "cluster_".
dfClusters_selected <- select(
  dfUsedForClustering,
  id,
  cluster_name,
  starts_with("cluster_")
)

# Inner join: only ids present in both tables are kept.
dfValidation <- inner_join(
  dfClusters_selected,
  dfLifestyle_selected,
  by = "id"
)

# Show the column names of the merged data.
names(dfValidation)

In [20]:
#' Plot and save the per-cluster distribution of one variable
#'
#' Counts respondents by `cluster_name` and by the levels of
#' `lifestyle_variable`, converts the counts to within-cluster proportions,
#' draws one bar-chart facet per cluster and saves the figure as
#' `<output_path>/validationCluster_<lifestyle_variable>.pdf`.
#'
#' @param data Data frame containing a `cluster_name` column and the column
#'   named by `lifestyle_variable`.
#' @param lifestyle_variable Single string: name of the column to plot.
#' @param output_path Directory that receives the PDF; created (recursively)
#'   when it does not exist yet.
#' @return The result of the `ggsave()` call.
generate_graph <- function(data, lifestyle_variable, output_path) {

  # Fail early, with a readable message, instead of deep inside dplyr
  if (!is.character(lifestyle_variable) || length(lifestyle_variable) != 1L) {
    stop("`lifestyle_variable` must be a single column name.", call. = FALSE)
  }
  missing_columns <- setdiff(c("cluster_name", lifestyle_variable), names(data))
  if (length(missing_columns) > 0) {
    stop(
      "Column(s) not found in `data`: ",
      paste(missing_columns, collapse = ", "),
      call. = FALSE
    )
  }

  # Make sure the target folder exists so saving does not fail (or prompt)
  # on a fresh checkout; a no-op when the folder is already there
  dir.create(output_path, recursive = TRUE, showWarnings = FALSE)

  # Turn the column name into a symbol usable by dplyr and ggplot2
  lifestyle_sym <- sym(lifestyle_variable)

  # Share of each answer within every cluster
  proportions <- data %>%
    group_by(cluster_name, !!lifestyle_sym) %>%
    summarise(nombre_de_repondants = n(), .groups = "drop") %>%
    group_by(cluster_name) %>%
    mutate(
      nombre_total_de_repondants = sum(nombre_de_repondants),
      proportion = nombre_de_repondants / nombre_total_de_repondants
    ) %>%
    ungroup()

  # One facet per cluster, bars coloured by cluster
  graph <- ggplot(
    proportions,
    aes(x = !!lifestyle_sym, y = proportion, fill = factor(cluster_name))
  ) +
    geom_col() +
    facet_wrap(~ cluster_name) +
    labs(
      x = lifestyle_variable,
      y = "Proportion",
      fill = "Cluster",
      title = paste("Distribution of", lifestyle_variable, "by Cluster")
    ) +
    theme_minimal() +
    # Tilt the x-axis labels by 45 degrees so long category names stay legible
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Write the figure to disk
  ggsave(
    filename = file.path(
      output_path,
      paste0("validationCluster_", lifestyle_variable, ".pdf")
    ),
    plot = graph,
    width = 16,
    height = 12
  )
}


In [21]:
# Collect the names of every column to plot, grouped by prefix:
# lifestyle items, socio-economic variables (ses_) and dependent
# variables (dv_).
lifestyle_list <- grep("^lifestyle_", colnames(dfValidation), value = TRUE)
ses_list <- grep("^ses_", colnames(dfValidation), value = TRUE)
dv_list <- grep("^dv_", colnames(dfValidation), value = TRUE)

# Plotting order: lifestyle first, then SES, then DV
variable_list <- c(lifestyle_list, ses_list, dv_list)

In [None]:
# Generate and save one graph per variable in `variable_list`

# Folder in which the graphs are saved
output_path <- "../../_PrivateFolder_datagotchi_federal_2025/graph/clustering/can_2025/lifestyle"

# Call generate_graph() once per variable name; the list of results is
# returned by lapply()
lapply(variable_list, function(variable_name) {
  generate_graph(
    data = dfValidation,
    lifestyle_variable = variable_name,
    output_path = output_path
  )
})
