# Préparation des variables SES

In [None]:
library(tidyverse)
library(factoextra)

# Load the pilot survey data from the private data folder
DataPilot <- readRDS(
  file = "../../_PrivateFolder_datagotchi_federal_2025/data/clustering/can2025/01_pilot_2025.rds"
)

# Run the variable-definition script.
# NOTE(review): `variables_ses` used below is presumably defined there - confirm.
source(file = "01-1_variables.R")

# Keep only the SES columns listed in `variables_ses`
df_ses <- select(DataPilot, all_of(variables_ses))

In [None]:
# Print a frequency table for every SES variable except the "id" column
for (var_name in variables_ses) {
  if (var_name == "id") {
    next
  }
  cat("Table pour la variable:", var_name, "\n")
  print(table(df_ses[[var_name]]))
  cat("\n\n")
}

### Transformations

Les variables d'âge, d'éducation et de revenu (income) doivent être traitées comme des variables ordinales; elles sont recodées ci-dessous sur l'échelle 0 / 0,5 / 1.

In [None]:
# Recode the 5-year age groups into a three-level ordinal score:
#   18-34 -> 0, 35-54 -> 0.5, 55 and over -> 1.
# Any value not listed (including missing values) becomes NA.
df_ses <- mutate(
  df_ses,
  ses_age = case_when(
    ses_ageGroup5Years %in% c("18_19", "20_24", "25_29", "30_34") ~ 0,
    ses_ageGroup5Years %in% c("35_39", "40_44", "45_49", "50_54") ~ 0.5,
    ses_ageGroup5Years %in% c(
      "55_59", "60_64", "65_69", "70_74", "75_79",
      "80_84", "85_89", "90_94", "95_99", "100+"
    ) ~ 1,
    TRUE ~ NA_real_
  )
)

# Drop the source age columns now that the ordinal score exists
df_ses <- select(df_ses, -c(ses_ageGroup5Years, ses_age_4Cat))


In [None]:
# Recode education into an ordinal score
#--------------------------------------------
# "educBHS" -> 0, "educPostHS" -> 0.5, any other observed category -> 1.
# Missing values stay NA: inside case_when() a comparison against NA is never
# TRUE, so without the explicit is.na() guard a respondent with missing
# education would fall through to the catch-all branch and be coded as the
# highest level.

df_ses <- df_ses %>%
  mutate(
    ses_educ = case_when(
      is.na(ses_educ_3Cat) ~ NA_real_,
      ses_educ_3Cat == "educBHS" ~ 0,
      ses_educ_3Cat == "educPostHS" ~ 0.5,
      TRUE ~ 1
    )
  ) %>%
  # The categorical source column is no longer needed
  select(-ses_educ_3Cat)


In [None]:
# Recode income into an ordinal score
#----------------------------------------------
# "Low" -> 0, "Mid" -> 0.5, any other observed category -> 1.
# Missing values stay NA: inside case_when() a comparison against NA is never
# TRUE, so without the explicit is.na() guard a respondent with missing income
# would fall through to the catch-all branch and be coded as the highest level.

df_ses <- df_ses %>%
  mutate(
    ses_income = case_when(
      is.na(ses_income3Cat) ~ NA_real_,
      ses_income3Cat == "Low" ~ 0,
      ses_income3Cat == "Mid" ~ 0.5,
      TRUE ~ 1
    )
  ) %>%
  # The categorical source column is no longer needed
  select(-ses_income3Cat)


In [None]:
# Build one 0/1 indicator per language category
#-----------------------------------------
# The comparison yields TRUE/FALSE/NA, so as.numeric() gives 1 for a match,
# 0 for any other non-missing value, and NA where ses_language is missing.

df_ses <- df_ses %>%
  mutate(
    ses_languageEnglish = as.numeric(ses_language == "english"),
    ses_languageFrench = as.numeric(ses_language == "french"),
    ses_languageOther = as.numeric(ses_language == "other")
  )

# Frequency check for each new indicator
table(df_ses[["ses_languageEnglish"]])
table(df_ses[["ses_languageFrench"]])
table(df_ses[["ses_languageOther"]])

In [None]:
# Build one 0/1 indicator per ethnicity category
#-----------------------------------------
# The comparison yields TRUE/FALSE/NA, so as.numeric() gives 1 for a match,
# 0 for any other non-missing value, and NA where ses_ethnicityWB is missing.

df_ses <- df_ses %>%
  mutate(
    ses_ethnicityWhite = as.numeric(ses_ethnicityWB == "white"),
    ses_ethnicityBlack = as.numeric(ses_ethnicityWB == "black"),
    ses_ethnicityOther = as.numeric(ses_ethnicityWB == "other")
  )

# Frequency check for each new indicator
table(df_ses[["ses_ethnicityWhite"]])
table(df_ses[["ses_ethnicityBlack"]])
table(df_ses[["ses_ethnicityOther"]])


In [None]:
# Inspect the raw sexual-orientation categories before recoding them
table(df_ses[["ses_sexOrientation"]])

In [None]:
# Build 0/1 indicators for sexual orientation
#-----------------------------------------
# Hetero: 1 for "heterosexual", 0 for any other non-missing value, NA if missing.
# Queer:  1 for "gay"/"bisexual"/"other", 0 for "heterosexual", NA for anything
#         else (case_when() returns NA when no condition matches).

df_ses <- df_ses %>%
  mutate(
    ses_sexOrientationHetero = as.numeric(ses_sexOrientation == "heterosexual"),
    ses_sexOrientationQueer = case_when(
      ses_sexOrientation %in% c("gay", "bisexual", "other") ~ 1,
      ses_sexOrientation == "heterosexual" ~ 0
    )
  )

# Frequency check for each new indicator
table(df_ses[["ses_sexOrientationHetero"]])
table(df_ses[["ses_sexOrientationQueer"]])

In [None]:
# Names of the columns checked below: the id plus the recoded SES variables
variables_ses_clust <- c(
  "id",
  # gender and age
  "ses_genderMale", "ses_age",
  # language indicators
  "ses_languageEnglish", "ses_languageFrench", "ses_languageOther",
  # education, income, immigration status
  "ses_educ", "ses_income", "ses_immigrant",
  # ethnicity indicators
  "ses_ethnicityWhite", "ses_ethnicityBlack", "ses_ethnicityOther",
  # sexual-orientation indicators
  "ses_sexOrientationHetero", "ses_sexOrientationQueer"
)

In [None]:
# Print a frequency table for every clustering variable except the "id" column
for (var_name in variables_ses_clust) {
  if (var_name == "id") {
    next
  }
  cat("Table pour la variable:", var_name, "\n")
  print(table(df_ses[[var_name]]))
  cat("\n\n")
}

In [None]:
# Save the prepared data frame (all columns of df_ses) for later steps
df_ses %>%
  saveRDS(file = "../../_PrivateFolder_datagotchi_federal_2025/data/clustering/can2025/02_pilot_2025_ses.rds")

---

# SUGGESTION : Activité de Clustering

Si souhaité, effectuer un exercice de clustering avec ce sous-ensemble de variables pour mieux comprendre les données et leurs relations.

Cet exercice est purement exploratoire et sert à voir si les données peuvent être utilisées pour faire du clustering.