# Préparation des variables de ses

In [1]:
library(tidyverse)
library(factoextra)

# Load the 2025 pilot data set
DataPilot <- readRDS(
  file = "../../_PrivateFolder_datagotchi_federal_2025/data/clustering/can2025/01_pilot_2025.rds"
)

# Run the variables script
# (presumably the script that defines `variables_ses`, which is used right
# below and is not defined in this notebook -- verify against the script)
source("01-1_variables.R")

# Keep only the columns listed in `variables_ses`
df_ses <- select(DataPilot, all_of(variables_ses))

-- [1mAttaching core tidyverse packages[22m ------------------------ tidyverse 2.0.0 --
[32mv[39m [34mdplyr    [39m 1.1.4     [32mv[39m [34mreadr    [39m 2.1.5
[32mv[39m [34mforcats  [39m 1.0.0     [32mv[39m [34mstringr  [39m 1.5.1
[32mv[39m [34mggplot2  [39m 3.5.1     [32mv[39m [34mtibble   [39m 3.2.1
[32mv[39m [34mlubridate[39m 1.9.3     [32mv[39m [34mtidyr    [39m 1.3.1
[32mv[39m [34mpurrr    [39m 1.0.2     
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mi[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa



In [2]:
# Print a frequency table for every variable in `variables_ses`,
# skipping the "id" entry
for (v in variables_ses) {
  if (v == "id") {
    next
  }
  cat("Table pour la variable:", v, "\n")
  print(table(df_ses[[v]]))
  cat("\n\n")
}

Table pour la variable: ses_genderMale 

   0    1 
1490 1744 


Table pour la variable: ses_genderFemale 

   0    1 
1812 1422 


Table pour la variable: ses_gender 

    agender      female        male  non_binary       queer   trans_man 
          3        1422        1744          18           8          22 
trans_woman 
          7 


Table pour la variable: ses_ageGroup5Years 

18_19 20_24 25_29 30_34 35_39 40_44 45_49 50_54 55_59 60_64 65_69 70_74 75_79 
  155   459   308   245   223     0   180   208   197   216   189   161   150 
80_84 85_89 90_94 95_99  100+ 
   61    24     7     1     0 


Table pour la variable: ses_age_4Cat 

18_24 25_44 45_64   65+ 
  524   959   801   593 


Table pour la variable: ses_region 

         prairie british_columbia         atlantic          ontario 
             423              359              111              822 
          quebec      territories 
             606               18 


Table pour la variable: ses_language 

english  fren

### Transformations

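Les variables d'âge, d'éducation et de revenu devraient être des variables ordinales : les cellules suivantes recodent leurs indicatrices (0/1) en un score unique compris entre 0 et 1.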

In [None]:
# Collapse the age dummies into a single ordinal score
#------------------------------------------------------
# age = 0   when age34m  == 1
# age = 0.5 when age3554 == 1 (and age34m is not 1)
# age = 1   otherwise (age55p itself is never inspected)
# A missing value in a dummy that has to be consulted yields NA.

df_ses <- df_ses %>%
  mutate(
    age = case_when(
      is.na(age34m)  ~ NA_real_,
      age34m == 1    ~ 0,
      is.na(age3554) ~ NA_real_,
      age3554 == 1   ~ 0.5,
      TRUE           ~ 1
    )
  ) %>%
  # The dummies are no longer needed once the score exists
  select(-c(age34m, age3554, age55p))

In [None]:
# Collapse the education dummies into a single ordinal score
#------------------------------------------------------------
# educ = 0   when educBHS     == 1
# educ = 0.5 when educCollege == 1 (and educBHS is not 1)
# educ = 1   otherwise (educUniv itself is never inspected)
# A missing value in a dummy that has to be consulted yields NA.

df_ses <- df_ses %>%
  mutate(
    educ = case_when(
      is.na(educBHS)     ~ NA_real_,
      educBHS == 1       ~ 0,
      is.na(educCollege) ~ NA_real_,
      educCollege == 1   ~ 0.5,
      TRUE               ~ 1
    )
  ) %>%
  # The dummies are no longer needed once the score exists
  select(-c(educBHS, educCollege, educUniv))

In [None]:
# Collapse the income dummies into a single ordinal score
#---------------------------------------------------------
# Step 1 rescales every bracket dummy to rank / 7:
#   None = 0, i1to30 = 1/7, i31to60 = 2/7, i61to90 = 3/7, i91to110 = 4/7,
#   i111to150 = 5/7, i151to200 = 6/7, i201toInf = 7/7.
# Step 2 adds the rescaled dummies to obtain `ses_income`.
# Step 3 drops the dummies.
# assumes the brackets are mutually exclusive 0/1 indicators -- TODO confirm

df_ses <- df_ses %>%

  mutate(
    ses_income_None = 0,
    ses_income_i1to30    = ses_income_i1to30/7,
    ses_income_i31to60   = (1*ses_income_i31to60)/7 + ses_income_i31to60/7,
    ses_income_i61to90   = (2*ses_income_i61to90)/7 + ses_income_i61to90/7,
    ses_income_i91to110  = (3*ses_income_i91to110)/7 + ses_income_i91to110/7,
    ses_income_i111to150 = (4*ses_income_i111to150)/7 + ses_income_i111to150/7,
    ses_income_i151to200 = (5*ses_income_i151to200)/7 + ses_income_i151to200/7,
    ses_income_i201toInf = (ses_income_i201toInf)
  ) %>%
  mutate(
    # Every bracket must contribute: leaving out ses_income_i1to30 would give
    # the 1-30 bracket a score of 0, indistinguishable from "None".
    ses_income = ses_income_None +
      ses_income_i1to30 +
      ses_income_i31to60 +
      ses_income_i61to90 +
      ses_income_i91to110 +
      ses_income_i111to150 +
      ses_income_i151to200 +
      ses_income_i201toInf
  ) %>%
  # Drop each dummy exactly once, including the rescaled ses_income_i1to30
  select(
    -ses_income_None,
    -ses_income_i1to30,
    -ses_income_i31to60,
    -ses_income_i61to90,
    -ses_income_i91to110,
    -ses_income_i111to150,
    -ses_income_i151to200,
    -ses_income_i201toInf
  )

In [None]:
# Columns used for the SES clustering, grouped by theme
variables_ses_clust <- c(
  # identifier
  "id",
  # gender
  "male", "female",
  # age (ordinal score)
  "age",
  # language
  "langEn", "langFr", "ses_languageOther",
  # education and income (ordinal scores)
  "educ", "ses_income",
  # immigration status
  "immigrant",
  # ethnicity
  "ses_ethn_White", "ses_ethn_Black", "ses_ethn_Other",
  # sexual orientation
  "ses_hetero", "ses_gai", "ses_bisex", "ses_sexOri_other"
)

In [None]:
# Print a frequency table for every clustering variable except "id"
for (v in variables_ses_clust[variables_ses_clust != "id"]) {
  cat("Table pour la variable:", v, "\n")
  print(table(df_ses[[v]]))
  cat("\n\n")
}

In [None]:
# Save the prepared SES data.
# The input is the 2025 pilot (can2025/01_pilot_2025.rds), so the output is
# written to the same can2025 folder; the earlier qc2022/02_pilot1_2022_ses.rds
# target would have overwritten the 2022 artefact.
# NOTE(review): file name chosen to mirror the input naming -- confirm that
# downstream scripts read this path.
saveRDS(df_ses, file = "../../_PrivateFolder_datagotchi_federal_2025/data/clustering/can2025/02_pilot_2025_ses.rds")

---

# SUGGESTION : Activité de Clustering

Si souhaité, effectuer un exercice de clustering avec ce sous-groupe de variables pour mieux saisir les données et leur relation. 

Cet exercice est purement exploratoire et sert à voir si les données peuvent être utilisées pour faire du clustering.