# Préparation des variables SES

In [34]:
library(tidyverse)
library(factoextra)

# Load the pilot survey data from the private data folder.
DataPilot <- readRDS(
  file = "../../_PrivateFolder_datagotchi_federal_2025/data/clustering/can2025/01_pilot_2025.rds"
)

# Run the variable-definition script.
# NOTE(review): presumably this is what defines `variables_ses`, which is
# used just below -- confirm against 01-1_variables.R.
source("01-1_variables.R")

# Keep only the columns listed in `variables_ses`.
df_ses <- select(DataPilot, all_of(variables_ses))

In [35]:
# Print a frequency table for every variable in `variables_ses`,
# skipping the "id" entry.
for (v in variables_ses) {
  if (v == "id") {
    next
  }
  cat("Table pour la variable:", v, "\n")
  print(table(df_ses[[v]]))
  cat("\n\n")
}

Table pour la variable: ses_genderMale 

  0   1 
493 539 


Table pour la variable: ses_genderFemale 

  0   1 
551 481 


Table pour la variable: ses_gender 

    agender      female        male  non_binary       queer   trans_man 
          0         481         539           5           2           3 
trans_woman 
          2 


Table pour la variable: ses_ageGroup5Years 

18_19 20_24 25_29 30_34 35_39 40_44 45_49 50_54 55_59 60_64 65_69 70_74 75_79 
   13    66   113    93   110     0    84    85    99   103    92    71    67 
80_84 85_89 90_94 95_99  100+ 
   22    10     3     1     0 


Table pour la variable: ses_age_4Cat 

18_24 25_44 45_64   65+ 
   79   316   371   266 


Table pour la variable: ses_region 

         prairie british_columbia         atlantic          ontario 
             173              137               39              337 
          quebec      territories 
             342                4 


Table pour la variable: ses_language 

english  french   oth

### Transformations

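Les variables d'âge, d'éducation et de revenu devraient être des variables ordinales. Dans les cellules ci-dessous, seul l'âge est recodé sur une échelle numérique ordonnée (0, 0,5, 1); l'éducation et le revenu sont plutôt recodés en variables indicatrices 0/1.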

In [36]:
# Recode the 5-year age groups into a three-level numeric score:
#   0   for 18_19 .. 30_34
#   0.5 for 35_39 .. 50_54
#   1   for 55_59 .. 100+
# Any value not listed (including NA) becomes NA. The two source age
# columns are dropped afterwards. `local()` keeps the lookup table out of
# the global environment.
df_ses <- local({
  age_scores <- c(
    "18_19" = 0, "20_24" = 0, "25_29" = 0, "30_34" = 0,
    "35_39" = 0.5, "40_44" = 0.5, "45_49" = 0.5, "50_54" = 0.5,
    "55_59" = 1, "60_64" = 1, "65_69" = 1, "70_74" = 1,
    "75_79" = 1, "80_84" = 1, "85_89" = 1, "90_94" = 1,
    "95_99" = 1, "100+" = 1
  )

  df_ses %>%
    mutate(
      # match() returns NA for unlisted values, so the lookup yields NA_real_.
      ses_age = unname(
        age_scores[match(ses_ageGroup5Years, names(age_scores))]
      )
    ) %>%
    select(-c(ses_ageGroup5Years, ses_age_4Cat))
})


In [37]:
# Education: one 0/1 indicator column per category of ses_educ_3Cat
# (the result is a set of dummies, not a single ordinal variable).
# Rows where ses_educ_3Cat is NA stay NA in every indicator.
#--------------------------------------------

df_ses[["ses_educBHS"]] <- ifelse(
  df_ses[["ses_educ_3Cat"]] == "educBHS", 1, 0
)
df_ses[["ses_educPostHS"]] <- ifelse(
  df_ses[["ses_educ_3Cat"]] == "educPostHS", 1, 0
)
df_ses[["ses_educUniv"]] <- ifelse(
  df_ses[["ses_educ_3Cat"]] == "educUniv", 1, 0
)

# Frequency check of each indicator.
table(df_ses[["ses_educBHS"]])
table(df_ses[["ses_educPostHS"]])
table(df_ses[["ses_educUniv"]])



  0   1 
814 218 


  0   1 
697 335 


  0   1 
553 479 

In [38]:
# Income: one 0/1 indicator column per category of ses_income
# (the result is a set of dummies, not a single ordinal variable).
# Rows where ses_income is NA stay NA in every indicator.
#----------------------------------------------
df_ses <- local({
  # New column name -> ses_income category it flags.
  income_levels <- c(
    income_no_income        = "no_income",
    income_1_30000          = "1_to_30000",
    income_30001_60000      = "30001_to_60000",
    income_60001_90000      = "60001_to_90000",
    income_90001_110000     = "90001_to_110000",
    income_110001_150000    = "110001_to_150000",
    income_150001_200000    = "150001_to_200000",
    income_more_than_200000 = "more_than_200000"
  )

  with_dummies <- df_ses
  with_dummies[names(income_levels)] <- lapply(
    income_levels,
    function(level) ifelse(with_dummies[["ses_income"]] == level, 1, 0)
  )
  with_dummies
})

# Frequency check of each indicator.
table(df_ses[["income_no_income"]])
table(df_ses[["income_1_30000"]])
table(df_ses[["income_30001_60000"]])
table(df_ses[["income_60001_90000"]])
table(df_ses[["income_90001_110000"]])
table(df_ses[["income_110001_150000"]])
table(df_ses[["income_150001_200000"]])
table(df_ses[["income_more_than_200000"]])


   0    1 
1013   19 


  0   1 
904 128 


  0   1 
817 215 


  0   1 
806 226 


  0   1 
911 121 


  0   1 
861 171 


  0   1 
936  96 


  0   1 
976  56 

In [39]:
# Language: one 0/1 indicator column per value of ses_language
# ("english", "french", "other"). Rows where ses_language is NA stay NA.
#-----------------------------------------

df_ses[["ses_languageEnglish"]] <- ifelse(
  df_ses[["ses_language"]] == "english", 1, 0
)
df_ses[["ses_languageFrench"]] <- ifelse(
  df_ses[["ses_language"]] == "french", 1, 0
)
df_ses[["ses_languageOther"]] <- ifelse(
  df_ses[["ses_language"]] == "other", 1, 0
)

# Frequency check of each indicator.
table(df_ses[["ses_languageEnglish"]])
table(df_ses[["ses_languageFrench"]])
table(df_ses[["ses_languageOther"]])


  0   1 
352 680 


  0   1 
724 308 


  0   1 
988  44 

In [40]:
# Ethnicity: one 0/1 indicator column per value of ses_ethnicityWB
# ("white", "black", "other"). Rows where ses_ethnicityWB is NA stay NA.
#-----------------------------------------

df_ses[["ses_ethnicityWhite"]] <- ifelse(
  df_ses[["ses_ethnicityWB"]] == "white", 1, 0
)
df_ses[["ses_ethnicityBlack"]] <- ifelse(
  df_ses[["ses_ethnicityWB"]] == "black", 1, 0
)
df_ses[["ses_ethnicityOther"]] <- ifelse(
  df_ses[["ses_ethnicityWB"]] == "other", 1, 0
)

# Frequency check of each indicator.
table(df_ses[["ses_ethnicityWhite"]])
table(df_ses[["ses_ethnicityBlack"]])
table(df_ses[["ses_ethnicityOther"]])



  0   1 
220 812 


   0    1 
1006   26 


  0   1 
838 194 

In [41]:
# Frequency table of the raw sexual-orientation categories.
table(df_ses[["ses_sexOrientation"]])


heterosexual          gay     bisexual        other 
         899           58           41           34 

In [42]:
# Sexual orientation: two 0/1 indicator columns.
#   ses_sexOrientationHetero: 1 for "heterosexual", 0 for any other
#     non-missing value, NA when ses_sexOrientation is NA.
#   ses_sexOrientationQueer: 1 for "gay", "bisexual" or "other",
#     0 for "heterosexual", NA for anything else (including NA).
#-----------------------------------------

df_ses[["ses_sexOrientationHetero"]] <- ifelse(
  df_ses[["ses_sexOrientation"]] == "heterosexual", 1, 0
)
table(df_ses[["ses_sexOrientationHetero"]])

df_ses[["ses_sexOrientationQueer"]] <- ifelse(
  df_ses[["ses_sexOrientation"]] %in% c("gay", "bisexual", "other"),
  1,
  # Only an explicit "heterosexual" is coded 0; other values remain NA.
  ifelse(df_ses[["ses_sexOrientation"]] == "heterosexual", 0, NA)
)
table(df_ses[["ses_sexOrientationQueer"]])


  0   1 
133 899 


  0   1 
899 133 

In [43]:
# Names of the columns retained for clustering: the id, followed by the
# recoded SES variables grouped by theme.
variables_ses_clust <- c(
  "id",
  "ses_genderMale",
  "ses_age",
  paste0("ses_language", c("English", "French", "Other")),
  paste0("ses_educ", c("Univ", "BHS", "PostHS")),
  "ses_immigrant",
  paste0("ses_ethnicity", c("White", "Black", "Other")),
  paste0("ses_sexOrientation", c("Hetero", "Queer")),
  paste0(
    "income_",
    c(
      "no_income", "1_30000", "30001_60000", "60001_90000",
      "90001_110000", "110001_150000", "150001_200000",
      "more_than_200000"
    )
  )
)

In [44]:
# Print a frequency table for every variable in `variables_ses_clust`,
# skipping the "id" entry.
for (v in variables_ses_clust) {
  if (v == "id") {
    next
  }
  cat("Table pour la variable:", v, "\n")
  print(table(df_ses[[v]]))
  cat("\n\n")
}

Table pour la variable: ses_genderMale 

  0   1 
493 539 


Table pour la variable: ses_age 

  0 0.5   1 
285 279 468 


Table pour la variable: ses_languageEnglish 

  0   1 
352 680 


Table pour la variable: ses_languageFrench 

  0   1 
724 308 


Table pour la variable: ses_languageOther 

  0   1 
988  44 


Table pour la variable: ses_educUniv 

  0   1 
553 479 


Table pour la variable: ses_educBHS 

  0   1 
814 218 


Table pour la variable: ses_educPostHS 

  0   1 
697 335 


Table pour la variable: ses_immigrant 

  0   1 
195 837 


Table pour la variable: ses_ethnicityWhite 

  0   1 
220 812 


Table pour la variable: ses_ethnicityBlack 

   0    1 
1006   26 


Table pour la variable: ses_ethnicityOther 

  0   1 
838 194 


Table pour la variable: ses_sexOrientationHetero 

  0   1 
133 899 


Table pour la variable: ses_sexOrientationQueer 

  0   1 
899 133 


Table pour la variable: income_no_income 

   0    1 
1013   19 


Table pour la variable: income_1_3000

In [45]:
# Save the prepared data frame to the private data folder.
# NOTE(review): the whole df_ses is written, not only the columns listed in
# variables_ses_clust -- confirm this is intended.
saveRDS(
  object = df_ses,
  file = "../../_PrivateFolder_datagotchi_federal_2025/data/clustering/can2025/02_pilot_2025_ses.rds"
)

---

# SUGGESTION : Activité de Clustering

Si souhaité, effectuer un exercice de clustering avec ce sous-groupe de variables pour mieux saisir les données et leur relation. 

Cet exercice est purement exploratoire et sert à voir si les données peuvent être utilisées pour faire du clustering.