# Préparation des variables de ses

In [2]:
library(tidyverse)
library(factoextra)

# Charger les données
DataPilot <- readRDS("../../_PrivateFolder_datagotchi_federal_2025/data/clustering/can2025/01_pilot_2025.rds")

# Charger le fichier des variables
source("01-1_variables.R")

df_ses <- DataPilot %>%
  select(all_of(variables_ses))

In [3]:
# Boucle pour afficher la table pour chaque variable
for (v in variables_ses) {
  if (v != "id") {
    cat("Table pour la variable:", v, "\n")
    print(table(df_ses[[v]]))
    cat("\n\n")
  }
}

Table pour la variable: ses_genderMale 

  0   1 
484 537 


Table pour la variable: ses_genderFemale 

  0   1 
548 473 


Table pour la variable: ses_gender 

    agender      female        male  non_binary       queer   trans_man 
          0         473         537           5           2           2 
trans_woman 
          2 


Table pour la variable: ses_ageGroup5Years 

18_19 20_24 25_29 30_34 35_39 40_44 45_49 50_54 55_59 60_64 65_69 70_74 75_79 
   13    65   112    91   108     0    82    84    97   103    92    71    67 
80_84 85_89 90_94 95_99  100+ 
   22    10     3     1     0 


Table pour la variable: ses_age_4Cat 

18_24 25_44 45_64   65+ 
   78   311   366   266 


Table pour la variable: ses_region 

         prairie british_columbia         atlantic          ontario 
             173              132               38              332 
          quebec      territories 
             342                4 


Table pour la variable: ses_language 

english  french   oth

### Transformations

Les variables age, éducation et income devraient être des variables ordinales

In [4]:
# Transformation de âge en variable ordinale

df_ses <- df_ses %>%
  mutate(
    ses_age = case_when(
      ses_ageGroup5Years %in% c("18_19", "20_24", "25_29", "30_34") ~ 0,
      ses_ageGroup5Years %in% c("35_39", "40_44", "45_49", "50_54") ~ 0.5,
      ses_ageGroup5Years %in% c("55_59", "60_64", "65_69", "70_74",
                                "75_79", "80_84", "85_89", "90_94",
                                "95_99", "100+") ~ 1,
      TRUE ~ NA_real_
    ) 
  ) %>%
  select(
    -ses_ageGroup5Years,
    -ses_age_4Cat
  )


In [5]:
# Transformation de educ en variable ordinale
#--------------------------------------------

df_ses$ses_educBHS <- NA
df_ses$ses_educBHS[df_ses$ses_educ_3Cat == 'educBHS'] <- 1
df_ses$ses_educBHS[df_ses$ses_educ_3Cat != 'educBHS'] <- 0
table(df_ses$ses_educBHS)

df_ses$ses_educPostHS <- NA
df_ses$ses_educPostHS[df_ses$ses_educ_3Cat == 'educPostHS'] <- 1
df_ses$ses_educPostHS[df_ses$ses_educ_3Cat != 'educPostHS'] <- 0
table(df_ses$ses_educPostHS)

df_ses$ses_educUniv <- NA
df_ses$ses_educUniv[df_ses$ses_educ_3Cat == 'educUniv'] <- 1
df_ses$ses_educUniv[df_ses$ses_educ_3Cat != 'educUniv'] <- 0
table(df_ses$ses_educUniv)


  0   1 
806 215 


  0   1 
690 331 


  0   1 
546 475 

In [6]:
table(df_ses$ses_educ)

< table of extent 0 >

In [7]:
# Transformation de income en variable ordinale
#----------------------------------------------

df_ses$ses_incomeLow <- NA
df_ses$ses_incomeLow[df_ses$ses_income3Cat == 'Low'] <- 1
df_ses$ses_incomeLow[df_ses$ses_income3Cat != 'Low'] <- 0
table(df_ses$ses_incomeLow)

df_ses$ses_incomeMid <- NA
df_ses$ses_incomeMid[df_ses$ses_income3Cat == 'Mid'] <- 1
df_ses$ses_incomeMid[df_ses$ses_income3Cat != 'Mid'] <- 0
table(df_ses$ses_incomeMid)

df_ses$ses_incomeHigh <- NA
df_ses$ses_incomeHigh[df_ses$ses_income3Cat == 'High'] <- 1
df_ses$ses_incomeHigh[df_ses$ses_income3Cat != 'High'] <- 0
table(df_ses$ses_incomeHigh)



  0   1 
880 141 


  0   1 
292 729 


  0   1 
870 151 

In [8]:
# Binariser ses_language
#-----------------------------------------

df_ses$ses_languageEnglish <- NA
df_ses$ses_languageEnglish[df_ses$ses_language == 'english'] <- 1
df_ses$ses_languageEnglish[df_ses$ses_language != 'english'] <- 0
table(df_ses$ses_languageEnglish)

df_ses$ses_languageFrench <- NA
df_ses$ses_languageFrench[df_ses$ses_language == 'french'] <- 1
df_ses$ses_languageFrench[df_ses$ses_language != 'french'] <- 0
table(df_ses$ses_languageFrench)

df_ses$ses_languageOther <- NA
df_ses$ses_languageOther[df_ses$ses_language == 'other'] <- 1
df_ses$ses_languageOther[df_ses$ses_language != 'other'] <- 0
table(df_ses$ses_languageOther)


  0   1 
350 671 


  0   1 
715 306 


  0   1 
977  44 

In [9]:
# Binariser ses_ethnicityWB
#-----------------------------------------

df_ses$ses_ethnicityWhite <- NA
df_ses$ses_ethnicityWhite[df_ses$ses_ethnicityWB == 'white'] <- 1
df_ses$ses_ethnicityWhite[df_ses$ses_ethnicityWB != 'white'] <- 0
table(df_ses$ses_ethnicityWhite)

df_ses$ses_ethnicityMinority <- NA
df_ses$ses_ethnicityMinority[df_ses$ses_ethnicityWB != 'white'] <- 1
df_ses$ses_ethnicityMinority[df_ses$ses_ethnicityWB == 'white'] <- 0
table(df_ses$ses_ethnicityMinority)



  0   1 
212 809 


  0   1 
809 212 

In [10]:
# Binariser ses_sexOrientation
#-----------------------------------------

df_ses$ses_sexOrientationHetero <- NA
df_ses$ses_sexOrientationHetero[df_ses$ses_sexOrientation == 'heterosexual'] <- 1
df_ses$ses_sexOrientationHetero[df_ses$ses_sexOrientation != 'heterosexual'] <- 0
table(df_ses$ses_sexOrientationHetero)

df_ses$ses_sexOrientationQueer <- NA
df_ses$ses_sexOrientationQueer[df_ses$ses_sexOrientation %in% c('gay', 'bisexual', 'other')] <- 1
df_ses$ses_sexOrientationQueer[df_ses$ses_sexOrientation == 'heterosexual'] <- 0
table(df_ses$ses_sexOrientationQueer)


  0   1 
127 894 


  0   1 
894 127 

In [11]:
# Binariser ses_region
#-----------------------------------------

df_ses$ses_regionPrairies <- NA
df_ses$ses_regionPrairies[df_ses$ses_region == "prairie"] <- 1
df_ses$ses_regionPrairies[df_ses$ses_region != "prairie"] <- 0

df_ses$ses_regionBC <- NA
df_ses$ses_regionBC[df_ses$ses_region == "british_columbia"] <- 1
df_ses$ses_regionBC[df_ses$ses_region != "british_columbia"] <- 0

df_ses$ses_regionAtlantic <- NA
df_ses$ses_regionAtlantic[df_ses$ses_region == "atlantic"] <- 1
df_ses$ses_regionAtlantic[df_ses$ses_region != "atlantic"] <- 0

df_ses$ses_regionOntario <- NA
df_ses$ses_regionOntario[df_ses$ses_region == "ontario"] <- 1
df_ses$ses_regionOntario[df_ses$ses_region != "ontario"] <- 0

df_ses$ses_regionQuebec <- NA
df_ses$ses_regionQuebec[df_ses$ses_region == "quebec"] <- 1
df_ses$ses_regionQuebec[df_ses$ses_region != "quebec"] <- 0

df_ses$ses_regionTerritories <- NA
df_ses$ses_regionTerritories[df_ses$ses_region == "territories"] <- 1
df_ses$ses_regionTerritories[df_ses$ses_region != "territories"] <- 0

In [12]:
# Regrouper ses_region
table(df_ses$ses_region)
df_ses$ses_regionWest <- NA
df_ses$ses_regionWest[df_ses$ses_region %in% c("prairie", "british_columbia")] <- 1
df_ses$ses_regionWest[df_ses$ses_region %in% c("atlantic", "ontario", "quebec", "territories")] <- 0
table(df_ses$ses_regionWest)


         prairie british_columbia         atlantic          ontario 
             173              132               38              332 
          quebec      territories 
             342                4 


  0   1 
716 305 

In [13]:
variables_ses_clust <- c(
  "id",
  "ses_genderMale",
  "ses_age",
  "ses_languageEnglish",
  "ses_languageFrench",
  "ses_languageOther",
  "ses_educBHS",
  "ses_educPostHS",
  "ses_educUniv",
  "ses_incomeLow",
  "ses_incomeMid",
  "ses_incomeHigh",
  "ses_immigrant",
  "ses_ethnicityWhite",
  "ses_ethnicityMinority",
  "ses_sexOrientationHetero",
  "ses_sexOrientationQueer",
  "ses_regionPrairies",
  "ses_regionBC",
  "ses_regionAtlantic",
  "ses_regionOntario",
  "ses_regionQuebec",
  "ses_regionTerritories",
  "ses_regionWest"
)

In [14]:
# Boucle pour afficher la table pour chaque variable
for (v in variables_ses_clust) {
  if (v != "id") {
    cat("Table pour la variable:", v, "\n")
    print(table(df_ses[[v]]))
    cat("\n\n")
  }
}

Table pour la variable: ses_genderMale 

  0   1 
484 537 


Table pour la variable: ses_age 

  0 0.5   1 
281 274 466 


Table pour la variable: ses_languageEnglish 

  0   1 
350 671 


Table pour la variable: ses_languageFrench 

  0   1 
715 306 


Table pour la variable: ses_languageOther 

  0   1 
977  44 


Table pour la variable: ses_educBHS 

  0   1 
806 215 


Table pour la variable: ses_educPostHS 

  0   1 
690 331 


Table pour la variable: ses_educUniv 

  0   1 
546 475 


Table pour la variable: ses_incomeLow 

  0   1 
880 141 


Table pour la variable: ses_incomeMid 

  0   1 
292 729 


Table pour la variable: ses_incomeHigh 

  0   1 
870 151 


Table pour la variable: ses_immigrant 

  0   1 
833 188 


Table pour la variable: ses_ethnicityWhite 

  0   1 
212 809 


Table pour la variable: ses_ethnicityMinority 

  0   1 
809 212 


Table pour la variable: ses_sexOrientationHetero 

  0   1 
127 894 


Table pour la variable: ses_sexOrientationQueer 

  0   1 


In [15]:
# Sauvegarder les données préparées
saveRDS(df_ses, file = "../../_PrivateFolder_datagotchi_federal_2025/data/clustering/can2025/02_pilot_2025_ses.rds")

---

# SUGGESTION : Activité de Clustering

Si souhaité, effectuer un exercice de clustering avec ce sous-groupe de variables pour mieux saisir les données et leur relation. 

Cet exercice est purement exploratoire et sert à voir si les données peuvent être utilisées pour faire du clustering.