# Final Proposal (Summary Statistics)

## Data Cleaning

In [1]:
library(tidyverse)
library(haven)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.8     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1

“package ‘ggplot2’ was built under R version 4.1.3”
“package ‘tidyr’ was built under R version 4.1.2”
“package ‘readr’ was built under R version 4.1.2”
“package ‘dplyr’ was built under R version 4.1.3”
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

“package ‘haven’ was built under R version 4.1.3”


In [2]:
# Load the Stata extract, then keep only the five survey items used in this
# analysis and drop every respondent with a missing value in any of them.
dat <- read_dta("CCHS_Annual_2017_2018_curated_trimmed_25%.dta")
dat <- dat |>
    select(
        GEN_010,   # renamed to `satisfaction` during cleaning
        SPS_040,   # renamed to `emo_bond` during cleaning
        dhhgage,   # renamed to `age` during cleaning
        DHH_SEX,   # renamed to `sex` during cleaning
        dhhdglvg   # renamed to `family` during cleaning
    ) |>
    na.omit()

In [3]:
# Give the survey items readable names, keep only rows whose codes fall in
# the accepted ranges, and convert the four categorical items to factors.
dat_cleaned <- dat |>
    rename(
        satisfaction = GEN_010,
        emo_bond = SPS_040,
        age = dhhgage,
        sex = DHH_SEX,
        family = dhhdglvg
    ) |>
    # filter out invalid values (comma-separated conditions are AND-ed)
    filter(
        satisfaction < 11,
        emo_bond <= 4,
        age <= 16,
        sex <= 2,
        family <= 8
    ) |>
    # satisfaction is left as-is; the other four columns become factors
    mutate(across(c(sex, emo_bond, family, age), as_factor))

We take the midpoint of each age group and treat age as a quantitative variable. The open-ended "Age 80 and older" group has no midpoint, so it is coded as 80.

In [4]:
# Recode the age-group factor into a numeric age: each closed group maps to
# its midpoint, and the open-ended top group maps to its lower bound (80).
# Any label not listed below becomes NA.
dat_cleaned <- dat_cleaned |>
    mutate(
        age = case_when(
            age == "Age between 12 and 14" ~ 13,
            age == "Age between 15 and 17" ~ 16,
            age == "Age between 18 and 19" ~ 18.5,
            age == "Age between 20 and 24" ~ 22,
            age == "Age between 25 and 29" ~ 27,
            age == "Age between 30 and 34" ~ 32,
            age == "Age between 35 and 39" ~ 37,
            age == "Age between 40 and 44" ~ 42,
            age == "Age between 45 and 49" ~ 47,
            age == "Age between 50 and 54" ~ 52,
            age == "Age between 55 and 59" ~ 57,
            age == "Age between 60 and 64" ~ 62,
            age == "Age between 65 and 69" ~ 67,
            age == "Age between 70 and 74" ~ 72,
            age == "Age between 75 and 79" ~ 77,
            age == "Age 80 and older" ~ 80
        )
    )

## Summary Statistics

In [5]:
# Add 0/1 indicator (dummy) columns for the categorical variables so that
# their means can be read as sample proportions in the summary table.
# One level of emo_bond and one level of family get no indicator of their own.
data_destat <- dat_cleaned |>
    mutate(
        # 1 for every respondent whose sex label is not "Male"
        female = ifelse(sex == "Male", 0, 1),
        emo_bond_strongly_agree = ifelse(emo_bond == "Strongly agree", 1, 0),
        emo_bond_agree = ifelse(emo_bond == "Agree", 1, 0),
        emo_bond_disagree = ifelse(emo_bond == "Disagree", 1, 0),
        # The family labels were previously matched with a trailing "." for
        # every category except the last one. Strip an optional trailing
        # period once and compare against period-free labels, so each
        # indicator matches whether or not its label ends in ".".
        # NOTE(review): confirm the exact label spelling against
        # levels(dat_cleaned$family).
        .family_label = sub("\\.$", "", as.character(family)),
        "Unattached individual living alone" =
            ifelse(.family_label == "Unattached individual living alone", 1, 0),
        "Unattached individual living with others" =
            ifelse(.family_label == "Unattached individual living with others", 1, 0),
        "Individual living with spouse/partner" =
            ifelse(.family_label == "Individual living with spouse/partner", 1, 0),
        "Parent living with spouse/partner and child(ren)" =
            ifelse(.family_label == "Parent living with spouse/partner and child(ren)", 1, 0),
        "Single parent living with children" =
            ifelse(.family_label == "Single parent living with children", 1, 0),
        "Child living with a single parent with or without siblings" =
            ifelse(.family_label == "Child living with a single parent with or without siblings", 1, 0),
        "Child living with two parents with or without siblings" =
            ifelse(.family_label == "Child living with two parents with or without siblings", 1, 0),
        # Drop the helper column so it does not appear in later summaries.
        .family_label = NULL
    )

In [6]:
# Descriptive statistics (mean, sd, max, min) for every non-factor column of
# data_destat. The factor columns sex, emo_bond and family are dropped once
# here instead of once per statistic; they cannot be averaged.
destat_numeric <- data_destat |>
    select(-c("sex", "emo_bond", "family"))

# One single-row table per statistic. across(everything(), fn) replaces the
# superseded summarize_all(fn) and applies fn to every remaining column.
mean_table <- destat_numeric |>
    summarise(across(everything(), mean))

sd_table <- destat_numeric |>
    summarise(across(everything(), sd))

max_table <- destat_numeric |>
    summarise(across(everything(), max))

min_table <- destat_numeric |>
    summarise(across(everything(), min))

# Stack the four rows in the order the column names are assigned below, and
# give the short variable names presentation-ready labels.
summary_table <- rbind(mean_table, sd_table, max_table, min_table) |>
    rename("satisfaction with life in general" = satisfaction,
           "strong emotional bond with >= 1 person (strongly agree)" = emo_bond_strongly_agree,
           "strong emotional bond with >= 1 person (agree)" = emo_bond_agree,
           "strong emotional bond with >= 1 person (disagree)" = emo_bond_disagree)

# Transpose so each variable is a row and each statistic a column; t() turns
# the tibble into a matrix whose row names are the variable labels.
summary_table <- t(summary_table)

colnames(summary_table) <- c("mean", "standard deviation", "max", "min")

In [7]:
# Display the transposed summary matrix: one row per variable, with columns
# "mean", "standard deviation", "max" and "min".
summary_table

Unnamed: 0,mean,standard deviation,max,min
satisfaction with life in general,8.03062731,1.6924781,10,0
age,48.54778598,19.5925424,80,13
female,0.54059041,0.4983803,1,0
strong emotional bond with >= 1 person (strongly agree),0.58154982,0.4933351,1,0
strong emotional bond with >= 1 person (agree),0.38093481,0.4856465,1,0
strong emotional bond with >= 1 person (disagree),0.03333333,0.1795165,1,0
Unattached individual living alone,0.2798278,0.4489421,1,0
Unattached individual living with others,0.03677737,0.1882263,1,0
Individual living with spouse/partner,0.2897909,0.4536931,1,0
Parent living with spouse/partner and child(ren),0.18733087,0.3902009,1,0
