In [53]:
library(dplyr)
library(lme4)
library(car) # for VIF calculation
library(tidyverse)
library(boot)
library(modelsummary)

library(jtools)
# Fix the RNG state so any resampling done later is reproducible
set.seed(12696921)

# Directories holding the interim survey export and the similarity matrices
CORONA_INTERIM_PATH <- "/m/cs/work/luongn1/digirhythm/data/interim/corona/"
SIMILARITY_PATH <- "/m/cs/work/luongn1/digirhythm/data/processed/corona/similarity_matrix/"

# Survey responses, excluding rows whose gender is "non-binary"
survey <- filter(
  read.csv(paste0(CORONA_INTERIM_PATH, "survey_all.csv")),
  gender != "non-binary"
)

# Baseline similarity matrix; the first CSV column supplies the row names
sim_baseline <- read.csv(
  paste0(SIMILARITY_PATH, "si/similarity_baseline_4epochs.csv"),
  row.names = 1
)

# Predictor columns (plus the join key) kept from the survey
IVs <- c(
  "subject_id", "age", "gender", "occupation", "origin", "children_at_home",
  "BIG5_Extraversion", "BIG5_Agreeableness", "BIG5_Conscientiousness",
  "BIG5_Neuroticism", "BIG5_Openness", "MEQ"
)
# Only respondents with a value in every one of those columns are retained
demographics_df <- drop_na(select(survey, all_of(IVs)))

# Dependent variable: each row's mean similarity, ignoring missing entries,
# keyed by the similarity matrix's row names
avg_sim_baseline <- data.frame(
  subject_id = rownames(sim_baseline),
  DV = rowMeans(sim_baseline, na.rm = TRUE)
)

# Left join: every row of the similarity table is kept, with demographics
# attached where a matching subject_id exists
dataset <- merge(
  x = avg_sim_baseline,
  y = demographics_df,
  by = "subject_id",
  all.x = TRUE
)

# Define a function to extract the coefficients
# Bootstrap statistic: refit an OLS model on one resample and return its
# coefficient vector. The (data, indices) signature is the one expected of a
# `statistic` function by boot::boot().
#
# Args:
#   data:    data frame containing the response and every predictor named in
#            `formula`.
#   indices: row indices selecting the resample.
#   formula: model formula to fit. Defaults to the specification that was
#            previously hard-coded here, so existing calls are unaffected;
#            boot::boot() forwards extra arguments via `...`, so another model
#            can be bootstrapped without editing this function.
#            NOTE(review): the default omits `gender`, which the
#            regression_analysis() call in this notebook includes -- confirm
#            which specification is intended.
#
# Returns:
#   Named numeric vector of the fitted coefficients.
boot_fn <- function(data, indices,
                    formula = DV ~ age + origin + occupation +
                      children_at_home + MEQ) {
  # drop = FALSE keeps a data frame even when `data` has a single column
  resample <- data[indices, , drop = FALSE]
  coef(lm(formula, data = resample))
}

# Regression analysis with bootstrapping
# Fit an OLS regression of `y` on the terms in `X` and summarise it with
# jtools::summ().
#
# Args:
#   df: data frame containing the response and predictor columns.
#   y:  name of the response column (string).
#   X:  character vector of predictor terms.
#
# Returns:
#   The jtools summary object for the fitted model, requested with
#   standardised coefficients (scale = TRUE), confidence intervals instead of
#   standard errors, a VIF column, and 3 digits after the decimal point.
regression_analysis <- function(df, y, X) {
  model_formula <- reformulate(X, response = y)

  # Fail early with a readable message if a model variable is absent
  model_vars <- all.vars(model_formula)
  missing_vars <- setdiff(model_vars, names(df))
  if (length(missing_vars) > 0) {
    stop(
      "Columns not found in `df`: ", paste(missing_vars, collapse = ", "),
      call. = FALSE
    )
  }

  # Listwise deletion restricted to the variables the model actually uses, so
  # missing values in unrelated columns do not shrink the sample
  df <- df %>% drop_na(all_of(model_vars))

  model <- lm(model_formula, data = df)

  # VIFs are reported by summ(vifs = TRUE); no separate car::vif() call is
  # made because its result was never used
  summ(model, scale = TRUE, vifs = TRUE, confint = TRUE, digits = 3)
}

# Fit and display the model of average similarity (DV) on the selected
# demographic predictors and MEQ
regression_analysis(
  df = dataset,
  y = "DV",
  X = c("age", "origin", "gender", "occupation", "children_at_home", "MEQ")
)


[4mMODEL INFO:[24m
[3mObservations:[23m 115
[3mDependent Variable:[23m DV
[3mType:[23m OLS linear regression 

[4mMODEL FIT:[24m
[3mF[23m(6,108) = 2.181, [3mp[23m = 0.050
[3mR² = [23m0.108
[3mAdj. R² = [23m0.059 

[3mStandard errors: OLS[23m
--------------------------------------------------------------------------
                           Est.     2.5%   97.5%    t val.       p     VIF
---------------------- -------- -------- ------- --------- ------- -------
(Intercept)               0.664    0.651   0.677   102.242   0.000        
age                       0.003   -0.003   0.009     1.039   0.301   1.059
origin                   -0.015   -0.030   0.001    -1.829   0.070   1.415
gender1                   0.011   -0.002   0.024     1.649   0.102   1.073
occupation                0.020    0.006   0.035     2.744   0.007   1.482
children_at_home          0.001   -0.005   0.007     0.483   0.630   1.016
MEQ                       0.005   -0.001   0.011     1.726   0.

In [52]:
# Open the documentation page for the lm method of jtools' summ()
help("summ.lm")

0,1
summ.lm {jtools},R Documentation

0,1
model,A lm object.
scale,"If TRUE, reports standardized regression coefficients by scaling and mean-centering input data (the latter can be changed via the scale.only argument). Default is FALSE."
confint,Show confidence intervals instead of standard errors? Default is FALSE.
ci.width,"A number between 0 and 1 that signifies the width of the desired confidence interval. Default is .95, which corresponds to a 95% confidence interval. Ignored if confint = FALSE."
robust,"If not FALSE, reports heteroskedasticity-robust standard errors instead of conventional SEs. These are also known as Huber-White standard errors. There are several options provided by sandwich::vcovHC(): ""HC0"", ""HC1"", ""HC2"", ""HC3"", ""HC4"", ""HC4m"", ""HC5"". Default is FALSE. This requires the sandwich package to compute the standard errors."
cluster,"For clustered standard errors, provide the column name of the cluster variable in the input data frame (as a string). Alternately, provide a vector of clusters. Note that you must set robust to either ""HC1"", ""HC2"", or ""HC3"" in order to have clustered standard errors (""HC4"" and ""HC5"" are not supported)."
vifs,"If TRUE, adds a column to output with variance inflation factors (VIF). Default is FALSE."
digits,"An integer specifying the number of digits past the decimal to report in the output. Default is 2. You can change the default number of digits for all jtools functions with options(""jtools-digits"" = digits) where digits is the desired number."
pvals,"Show p values? If FALSE, these are not printed. Default is TRUE."
n.sd,"If scale = TRUE, how many standard deviations should predictors be divided by? Default is 1, though some suggest 2."

0,1
coeftable,The outputted table of variables and coefficients
model,The model for which statistics are displayed. This would be most useful in cases in which scale = TRUE.


In [11]:
# List the column names available in the survey data frame
names(survey)