Propensity Score Matching (PSM) for HS vs Controls

In [2]:
install.packages("MatchIt")
library(tidyverse)
library(bigrquery)
library(MatchIt)
library(lubridate)
library(dplyr)
library(data.table)
library(fastDummies)

In [7]:
# --- Load data ---
# Read the cohort table from a CSV file located in the working directory.
df <- read.csv(file = "cohort_18_to_90_df.csv")

In [9]:
# Keep only the identifier, the demographic fields, the HS column, and the
# age column, then drop rows that are exact duplicates across those columns.
df_sub <- distinct(
  select(df, person_id, race, ethnicity, sex_at_birth, HS, age_last_EHR)
)

In [18]:
# --- Standardize sex, race, and ethnicity labels ---
# Collapse the raw survey labels into a small, fixed set of levels. Each
# case_when() ends with a catch-all branch so that unexpected labels (and NA)
# are folded into the corresponding "Other ..." level instead of creating a
# new level that the downstream dummy-variable formula does not know about.
df_sub <- df_sub %>%
  mutate(
    # Anything that is not exactly "Male" or "Female" becomes "Other Sex".
    sex = case_when(
      sex_at_birth %in% c("Male", "Female") ~ sex_at_birth,
      TRUE ~ "Other Sex"
    ),
    race = case_when(
      race %in% c("White") ~ "White",
      race %in% c("Black or African American") ~ "Black or African American",
      race %in% c("Asian") ~ "Asian",
      race %in% c("More than one population") ~ "More than one population",
      # NOTE(review): the last label carries a trailing space; confirm it
      # matches the raw data. Either way the catch-all below maps the
      # un-suffixed spelling to the same "Other race" level.
      race %in% c(
        "None of these",
        "Middle Eastern or North African",
        "Native Hawaiian or Other Pacific Islander "
      ) ~ "Other race",
      race %in% c(
        "None Indicated", "PMI: Skip", "I prefer not to answer"
      ) ~ "No answer race",
      TRUE ~ "Other race"
    ),
    ethnicity = case_when(
      ethnicity %in% c("Hispanic or Latino") ~ "Hispanic or Latino",
      ethnicity %in% c("Not Hispanic or Latino") ~ "Not Hispanic or Latino",
      ethnicity %in% c(
        "What Race Ethnicity: Race Ethnicity None Of These",
        "No matching concept"
      ) ~ "Other ethnicity",
      ethnicity %in% c(
        "PMI: Prefer Not To Answer", "PMI: Skip"
      ) ~ "No answer ethnicity",
      # Fallback reuses the existing "Other ethnicity" level (previously a
      # misspelled "Others ethnicity", which produced a stray extra level).
      TRUE ~ "Other ethnicity"
    )
  )

In [10]:
# Standardize age (centered and divided by its standard deviation).
# scale() returns a one-column matrix carrying "scaled:center"/"scaled:scale"
# attributes; as.numeric() strips those so age_normalized is stored as a
# plain numeric column rather than a matrix column inside the data frame.
df_sub <- df_sub %>%
  mutate(age_normalized = as.numeric(scale(age_last_EHR)))

In [27]:
# One-hot encode the categorical covariates. Every level gets its own
# indicator column (no reference level is dropped) and the original
# sex/race/ethnicity columns are removed from the result.
df_sub_dummy <- df_sub %>%
  fastDummies::dummy_cols(
    select_columns = c("sex", "race", "ethnicity"),
    remove_first_dummy = FALSE,
    remove_selected_columns = TRUE
  )

# Turn the generated column names into syntactically valid R names
# (e.g. spaces become dots) so they can be used directly in a formula.
names(df_sub_dummy) <- make.names(names(df_sub_dummy))

In [45]:
# --- Run PSM using MatchIt (1:10 nearest match) ---
# Nearest-neighbor matching of HS on the dummy-encoded demographics plus
# standardized age, requesting up to 10 matches per HS == 1 row.
# The elapsed wall-clock time of the matchit() call is printed afterwards.
#
# The data argument points at df_sub_dummy, the data frame that actually
# holds the dummy columns named in the formula (the previous `df_sub_1` is
# not defined anywhere in this notebook).
# NOTE(review): every level of sex/race/ethnicity is included, so the
# indicator sets are linearly dependent — confirm this is intended.
start_time <- Sys.time()
m.out0 <- matchit(
  HS ~
    sex_Female + sex_Male + sex_Other.Sex +
    race_Asian + race_Black.or.African.American +
    race_More.than.one.population + race_No.answer.race +
    race_Other.race + race_White +
    ethnicity_Hispanic.or.Latino + ethnicity_No.answer.ethnicity +
    ethnicity_Not.Hispanic.or.Latino + ethnicity_Other.ethnicity +
    age_normalized,
  data = df_sub_dummy,
  method = "nearest",
  ratio = 10
)
end_time <- Sys.time()
print(end_time - start_time)

In [None]:
# --- Visualize covariate balance (before/after) ---
# Plot distribution of age (normalized).
# Uses m.out0, the matchit object fitted above (the previous `m.out` is not
# defined anywhere in this notebook).
plot(m.out0, type = "density", which.xs = ~age_normalized)

# Generate one balance plot per dummy variable and save each as a PNG file
# named "matchit_plot_<variable>.png" in the working directory.
dummy_vars <- c(
  "sex_Female", "sex_Male", "sex_Other.Sex",
  "race_Asian", "race_Black.or.African.American",
  "race_More.than.one.population", "race_No.answer.race",
  "race_Other.race", "race_White",
  "ethnicity_Hispanic.or.Latino", "ethnicity_No.answer.ethnicity",
  "ethnicity_Not.Hispanic.or.Latino", "ethnicity_Other.ethnicity"
)

for (var in dummy_vars) {
  png(paste0("matchit_plot_", var, ".png"), width = 800, height = 600)
  # Close the PNG device even if plotting fails, so an error on one
  # variable does not leave a graphics device open.
  tryCatch(
    plot(m.out0, type = "density", which.xs = as.formula(paste("~", var))),
    finally = dev.off()
  )
}