# R_script.R

# Synthetic datasets: A non-technical primer for the biobehavioral sciences
# Author: Daniel S. Quintana

# Correspondence to Daniel S. Quintana, NORMENT KG Jebsen Centre for Psychosis Research,
# University of Oslo
# Email: daniel.quintana@medisin.uio.no

# Load required packages

# This is a function that will check to see if packages are installed.
# If they are not, they will be installed.
# After checking, they will be loaded into the R session
# Source: https://gist.github.com/stevenworthington/3178163

#' Install any missing packages, then attach all requested packages.
#'
#' @param pkg Character vector of package names.
#' @return A logical vector named by `pkg`; each element is the value
#'   returned by `require()` for that package (`TRUE` if it was attached).
ipak <- function(pkg) {
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE)
  }
  # vapply() guarantees a logical vector even when `pkg` is empty, whereas
  # sapply() would return a list in that case.
  vapply(pkg, require, logical(1), character.only = TRUE)
}

# Packages used by this script; ipak() is called to make them available
packages <- c("synthpop", "tidyverse", "cowplot", "car")
ipak(packages)


################################

## Manuscript example 1: Oxytocin and spirituality

# Load the data and, in the same pipeline, rename the variables for easier
# figure interpretation
ot_dat <- read_csv("ot_dat.csv") %>%
  rename(
    OT_condition = OT_COND,
    rel_affiliation = rel_aff_cat,
    spirituality = spi_1_L
  )

## Figure 1a

# Create the synthetic dataset (seed fixed at 1337)
ot_sim <- syn(ot_dat, seed = 1337)

# Visual comparison of the original and synthetic datasets
ot_com <- compare(
  ot_sim,
  ot_dat,
  vars = c("OT_condition", "Age", "spirituality", "rel_affiliation"),
  print.coef = TRUE,
  ncol = 4,
  breaks = 16,
  stat = "counts",
  cols = c("#62B6CB", "#1B4965")
)

# Take the plots stored in "ot_com" and restyle them in one chain
fig_1a <- ot_com$plots +
  scale_y_continuous(expand = c(0, 0)) + # Forces the y-axis to start at zero
  theme_minimal_hgrid(12) + # Theme from the 'cowplot' package
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1), # Angled tick labels
    axis.title.x = element_blank() # No x-axis title
  ) +
  labs(fill = "Dataset") # Legend title

fig_1a

#####

## Supplementary Figures 

# To generate figures, see full R script on the project's OSF page https://osf.io/z524n/ 
# This section of the analysis was removed from the Rstudio server instance due to loading constraints

#####

## Check for replicated unique units

ru <- replicated.uniques(ot_sim, ot_dat)
ru

#####

## t-test

# Welch's t-test in the original data
a <- t.test(spirituality ~ OT_condition,
            data = ot_dat,
            var.equal = FALSE)
a

# Linear model equivalent of the above t-test. Variables are looked up in
# `ot_dat` via `data =` (as in the other models in this script), so the
# coefficient is labelled by the variable name rather than `ot_dat$...`.
a1 <- lm(spirituality ~ 1 + OT_condition,
         data = ot_dat)

summary(a1) # Confirming results are the same as the t-test
confint(a1)

# Linear model equivalent in the synthetic data
s_lm <- lm.synds(spirituality ~ 1 + OT_condition,
                 data = ot_sim)

# Synthetic linear model results. Stored as `syn_ttest` rather than `syn`
# so that the object does not mask the syn() function, which is called
# again later in this script.
syn_ttest <- summary(s_lm)
syn_ttest

# A comparison of the linear models
t_test_com <- compare(
  s_lm,
  ot_dat,
  lwd = 1.5,
  lty = 1,
  point.size = 4,
  lcol = c("#62B6CB", "#1B4965")
)

t_test_com

# Extract the plot from "t_test_com" and restyle it. theme_half_open() is a
# complete theme and replaces any theme settings added before it, so the
# y-axis text is removed after it.
fig_1b <- t_test_com$ci.plot +
  ggtitle("") + # Remove title
  theme_half_open() +
  background_grid() +
  theme(axis.text.y = element_blank()) + # Remove y-axis text
  scale_x_discrete(breaks = NULL) +
  annotate("text",
           x = 1,
           y = -1,
           label = "Nasal spray condition") # Add label to plot
fig_1b

#####

## Correlation

# Pearson correlation in the original data
a_cor <- cor.test(ot_dat$Age, ot_dat$spirituality,
                  method = "pearson")
a_cor

# Linear model equivalent of the correlation. Written with `data = ot_dat`
# so the formula matches the synthetic-data model below.
b_cor <- lm(scale(Age) ~ 1 + scale(spirituality),
            data = ot_dat)

summary(b_cor) # Linear model results
confint(b_cor) # Print confidence intervals for linear model coefficients

# Linear model equivalent in the synthetic data
s_cor <- lm.synds(scale(Age) ~ 1 + scale(spirituality),
                  data = ot_sim)

syn_cor <- summary(s_cor) # Results of linear model in synthetic data
syn_cor

# Comparison of the original and synthetic linear models
cor_com <- compare(
  s_cor,
  ot_dat,
  lwd = 1.5,
  lty = 1,
  point.size = 4,
  lcol = c("#62B6CB", "#1B4965")
)

cor_com

# Extract the plot from "cor_com" and restyle it. theme_half_open() is a
# complete theme and replaces any theme settings added before it, so the
# y-axis text is removed after it.
fig_1c <- cor_com$ci.plot +
  ggtitle("") + # Remove title
  theme_half_open() +
  background_grid() + # Apply new theme
  theme(axis.text.y = element_blank()) + # Remove y-axis text
  scale_x_discrete(breaks = NULL, name = "") + # Remove x-axis text
  annotate("text",
           x = 1,
           y = 0.7,
           label = "Spirituality") # Add label to plot
fig_1c

#####

## Ancova

anc <- car::Anova(aov(spirituality ~ OT_condition + rel_affiliation,
                      data = ot_dat))
anc

# Linear model equivalent of the ANOVA
anc_lm <- lm(spirituality ~ 1 + OT_condition + rel_affiliation,
             data = ot_dat)

summary(anc_lm) # Results from linear model

# Testing for main effect of group to confirm equivalency

# Null model without OT condition
null_rel <- lm(spirituality ~ 1 + rel_affiliation,
               data = ot_dat)

# Comparison of null and full model, yielding the same F statistic and p-value
result_rel <- anova(null_rel, anc_lm)
result_rel

# Linear model equivalent in the synthetic data
s_ancova <- lm.synds(spirituality ~ 1 + OT_condition + rel_affiliation,
                     data = ot_sim)

syn_anc <- summary(s_ancova) # Results from synthetic linear model
syn_anc

# Comparison of the original and synthetic linear models
anc_com <- compare(
  s_ancova,
  ot_dat,
  lwd = 1.5,
  lty = 1,
  point.size = 4,
  plot.intercept = FALSE,
  lcol = c("#62B6CB", "#1B4965")
)

anc_com

anc_plot <- anc_com$ci.plot # Extract plot from "anc_com" object

# Restyle the plot. theme_half_open() is a complete theme and replaces any
# theme settings added before it, so the y-axis text is removed after it.
fig_1d <- anc_plot +
  ggtitle("") + # Remove title
  theme_half_open() +
  background_grid() + # Apply theme
  theme(axis.text.y = element_blank()) + # Remove y-axis text
  scale_x_discrete(breaks = NULL, name = "") + # Remove x-axis text
  annotate("text",
           x = 1.02,
           y = -6.5,
           label = "Nasal spray condition") +
  annotate("text",
           x = 2.02,
           y = -1.8,
           label = "Religious affiliation") # Add labels
fig_1d

#####

## Construct figure 1

# Top panel: the dataset comparison
p1_top <- plot_grid(
  fig_1a,
  labels = c("A"),
  ncol = 1,
  label_size = 12
)

# Legend taken from fig_1c, with some extra margin around its box
legend <- get_legend(
  fig_1c + theme(legend.box.margin = margin(0, 0, 0, 12))
)

# Bottom panel: the three model comparisons with their legends stripped
p1_bottom <- plot_grid(
  fig_1b + theme(legend.position = "none"),
  fig_1c + theme(legend.position = "none"),
  fig_1d + theme(legend.position = "none"),
  labels = c("B", "C", "D"),
  ncol = 3,
  rel_widths = c(1, 1, 1),
  label_size = 12
)

# Add the legend to the bottom panel, with narrow spacers on either side
p1_bottom <- plot_grid(
  p1_bottom,
  NULL,
  legend,
  NULL,
  ncol = 4,
  rel_widths = c(3, 0.1, 0.2, 0.1)
)

# Add a little more space on both sides of the bottom panel.
# NOTE(review): three panels are supplied here but ncol = 4 (with three
# rel_widths) -- possibly ncol = 3 was intended. Left unchanged so the
# figure layout stays as it was; confirm against the manuscript figure.
p1_bottom <- plot_grid(
  NULL,
  p1_bottom,
  NULL,
  ncol = 4,
  rel_widths = c(0.2, 2.2, 0.2)
)

# Stack the top and bottom panels
fig1 <- plot_grid(p1_top, p1_bottom, nrow = 2)

fig1 # Print at 14 x 6 inches for same dimensions as manuscript

## Prepare data for sharing

# Add a "FAKE_DATA" label to the synthetic dataset
ot_synthetic_label <- sdc(ot_sim, ot_dat, label = "FAKE_DATA")

# Extract the synthetic data to a dataframe for sharing
ot_synthetic_dat <- ot_synthetic_label$syn

##### 

### Manuscript example 2: Oxytocin concentrations and theory of mind performance

## Original data source: https://data.mendeley.com/datasets/h3f6ywpd5t/1

# Variables of interest
vars_b <- c("EQ", "RMET", "OT", "Sex")

# Import the data, keeping only the variables of interest
b_dat <- read_csv("blood.csv")[, vars_b]

# Create the synthetic dataset (seed fixed at 738)
b_dat_s <- syn(b_dat, seed = 738)

# Compare the original and synthetic datasets
fig_2a <- compare(
  b_dat_s,
  b_dat,
  stat = "counts",
  breaks = 12,
  ncol = 2,
  cols = c("#62B6CB", "#1B4965")
)

# Replace the comparison object with its plots and restyle them
fig_2a <- fig_2a$plots +
  scale_y_continuous(expand = c(0, 0)) + # Force y-axis to start at zero
  theme_minimal_hgrid(12) + # Apply theme
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x = element_blank()
  ) +
  labs(fill = "Dataset")

#####

## Check for replicated unique values

ru_b <- replicated.uniques(b_dat_s, b_dat)
ru_b
####

## RMET

# Linear model in the original data
rmet_m <- lm(RMET ~ 1 + OT + Sex,
             data = b_dat)

rmet_m_s <- summary(rmet_m) # Result from linear model
rmet_m_s

# Equivalent linear model in the synthetic data
rmet_m_syn <- lm.synds(RMET ~ 1 + OT + Sex,
                       data = b_dat_s)

rmet_m_syn_s <- summary(rmet_m_syn) # Results from synthetic linear model
rmet_m_syn_s

# Comparison of the original and synthetic linear models
fig_2b <- compare(
  rmet_m_syn,
  b_dat,
  lwd = 1.5,
  lty = 1,
  point.size = 4,
  lcol = c("#62B6CB", "#1B4965")
)
fig_2b

# Replace the comparison object with its plot and restyle it.
# theme_half_open() is a complete theme and replaces any theme settings
# added before it, so the y-axis text is removed after it.
fig_2b <- fig_2b$ci.plot +
  ggtitle("") + # Remove title
  theme_half_open() +
  background_grid() + # Add theme
  theme(axis.text.y = element_blank()) + # Remove y-axis text
  scale_x_discrete(breaks = NULL, name = "Coefficient") + # Remove x-axis text
  annotate("text",
           x = 2,
           y = -1,
           label = "Oxytocin concentration") +
  annotate("text",
           x = 1,
           y = -0.35,
           label = "Sex") # Add labels

## Plot grid

# Combine the two panels; the empty middle cell adds some space between them
fig2 <- plot_grid(
  fig_2a,
  NULL,
  fig_2b,
  labels = c("A", "", "B"),
  ncol = 3,
  rel_widths = c(2, 0.15, 1.5)
)

fig2 # Print at 14 x 5 inches for same dimensions as manuscript

## Prepare data for sharing

# Add a "FAKE_DATA" label to the synthetic dataset
ot_blood_label <- sdc(b_dat_s, b_dat, label = "FAKE_DATA")

# Extract the synthetic data to a dataframe for sharing
ot_blood_synthetic_dat <- ot_blood_label$syn

#####

## Manuscript example 3: Sociosexuality and self-rated attractiveness

## Original data source: https://osf.io/6bk3w/

# Import the data, drop rows containing NAs, and keep only rows whose `sex`
# is one of the three listed values
socio_dat <- read_csv("socio.csv") %>%
  drop_na() %>%
  filter(sex %in% c("male", "female", "intersex"))

# Create the synthetic dataset (seed fixed at 122)
socio_dat_s <- syn(socio_dat, seed = 122)

# Compare the original and synthetic datasets
fig_3a <- compare(
  socio_dat_s,
  socio_dat,
  breaks = 12,
  ncol = 7,
  nrow = 2,
  cols = c("#62B6CB", "#1B4965")
)

# Replace the comparison object with its plots and restyle them
fig_3a <- fig_3a$plots +
  scale_y_continuous(expand = c(0, 0)) + # Force y-axis to start at zero
  theme_minimal_hgrid(12) + # Apply theme
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x = element_blank()
  ) +
  labs(fill = "Dataset")

fig_3a

# Models

# Linear model in the original data
socio_lm <- lm(behavior2 ~ 1 + sra + age + lab,
               data = socio_dat)

socio_dat_sum <- summary(socio_lm) # Result from linear model
socio_dat_sum

# Equivalent linear model in the synthetic data
socio_lm_syn <- lm.synds(behavior2 ~ 1 + sra + age + lab,
                         data = socio_dat_s)

socio_lm_syn_s <- summary(socio_lm_syn) # Results from synthetic linear model
socio_lm_syn_s

# Comparison of the original and synthetic linear models.
# NOTE(review): this call previously passed breaks/ncol/nrow/cols, the
# arguments used for the dataset comparison (fig_3a). It now uses the same
# line and colour arguments as the other model comparisons in this script;
# confirm the resulting figure against the manuscript.
fig_3b <- compare(
  socio_lm_syn,
  socio_dat,
  lwd = 1.5,
  lty = 1,
  point.size = 4,
  lcol = c("#62B6CB", "#1B4965")
)

fig_3b

# Replace the comparison object with its plot and restyle it.
# theme_half_open() is a complete theme and replaces any theme settings
# added before it, so the y-axis text is removed after it.
fig_3b <- fig_3b$ci.plot +
  ggtitle("") + # Remove title
  theme_half_open() +
  background_grid() + # Add theme
  theme(axis.text.y = element_blank()) + # Remove y-axis text
  scale_x_discrete(breaks = NULL, name = "Coefficient") + # Remove x-axis text
  annotate("text",
           x = 3,
           y = 13.8,
           label = "SRA") +
  annotate("text",
           x = 2,
           y = 27.5,
           label = "Age") +
  annotate("text",
           x = 1,
           y = 1.1,
           label = "Location") # Add labels
fig_3b

## Detect replicated individuals and prepare synthetic dataset for sharing

dim(socio_dat_s$syn) # Rows and columns before removal of replicated uniques

# Remove replicated uniques and add a "FAKE_DATA" label
socio_dat_s_sdc <- sdc(socio_dat_s, socio_dat,
                       label = "FAKE_DATA",
                       rm.replicated.uniques = TRUE)

dim(socio_dat_s_sdc$syn) # Rows and columns AFTER removal of replicated uniques

# Extract the synthetic data (replicated uniques removed) to a dataframe
# for sharing
socio_synthetic_dat <- socio_dat_s_sdc$syn

# Regression model with uniques excluded

# Equivalent linear model in the synthetic data without replicated uniques
socio_lm_syn_ue <- lm.synds(behavior2 ~ 1 + sra + age + lab,
                            data = socio_dat_s_sdc)

socio_lm_syn_ue_s <- summary(socio_lm_syn_ue) # Results from linear model
socio_lm_syn_ue_s


# Comparison of the original and synthetic linear models.
# NOTE(review): this call previously passed breaks/ncol/nrow/cols, the
# arguments used for the dataset comparison (fig_3a). It now uses the same
# line and colour arguments as the other model comparisons in this script;
# confirm the resulting figure against the manuscript.
fig_3c <- compare(
  socio_lm_syn_ue,
  socio_dat,
  lwd = 1.5,
  lty = 1,
  point.size = 4,
  lcol = c("#62B6CB", "#1B4965")
)

fig_3c

# Replace the comparison object "fig_3c" with its plot and restyle it.
# theme_half_open() is a complete theme and replaces any theme settings
# added before it, so the y-axis text is removed after it.
fig_3c <- fig_3c$ci.plot +
  ggtitle("") + # Remove title
  theme_half_open() +
  background_grid() + # Add theme
  theme(axis.text.y = element_blank()) + # Remove y-axis text
  scale_x_discrete(breaks = NULL, name = "Coefficient") + # Remove x-axis text
  annotate("text",
           x = 3,
           y = 13.8,
           label = "SRA") +
  annotate("text",
           x = 2,
           y = 27.5,
           label = "Age") +
  annotate("text",
           x = 1,
           y = 1.1,
           label = "Location") # Add labels
fig_3c

# Create figure 3 plot

# Regression model panels first: B above C in a single column, with a thin
# empty row between them
fig3bc <- plot_grid(
  fig_3b,
  NULL,
  fig_3c,
  labels = c("B", "", "C"),
  ncol = 1,
  rel_heights = c(1, 0.01, 1)
)

fig3bc

# Then place the dataset comparison (A) beside the model panels; the empty
# middle cell adds some space between the plots
fig3 <- plot_grid(
  fig_3a,
  NULL,
  fig3bc,
  labels = c("A", "", ""),
  ncol = 3,
  rel_widths = c(5, 0.15, 1.5)
)

fig3