-
Notifications
You must be signed in to change notification settings - Fork 0
/
quality_control.R
503 lines (406 loc) · 19.3 KB
/
quality_control.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
# QUALITY CONTROL
########################################################################
# Questionnaire Response Keys
########################################################################
# Highest level of education completed (LIS07_education_completed)
# 1 = Basisonderwijs = primary education
# 2 = VMBO = pre-vocational secondary education
# 3 = HAVO/VWO = senior general secondary education/pre-university secondary education
# 4 = MBO = secondary vocational education
# 5 = HBO = higher professional education
# 6 = WO = academic education
# 7 = Other
# 9 = Did not receive any education
# Handedness (VAR101_left_right_handed)
# 1 = left-handed
# 2 = right-handed
# Net monthly income for the complete household, including income from work, allowances, alimony, and benefits (LIS23_net_household_income)
# 0 = no income
# 1 = 500 euros or less
# 2 = 501 – 1000 euros
# 3 = 1001 – 1500 euros
# 4 = 1501 – 2000 euros
# 5 = 2001 – 2500 euros
# 6 = 2501 – 3000 euros
# 7 = 3001 – 3500 euros
# 8 = 3501 – 4000 euros
# 9 = 4001 – 4500 euros
# 10 = 4501 – 5000 euros
# 11 = 5001 – 7500 euros
# 12 = more than 7500 euros
# 13 = I don’t know
# 14 = I prefer not to answer
# Sex (sex)
# 1 = male
# 2 = female
# Load the per-subject questionnaire/task data concatenated across subjects
library(readr)
setwd('/Volumes/project/3022060.01') # project root on the mounted research drive
concatenated_all <- read_csv("analysis/concatenated_all.csv")
data <- concatenated_all # working copy; 'data' is re-assigned at the start of later sections
names(data) # inspect available columns
########################################################################
# Histograms for categorical variables
########################################################################
library(ggplot2)
library(cowplot)
# Integer-coded questionnaire items (see response keys at the top of the file)
columns_categorical <- c("LIS07_education_completed",
                         "VAR101_left_right_handed",
                         "LIS23_net_household_income",
                         "sex")
# Create an empty list to store plots (continuous-variable plots are appended later)
plot_list <- list()
# Loop through each column
for (column in columns_categorical) {
  # Observed response range; used to put an axis tick on every integer code.
  # (The previously computed 'num_bins' was never used and has been removed.)
  min_val <- min(data[[column]], na.rm = TRUE)
  max_val <- max(data[[column]], na.rm = TRUE)
  # Create histogram using ggplot2
  p <- ggplot(data, aes(x = !!rlang::sym(column))) +
    geom_histogram(binwidth = 1, fill = "#c9c9c9", color = "#000000") + # binwidth 1 so each integer value has its own bin
    scale_x_continuous(breaks = seq(min_val, max_val, by = 1)) + # show every integer value on the axis
    ggtitle(column) +
    theme_classic() +
    theme(plot.margin = margin(10, 10, 10, 10, "pt"),
          axis.title.x = element_blank(),
          plot.title = element_text(hjust = 0.5)) # center the title
  # Add the plot to the list
  plot_list[[length(plot_list) + 1]] <- p
}
# Arrange plots in a grid
#combined_plot <- cowplot::plot_grid(plotlist = plot_list, ncol = 2)
#print(combined_plot)
# Save the plot to a PDF file
#ggsave("plots/histograms_categorical_variables.pdf", combined_plot, width = 8.27, height = 11.69) # A4 size in inches
########################################################################
# Violin, box, and scatterplots for continuous variables
########################################################################
columns_continuous <- c("SPS_positive", "SPS_negative", "SPS_sum",
                        "ambiguity_aversion",
                        "tg_sent", # trust
                        "tg_return_sum", # trustworthiness
                        "riskaversion", "prudence",
                        "patience_outcome",
                        "BIG_openness_sum","BIG_neuroticism_sum",
                        "age")
# NOTE: plot_list is intentionally NOT reset here, so these plots are appended
# after the categorical histograms and saved together in one grid below.
#plot_list <- list()
# Loop through each column
for (column in columns_continuous) {
  # Mean of the variable, drawn as a dashed reference line on each plot
  mean_val <- mean(data[[column]], na.rm = TRUE)
  # Jittered raw data + violin + boxplot stacked on a single dummy x position
  p <- ggplot(data, aes(x = 1, y = !!rlang::sym(column))) +
    geom_jitter(width = 0.2, alpha = 0.2, size = 1, color = '#2389CA') +
    geom_violin(fill = "transparent", color = '#000000') +
    geom_boxplot(width = 0.1, fill = "transparent", color = "#000000") +
    # 'linewidth' replaces the deprecated 'size' aesthetic for line geoms (ggplot2 >= 3.4)
    geom_hline(yintercept = mean_val, color = "darkred", linetype = "dashed", linewidth = 0.75) + # Add mean line
    ggtitle(column) +
    theme_classic() +
    theme(plot.margin = margin(10, 10, 10, 10, "pt"),
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          plot.title = element_text(hjust = 0.5))
  # Add the plot to the shared list
  plot_list[[length(plot_list) + 1]] <- p
}
# Arrange plots in a grid (categorical + continuous together)
combined_plot <- cowplot::plot_grid(plotlist = plot_list, ncol = 4)
print(combined_plot)
# Save the plot to a PDF file
ggsave("plots/qc_plots.pdf", combined_plot, width = 11, height = 11)
########################################################################
# Print summary stats
########################################################################
data <- concatenated_all
# Build one row of summary statistics per continuous variable.
# Rows are assembled as proper one-row data frames rather than via
# rbind(df, c(...)): the c() approach coerced every statistic to character,
# and summary() appends an extra "NA's" element for columns containing NA,
# which shifted values out of alignment with the column names.
summary_rows <- lapply(columns_continuous, function(column) {
  vals <- data[[column]]
  # Min / Q1 / Median / Q3 / Max, rounded to 2 decimal points
  qs <- round(quantile(vals, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE), 2)
  data.frame(Variable = column,
             Min = qs[[1]],
             Q1 = qs[[2]],
             Median = qs[[3]],
             Mean = round(mean(vals, na.rm = TRUE), 2),
             Q3 = qs[[4]],
             Max = qs[[5]],
             SD = round(sd(vals, na.rm = TRUE), 2),
             stringsAsFactors = FALSE)
})
summary_df <- do.call(rbind, summary_rows)
# Print the summary data frame
print(summary_df)
# Export the summary data frame to a CSV file
write.csv(summary_df, file = "analysis/summary_statistics.csv", row.names = FALSE)
########################################################################
# No removed outliers
########################################################################
# Variables are all within a certain range or even ordinal such that excluding outliers would be arbitrary
########################################################################
# No transformations
########################################################################
# Non-parametric methods and robust SEM techniques such as bootstrapping or permutation tests do not rely on assumptions about the distribution of the data and can be more robust to violations of normality.
########################################################################
# Standardization: z-score across all variables
########################################################################
library(dplyr)
data <- concatenated_all # start from the raw concatenated data again
# Standardize a numeric vector to mean 0 / SD 1, ignoring missing values.
# NA entries propagate: they stay NA in the output.
calculate_zscore <- function(x) {
  centered <- x - mean(x, na.rm = TRUE)
  centered / sd(x, na.rm = TRUE)
}
# Z-score all columns except 'subject'
zscored_data <- data %>%
mutate(across(-subject, calculate_zscore))
head(zscored_data) # quick sanity check of the standardized values
# Save
write.csv(zscored_data, "analysis/concatenated_all_z.csv", row.names = FALSE)
########################################################################
# Exploratory factor analysis: check underlying structure
########################################################################
library(readr)
library(ggplot2)
library(gridExtra)
library(reshape2)
library(ggpubr)
library(grid)
library(rstatix)
library(data.table)
library(ggcorrplot)
library(igraph)
library(ggraph)
library(tidygraph)
library(viridis)
library(Hmisc)
# Function to grab just the legend grob from a ggplot, so it can be placed
# independently in a grid layout.
library(gridExtra)
get_legend <- function(myggplot) {
  tmp <- ggplot_gtable(ggplot_build(myggplot))
  grob_names <- sapply(tmp$grobs, function(x) x$name)
  # ggplot2 < 3.5 names the legend grob exactly "guide-box"
  leg <- which(grob_names == "guide-box")
  if (length(leg) == 0) {
    # ggplot2 >= 3.5 splits legends into "guide-box-right", "guide-box-bottom",
    # etc., with empty positions as zeroGrobs; keep only non-empty ones
    leg <- which(startsWith(grob_names, "guide-box") &
                   !sapply(tmp$grobs, inherits, "zeroGrob"))
  }
  if (length(leg) == 0) {
    stop("get_legend(): no legend found in plot", call. = FALSE)
  }
  legend <- tmp$grobs[[leg[1]]]
  return(legend)
}
concatenated_all_z <- read_csv("analysis/concatenated_all_z.csv")
data <- as.data.table(concatenated_all_z)
names(data)
# Rename columns to human-readable labels for plotting
setnames(data, old = c('SPS_sum', 'SPS_positive', 'SPS_negative', 'ambiguity_aversion', 'tg_sent', 'tg_return_sum', 'riskaversion', 'prudence', 'patience_outcome', 'sex', 'age', 'BIG_neuroticism_sum', 'BIG_openness_sum'),
         new = c('SPS sum', 'SPS positive dimension', 'SPS negative dimension', 'ambiguity aversion index', 'trust', 'trustworthiness', 'risk aversion', 'prudence', 'patience', 'sex (M-, F+)', 'age', 'neuroticism', 'openness'))
names(data)
# Define columns to be selected
selected_columns <- c('SPS sum', 'SPS positive dimension', 'SPS negative dimension', 'ambiguity aversion index', 'trust', 'trustworthiness', 'risk aversion', 'prudence', 'patience', 'sex (M-, F+)', 'age', 'neuroticism', 'openness')
# Subset data to include only the selected columns using .SD
data_subset <- data[, .SD, .SDcols = selected_columns]
names(data_subset)
# Convert data subset to numeric matrix
data_subset_matrix <- as.matrix(data_subset)
# Compute correlation matrix between selected variables
cor.mat <- cor(data_subset_matrix, method = "spearman", use = "pairwise.complete.obs")
# Save
write.csv(cor.mat, "analysis/cor_mat.csv", row.names = TRUE)
# Compute p-values for the SAME statistic as the correlations above.
# Hmisc::rcorr defaults to type = "pearson", which previously paired Pearson
# p-values with Spearman correlations; the type must be set explicitly.
p.mat <- rcorr(data_subset_matrix, type = "spearman")$P
# Save
write.csv(p.mat, "analysis/p_mat.csv", row.names = TRUE)
# Create correlation plot. 'pi' exists only to donate its bottom legend;
# 'p1' (with significance crosses from p.mat) is the plot actually shown.
pi <- ggcorrplot(cor.mat,
                 method = "square",
                 type = "lower",
                 outline.color = "#ffffff",
                 lab = TRUE,
                 digits = 2,
                 colors = c("#0048A0","#ffffff","#C10000"), # order: low, middle, high
                 tl.col = "#000000",
                 legend.title = "Spearman\ncorrelation",
                 hc.order = FALSE,
                 # Previously passed positionally, which silently bound this
                 # theme() to the 'ggtheme' formal; name it so the intent —
                 # a wide, bottom-positioned legend key — is explicit.
                 ggtheme = theme(legend.key.width = unit(2.5, 'cm'),
                                 legend.position = "bottom"))
p1 <- ggcorrplot(cor.mat,
                 p.mat = p.mat, # marks non-significant cells
                 method = "square",
                 type = "lower",
                 outline.color = "#ffffff",
                 lab = TRUE,
                 digits = 2,
                 colors = c("#0048A0","#ffffff","#C10000"), # order: low, middle, high
                 tl.col = "#000000",
                 legend.title = "Spearman\ncorrelation",
                 hc.order = FALSE,
                 lab_size = 3)
# Pick which legend from which plot
legend <- get_legend(pi)
# Remove legend from main plot
p1 <- p1 + theme(legend.position="none")
########################################################################
# Network analysis
########################################################################
library(readr)
# Read the CSV file into a tibble
cor_mat <- read_csv("analysis/cor_mat.csv")
# Convert the tibble to a data frame
cor_mat <- as.data.frame(cor_mat)
# Set row names from the first column
rownames(cor_mat) <- cor_mat[[1]]
# Remove the first column
cor_mat <- cor_mat[, -1, drop = FALSE]
# Remove the row named 'SPS sum'
cor_mat <- cor_mat[!rownames(cor_mat) %in% "SPS sum", , drop = FALSE]
# Remove the column named 'SPS sum'
cor_mat <- cor_mat[, !colnames(cor_mat) %in% "SPS sum", drop = FALSE]
# Load and trim the p-value matrix in exactly the same way
p_mat <- read_csv("analysis/p_mat.csv")
p_mat <- as.data.frame(p_mat)
rownames(p_mat) <- p_mat[[1]]
p_mat <- p_mat[, -1, drop = FALSE]
p_mat <- p_mat[!rownames(p_mat) %in% "SPS sum", , drop = FALSE]
p_mat <- p_mat[, !colnames(p_mat) %in% "SPS sum", drop = FALSE]
# Vertices (or nodes): name of each variable (12 remain after dropping 'SPS sum')
va <- names(cor_mat[,c(1:12)])
# Edges: correlation value (r) and significance (p)
## Melt data
cor_mat$id.vars <- rownames(cor_mat)
p_mat$id.vars <- rownames(p_mat)
# Column 13 is the 'id.vars' column just appended; columns 1-12 are the variables
id.vars <- names(cor_mat[,c(13)])
measure.vars <- names(cor_mat[,c(1:12)])
cor_mat <- as.data.table(cor_mat)
p_mat <- as.data.table(p_mat)
cor_melt <- melt(cor_mat, id.vars=c(id.vars), measure.vars=c(measure.vars))
head(cor_melt)
p_melt <- melt(p_mat, id.vars=c(id.vars), measure.vars=c(measure.vars))
head(p_melt)
# Merge vertices and edges together
ed <- merge(cor_melt, p_melt[,c("id.vars","variable","value")], by=c("id.vars","variable"), all=TRUE) # merge correlations and p-values
head(ed)
colnames(ed) <- c('node_a','node_b','r','p') # rename
## delete nonsignificant correlations
# NOTE(review): each unordered pair appears twice (a-b and b-a), and rows with
# NA p-values (the matrix diagonal, i.e. self-pairs) are dropped here via
# data.table's NA handling in logical subsetting — confirm both are intended.
ed_sig <- ed[!(ed$p > 0.05),]
# generate graph object (undirected; edges keep r and p as attributes)
ig <- igraph::graph_from_data_frame(d=ed_sig, vertices=va, directed = FALSE)
# add labels to nodes
# NOTE(review): labels are assigned by position, so their order must match the
# vertex order in 'va' — verify with the print() below.
tg <- tidygraph::as_tbl_graph(ig) %>% # graph object
tidygraph::activate(nodes) %>%
dplyr::mutate(label=c('SPS\npositive\ndimension', 'SPS\nnegative\ndimension', 'ambiguity\naversion\nindex', 'trust', 'trustworthiness', 'risk\naversion', 'prudence', 'patience', 'sex\n(M-, F+)', 'age', 'neuroticism', 'openness'))
# Print nodes data frame and grouping variable vector; make sure they match
print(as_tibble(tg))
# Add the grouping variable to the nodes data frame
# Groups: 1 = SPS, 2 = decision-making, 3 = covariate (legend labels set in the plot below)
tg <- tg %>%
mutate(group_variable = c(1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3))
tg #tidygraph object with node and edge data
# Calculate degree centrality
degree.cent <- centr_degree(tg, mode = "all")
degree.cent$res
# Calculate closeness centrality: degree of closeness or proximity of a node to all other nodes in a network
closeness.cent <- closeness(tg, mode="all")
closeness.cent
# Plot in a network: edge colour encodes Spearman r, node colour the variable group
library(ggraph)
library(ggplot2)
p2 <- ggraph(tg, layout = 'stress', circular = FALSE) +
geom_edge_arc(lineend = 'butt', linejoin = 'round',
linemitre = 2,
strength = 0, # no arc bend, i.e. straight edges
edge_width = 1,
aes(colour = r)) +
geom_node_point(size = 7,
alpha = 0.6,
aes(colour = factor(group_variable))) +
geom_node_text(aes(label = label),
repel = TRUE, # push labels away from overlapping nodes
point.padding = unit(2, "lines"),
size=4,
colour="#000000") +
theme_graph(background = "white") +
theme(legend.position = "right") +
guides(edge_width = 'none',
edge_alpha = 'none') +
scale_edge_colour_gradient2(
'Spearman r',
low = "#0048A0",
mid = "#ffffff",
high = "#C10000",
midpoint = 0,
space = "Lab",
na.value = "#000000",
guide = "edge_colourbar",
aesthetics = "edge_colour",
limits = c(-1, 1)) + # fixed limits so edge colours are comparable across runs
scale_color_manual(name = "Variable group",
breaks = c(1, 2, 3), # Specify breaks to match levels of group_variable
labels = c("SPS", "Decision-making", "Covariate"), # Add labels for each level
values = c("purple", "darkorange", "darkgreen")) # Add colors for each level
# Arrange plot and legend vertically: correlation plot, its legend, then network
combined_plot <- grid.arrange(p1, legend, p2,
ncol = 1, nrow=3,
heights = c(1, 0.1, 1))
# Save
ggsave("plots/corr_network_plot.pdf", plot = combined_plot, width = 8.27, height = 11.69) # A4 size
########################################################################
# Internal consistency
########################################################################
# Merge SPSQ item-wise scores across subjects into one wide data frame
library(dplyr)
library(readr)
working_dir <- 'analysis/data'
out_dir <- 'analysis'
# One directory per subject, each containing a qst-post-3_SPS.csv
subjects <- list.dirs(working_dir, recursive = FALSE, full.names = TRUE)
# NOTE: the previous line
#   columns <- c(subject, names(qst_post_3_SPS))
# referenced two undefined objects (a runtime error) and was never used; removed.
spsq_all <- NULL
for (subject_dir in subjects) {
  # Subject ID is the prefix of the directory name, before the first underscore
  subject_num <- sub("_.*", "", basename(subject_dir))
  print(paste("Processing:", subject_num))
  data <- read_csv(file.path(subject_dir, 'qst-post-3_SPS.csv'), show_col_types = FALSE)
  # Add subject column to data
  data$subject <- subject_num
  if (is.null(spsq_all)) {
    # First subject: start the accumulator
    spsq_all <- data
  } else {
    # Identify common columns for merging
    common_cols <- intersect(names(spsq_all), names(data))
    # Merge data frames using common columns
    spsq_all <- merge(spsq_all, data, by = common_cols, all = TRUE)
  }
}
write_csv(spsq_all, file.path(out_dir, 'spsq_all.csv'))
names(spsq_all)
# Calculate Cronbach's alpha for the positive and negative dimensions of SPSQ-24
library(psych)
# Return the raw Cronbach's alpha for the given item columns of a data frame
calculate_alpha <- function(items, dataframe) {
  alpha_result <- psych::alpha(dataframe[, items])
  alpha_result$total$raw_alpha
}
# Define the lists of items
# Item groupings follow the SPSQ scoring used above: the full scale (spsq_sum)
# plus its positive and negative dimensions, which are subsets of the full list.
spsq_sum <- c('SPS13_emotionally_touched_music_art', 'SPS29_notice_subtle_touching_tones_music', 'SPS31_very_movedy_nice_work_of_art', 'SPS02_nervous_to_many_things_at_once', 'SPS24_rushed_too_much_little_time', 'SPS28_upset_when_people_ask_many_things_at_once', 'SPS03_see_sad_eyes_behind_smile', 'SPS04_strikes_tone_voice_not_matching_words', 'SPS09_looking_eyes_telling_truth', 'SPS12_strikes_when_acting_not_afraid', 'SPS22_tell_smile_masking_feelings',
'SPS05_reversed', #'SPS05_hard_enjoy_little_things_reversed'
'SPS10_feel_good_with_people_I_love', 'SPS17_enjoy_humour_situations', 'SPS19_enjoy_relaxing_activity', 'SPS30_watching_nice_movie_feels_good', 'SPS06_flashing_lights_bother', 'SPS16_easily_disturbed_light_odors', 'SPS20_loud_noises_irritating', 'SPS25_suffer_bright_light', 'SPS14_immediately_feel_mouth_throat_drier', 'SPS15_hardly_visible_details_attract_attention', 'SPS21_quickly_aware_changes_body', 'SPS23_notice_faints_smells')
positive_dimension <- c('SPS13_emotionally_touched_music_art', 'SPS29_notice_subtle_touching_tones_music', 'SPS31_very_movedy_nice_work_of_art', 'SPS03_see_sad_eyes_behind_smile', 'SPS04_strikes_tone_voice_not_matching_words', 'SPS09_looking_eyes_telling_truth', 'SPS12_strikes_when_acting_not_afraid', 'SPS22_tell_smile_masking_feelings',
'SPS05_reversed', #'SPS05_hard_enjoy_little_things_reversed'
'SPS10_feel_good_with_people_I_love', 'SPS17_enjoy_humour_situations', 'SPS19_enjoy_relaxing_activity', 'SPS30_watching_nice_movie_feels_good', 'SPS14_immediately_feel_mouth_throat_drier', 'SPS15_hardly_visible_details_attract_attention', 'SPS21_quickly_aware_changes_body', 'SPS23_notice_faints_smells')
negative_dimension <- c('SPS02_nervous_to_many_things_at_once', 'SPS24_rushed_too_much_little_time', 'SPS28_upset_when_people_ask_many_things_at_once', 'SPS06_flashing_lights_bother', 'SPS16_easily_disturbed_light_odors', 'SPS20_loud_noises_irritating', 'SPS25_suffer_bright_light')
# Calculate Cronbach's alpha for each list, using the merged item-wise data
alpha_spsq_sum <- calculate_alpha(spsq_sum, spsq_all)
alpha_positive_dimension <- calculate_alpha(positive_dimension, spsq_all)
alpha_negative_dimension <- calculate_alpha(negative_dimension, spsq_all)
# Output the results
cat("Cronbach's alpha for spsq_sum:", alpha_spsq_sum, "\n")
cat("Cronbach's alpha for positive_dimension:", alpha_positive_dimension, "\n")
cat("Cronbach's alpha for negative_dimension:", alpha_negative_dimension, "\n")
# Create a data frame for the results
alpha_results <- data.frame(
items = c("spsq_sum", "positive_dimension", "negative_dimension"),
alpha = c(alpha_spsq_sum, alpha_positive_dimension, alpha_negative_dimension)
)
# Write the results to a CSV file
write.csv(alpha_results, 'analysis/cronbachs_alpha.csv', row.names = FALSE)