/
7_covariate_selection_MCV1_delayed.R
353 lines (272 loc) · 11.3 KB
/
7_covariate_selection_MCV1_delayed.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
#Load these libraries
library(Metrics)
library(plyr)
library(xtable)
library(ggplot2)
library(reshape2)
library(MASS)
require(dplyr)
library(car)
library(gtools)
set.seed(500)
#Set working directory
setwd("filepath")
#Read in covariate and outcome data sets
cov_dat <- read.csv("Covariates_The Gambia_2016_2019.csv", header = TRUE)
out_dat <- read.csv("outcome_data.csv", header=TRUE)
head(out_dat)
head(cov_dat)
#Aggregate outcome data file to cluster level
out_dat1 <- group_by(out_dat, DHSCLUST) %>%
summarize(total = n(),
total_mcv1 = length(which(!is.na(mcv1_timely))),
mcv1_timely_count = sum(mcv1_timely, na.rm = TRUE),
mcv1_timely_prop = mcv1_timely_count/total_mcv1,
mcv1_early_count = sum(mcv1_early, na.rm = TRUE),
mcv1_early_prop = mcv1_early_count/total_mcv1,
mcv1_delayed_count = sum(mcv1_delayed, na.rm = TRUE),
mcv1_delayed_prop = mcv1_delayed_count/total_mcv1)
#sd(out_dat1$mcv1_timely_prop)
#sd(out_dat1$mcv1_delayed_prop)
#sd(out_dat1$mcv1_early_prop)
#Merge both the outcome and covariate data using the ID variable
#Keep all locations where covariate data are available
data.merge <- merge(cov_dat, out_dat1, by = "DHSCLUST", all.x = TRUE)
head(data.merge)
#Names of covariates in the covariate data file
cov_names <- names(cov_dat)[-c(1:4)]
#Rows of data with at least one missing covariate value
del <- numeric(nrow(data.merge))
for (i in 1:nrow(data.merge)){
if (any(is.na(data.merge[i,5:33]))) del[i] <- i
}
#Delete rows with missing covariate values
if (length(which(del!=0))>0) data.merge <- data.merge[-which(del!=0),]
#Delete rows of data where no individual was sampled from the data frame
zero.clust <- which(is.na(data.merge$total_mcv1)|data.merge$total_mcv1<=1) #NOTE THAT CLUSTERS WITH SS<=1 HAVE BEEN DELETED
if (length(zero.clust)>0){
data.merge <- data.merge[-zero.clust,]
}
#Outcome variable
total_sampled <- data.merge$total_mcv1 # Total number of vaccinated individuals/children surveyed in each location
num_success <- data.merge$mcv1_delayed_count # Number who had delayed vaccination
#Extract lon-lat coordinates from data frame
dat.coord <- data.merge[,4:3]
#dat_coord <- dat.coord[,c(1,4,3)]
head(dat.coord)
#Data frame with covariates only
covars <- data.merge[,c(5:33)]
###########################################################################
1. #Use histograms to determine which covariates to log-transform
#Ideally, the relationships between the covariates and the empirical
#logit transform of the data shoudl be examined to ensure that the transformation
#improves the linearity
par(mfrow=c(2,5))
for (i in 1:10){
hist(covars[,i], main = names(covars)[i])
}
windows()
par(mfrow=c(2,5))
for (i in 11:20){
hist(covars[,i], main = names(covars)[i])
}
windows()
par(mfrow=c(2,5))
for (i in 21:29){
hist(covars[,i], main = names(covars)[i])
}
#windows()
#par(mfrow=c(2,5))
#for (i in 31:40){
# hist(covars[,i], main = names(covars)[i])
#}
#Take logs of heavily skewed covariates
#1,2,4,7,8,13:15,16,17,27
llog <- c(1,2,4,7,8,13:15,16,17,27)
for (i in 1:length(llog)){
covars[,llog[i]] <- log(covars[,llog[i]] + 0.05) #0.05 added to avoid taking log of zero
#if (i==4) covars[,llog[i]] <- log(covars[,llog[i]] + 4.0)
names(covars)[llog[i]] <- paste0("l_",names(covars)[llog[i]]) #Add "l" to the names of log-transformed covariates
}
#Linearity checks
out_emp_logit <- log(num_success + 0.5/(total_sampled - num_success + 0.5)) #Empirical logit transform
#Data frame for plotting
cdat <- covars
dd <- data.frame(out_emp_logit, cdat)
meltdat=melt(dd, id.vars=c("out_emp_logit"))
#Linear regression lines
plotsingles1=ggplot(meltdat, aes(x=value, y=out_emp_logit, xlab="")) +
geom_point()+
geom_smooth(col="red", method = "loess", span=0.8)+
geom_smooth(method="lm", fill=NA)+
#stat_smooth(method = "gam", formula = y ~ s(x),se=FALSE, col="green")+ #natural splines
labs(x="", y="Empirical logit (mcv1_delay)")+
facet_wrap(~variable, scale="free_x", ncol = 5)+ # #scale=free, free_x, free_y
scale_y_continuous()+
theme(panel.background=element_rect(fill="white", colour="black"))
plotsingles1
#Save plot
pdf("cov_plot_emp_logit_mcv1_delay.pdf", height = 7, width = 7, pointsize = 6)
plotsingles1
dev.off()
#Remove data sets not needed for further processing
rm(meltdat)
rm(cdat)
rm(dd)
#Remove some bad covariates
#covars <- covars[,-c(1,2)]
###Declare categorical covariates (e.g. urban-rural) as factors
covars <- data.frame(covars)
#covars <- within(covars, {
# Urban_rural1 <- factor(Urban_rural1)
#})
#-------------------------Covariate selection starts from here--------------------#
###########################################################################
2. #Fit single covariate models and rank the covariates based on their predictive R-square values
#Create a new data frame for this exercise
Data <- cbind(total_sampled, num_success, covars)
covlist <- names(Data[,-1]) #List of covariates
covlist=covlist[-grep("num_success", covlist)]
##The function below fits single covariate models using a Monte Carlo cross-validation approach
#repeated n.iter times with 20% of the data used for validation each time.
#The validation statistics calculated are averaged over the n.iter repetitions in the end
AICs=dev=nulldev=n.par=r2=pr2=iter=numeric()
model=response=character()
n.iter=10
propsub=0.8
resp="cbind(num_success, total_sampled-num_success)"
resp1 = "num_success"
weights = "total_sampled"
for(i in 1:n.iter){
subind=sample(1:nrow(Data), propsub*nrow(Data), replace=F)
subdat=Data[subind,]
preddat=Data[-subind,]
## the null model, for comparison
nullmod=glm(formula(paste(resp, "~1")), data=Data, family = binomial(logit))
for(j in covlist){
form=formula(paste(resp, "~", j))
pmod=glm(form, data=subdat, family = binomial(logit))
fullmod=glm(form, data=Data, family = binomial(logit))
## pr2 checks the predictive power of the model against a 'new' subset of the data
pr2=c(pr2,cor(pmod$fitted, subdat[[resp1]]/subdat[[weights]])^2)
## AIC for the model on the full data
AICs=c(AICs, AIC(fullmod))
dev=c(dev, deviance(fullmod))
n.par=c(n.par,1)
iter=c(iter,i)
r2=c(r2,cor(fullmod$fitted, Data[[resp1]]/Data[[weights]])^2)
model=c(model,j)
response=c(response, resp1)
#print(paste(i, j, sep="X"))
}
}
op=data.frame(model, response, dev, AICs, r2, pr2, n.par )
singles=ddply(op, .(model), summarise,
model=model[1],
response=response[1],
dev=dev[1],
pr2=mean(pr2),
AICs=AICs[1],
r2=r2[1])
### Add deviance reduction
singles$devred=singles$dev/deviance(nullmod)
## Order by pr2 - predictive R-squared
singles=singles[order(singles$pr2, decreasing=TRUE),]
#Ranks
singles <- within(singles, ranks <- 1:nrow(singles))
singles
#Covariates and ranks
cov.rank <- data.frame(covariate=as.character(singles$model), ranks = singles$ranks)
#NB: You coudl decide to exclude some covariates that didn't perform well after this step
#But for this exercise, we will keep all of them
###
#Separate categorical covariates before this step
#Keep old covars data frame
covars.old <- covars
#Remove urban-rural covariate
#covars <- covars[,-15]
###########################################################################
3. ##Detection of multicollinearity through correlations between covariates and VIF analysis
#Correlations
#Determine correlations between the covariates and extract highly correlated pairs
#for screening and elimination.
#Flag covariate pairs with correlations >= 0.8 #change to a higher value if necessary
corrs <- cor(covars[,-1])
bigcors <- matrix(0,1, 2)
for (i in 2: nrow(corrs)){
for (j in 1:(i-1)){
if (abs(corrs[i,j])>=0.8) bigcors <- rbind(bigcors, c(i,j)) #Note 0.8 cut-off
}
}
bigcors <- bigcors[-1,]
bigcors.dat <- matrix(0, nrow(bigcors), 3)
for (i in 1:nrow(bigcors)){
bigcors.dat[i,] <- c(rownames(corrs)[bigcors[i,1]],colnames(corrs)[bigcors[i,2]],
round(corrs[bigcors[i,1],bigcors[i,2]],3))
}
#Pairs of covariates with high correlations
bigcors.dat
#Select between pairs of highly-correlated covariates using their ranks
all.covs <-rownames(corrs)
for (i in 1:nrow(bigcors.dat)){
#print(i)
name.cov <- bigcors.dat[i,1:2]
if (name.cov[1]%in%all.covs && name.cov[2]%in%all.covs){
r1 <- which(cov.rank$covariate==name.cov[1])
r2 <- which(cov.rank$covariate==name.cov[2])
if (r1>r2) all.covs <- all.covs[-which(all.covs==name.cov[2])]
if (r2>r1) all.covs <- all.covs[-which(all.covs==name.cov[1])]
}
}
#Check that all remaining covariates are not highly correlated
correl <- cor(covars[,all.covs])
correl #Correlation matrix
#Create new data frame containing only "uncorrelated" covariates
covars.1a <- covars[,all.covs]
#Reconstitute "Data" to include categorical covariates (urban-rural)
urban_rural <- data.merge$rural_urba
urban_rural1 <- numeric(length(urban_rural))
urban_rural1[urban_rural=="Urban"] <- 1
#all.covs1 <- c(all.covs,names(covars.old)[15])
covars.1 <- data.frame(covars.1a, urban_rural = urban_rural1)
Data <- cbind(total_sampled, num_success, covars.1)
#Define categorical covariates as factors
Data <- data.frame(Data)
Data <- within(Data, {
urban_rural1 <- factor(urban_rural)
})
#VIF analysis using all the remaining covariates
covnames <- as.character(names(covars.1))
form <- paste("cbind(num_success, total_sampled - num_success)","~", paste(covnames, collapse=" + "))
mod.vif <- glm(form, data=Data, family = binomial(logit))
summary(mod.vif)
vif(mod.vif)
###########################################################################
4. ########################## Method 1
#StepAIC regression - k=2 penatly
n <- nrow(Data)
form <- paste("cbind(num_success, total_sampled - num_success)","~", paste(covnames, collapse=" + "))
fit <- glm(form, data = Data, family = binomial(logit))
mod.step1 <- stepAIC(fit, trace=FALSE, direction = "backward") #Note penalty
summary(mod.step1) #AIC: 3791.1
vif(mod.step1)
4. ########################## Method 2
#StepAIC regression - k=log(n) BIC penalty
fit <- glm(form, data = Data, family = binomial(logit))
mod.step2 <- stepAIC(fit, trace=FALSE, direction = "backward", k = log(n)) #Note penalty
summary(mod.step2)
vif(mod.step2)
##Selected covariates using BIC penalty
GMB_n94_avg_number_WetDays_2016_2019 GMB_n99_avg_modis_Night_time_landsurface_temp_2016.2019_mean
2.847201 2.944535
GMB_n102_avg_potential_evapotranspiration_2016_2019_mean
5.039711
#Write selected covariate information for the next step of the analysis
covout <- c("GMB_n94_avg_number_WetDays_2016_2019",
"GMB_n99_avg_modis_Night_time_landsurface_temp_2016.2019_mean",
"GMB_n102_avg_potential_evapotranspiration_2016_2019_mean",
"urban_rural")
Data.out <- Data[,covout]
Data.out$DHSCLUST <- data.merge$DHSCLUST
Data.out1 <- data.frame(Data.out, dat.coord)
write.csv(Data.out1, "mcv1_delayed_covariates_selected.csv")