-
Notifications
You must be signed in to change notification settings - Fork 0
/
Stratified_Data_Analysis.R
276 lines (224 loc) · 9.67 KB
/
Stratified_Data_Analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
################################################################################
#
# This script contains code written by myself (Dario Domingo) to undertake the
# project: "Role of co-packaging in diarrhoea treatment for children in Zambia".
# The project is part of the "Statisticians for Society" initiative of the Royal
# Statistical Society.
# Github repo: https://github.com/dario-domi/Diarrhoea-Treatment-in-Zambia
#
# The statistical analyses are divided into two parts:
# - Part 1: Analysis of the aggregate data (Aggregate_Data_Analysis.R)
# - Part 2: Analysis of the data as stratified across different health centres
# (this script)
#
# Results of the analyses are discussed in an associated manuscript, about to be
# submitted for publication to PLOS Global Public Health.
#
################################################################################
################################################################################
#
#### PRELIMINARY OPERATIONS (interactive question, variable loading) ####
#
################################################################################
##################################################################
# INTERACTIVE QUESTION TO DECIDE WHETHER TO DIPLAY RESULTS
# Ask the user if they want results to be printed to standard output
cat("\nDo you want summarised results on the stratified analyses",
"to be shown? (y/n)\n")
answer <- readline()
###############################################
# LOAD VARIABLES FROM PREVIOUS SCRIPT
# Set working directory (change as appropriate)
# wd <- "/Users/Durham/Desktop/Academia/Other Projects/Statisticians4Society/Code/"
# setwd(wd)
# Source script to load pre-processed data and additional variables
source("Aggregate_Data_Analysis.R")
#############################################
################################################################################
#
##### STATISTICAL TESTS/ANALYSES, STRATIFIED DATA #####
#
################################################################################
####################################################################
# CONFIDENCE INTERVALS OF PROPORTION (FOR EACH FACILITY)
# For each year (2016 & 2017), create one 7x3 tibble (prop_HC_16 & prop_HC_17).
# The tibble contains the CIs of the proportion of correctly-dispensed cases (CDCs),
# for each of the seven facilities
# Initialise the two tibbles, including central estimate of proportions
facility_props_16 <- df16 %>% mutate(lwr = 0,
mid = 100*CDCs/tot,
upr = 0) %>%
select(facility, lwr, mid, upr) %>%
as.data.frame()
facility_props_17 <- df17 %>% mutate(lwr = 0,
mid = 100*CDCs/tot,
upr = 0) %>%
select(facility, lwr, mid, upr) %>%
as.data.frame()
# Populate lwr and upr columns of CIs in facility_props_16 and facility_props_17
n <- nrow(df16)
for (i in 1:n){
# 2016
xi <- df16$CDCs[i]
ni <- df16$tot[i]
ci <- exactci(xi, ni, conf.level = 0.95)$conf.int
facility_props_16[i, c("lwr", "upr")] <- as.list(100*ci)
#2017
xi <- df17$CDCs[i]
ni <- df17$tot[i]
ci <- exactci(xi, ni, conf.level = 0.95)$conf.int
facility_props_17[i, c("lwr", "upr")] <- as.list(100*ci)
}
# Remove temporary variables
rm(i, xi, ni, ci)
# Print: CI of proportions for each facility
if (answer=="y") {
cat("Proportions of CDCs and 95% Confidence Interval\n")
for (i in 1:7){
x <- facility_props_16[i,]
y <- facility_props_17[i,]
cat(x$facility, "\n")
cat(" 2016:", sprintf("%.3g \t(%.1f, %.1f)\n",
x$mid, x$lwr, x$upr))
cat(" 2017:", sprintf("%.3g \t(%.1f, %.1f)\n",
y$mid, y$lwr, y$upr))
}
cat("\n")
rm(i, x, y)
}
############################################################################
# CONFIDENCE INTERVALS OF DIFFERENCE OF PROPORTION (FOR EACH FACILITY)
# Print CI for difference of proportions, for each facility
if (answer=="y") {
cat("Difference of Proportions and 95% Confidence Interval\n")
for (j in 1:7){
wci <- wald2ci(df17$CDCs[j], df17$tot[j], df16$CDCs[j], df16$tot[j],
conf.level = 0.95, adjust = "Wald")
x <- 100*wci$estimate
y <- 100*wci$conf.int
cat(df16$facility[j], ":\t", sprintf("%.2g (%.1f, %.1f)\n", x, y[1], y[2]), sep="")
}
rm(wci, x, y)
}
#################################################################
# CREATE TABLE WITH COUNTS OF CORRECTLY/INCORRECTLY
# DISPENSED CASES, FOR EACH CENTRE AND YEAR
# First create a 2D data frame, with last variable giving counts of cases
# for each combination of centre x copack(Y/N) x correct_dispensing(Y/N).
# nrows(Treated_Cases) = 7x2x2=28
#
# Then transform into a 3D table (dim: 2x2x7) with counts in each cell.
library(magrittr)
# Prepare empty dataframe
n <- nrow(df16)
Treated_Cases <- data.frame(Facility = rep( df16 %>% pull(facility) , each=4),
Co_pack = rep( c("Y", "Y", "N", "N"), n),
Correct = rep( c("Y", "N", "Y", "N"), n),
Count = as.numeric(rep(NA, 4*n))
)
# Mutate each column into the corresponding factor
Treated_Cases %<>% mutate(Facility = factor(Facility, levels = unique(Facility)),
Co_pack = factor(Co_pack, levels = unique(Co_pack)),
Correct = factor(Correct, levels = unique(Correct))
)
# Populate the dataframe with counts of CDCs,
# looping through facilities and before/after co-pack
for (fac in levels(Treated_Cases$Facility)){ # loop over facilities
for (cp in c("Y", "N")){ # loop over with/without co-pack
# Select appropriate dataframe according to value of cp (co-pack)
if (cp=="Y") df <- df17
else df <- df16
# Extract counts of CDCs and Incorrectly DCs
vals <- df %>% filter(facility==fac) %>%
summarise(CDCs, tot-CDCs) %>%
as.numeric()
# Assign 'vals' to the appropriate position in Treated_Cases$Counts
Treated_Cases %<>% mutate(Count = replace(Count,
Facility==fac & Co_pack==cp,
vals) )
}
}
# Remove temporary variables
rm(fac, cp, vals)
# Transform data frame into 2x2x7 table, for use in CMH test
CDC_Table <- xtabs(Count ~ Correct + Co_pack + Facility, data=Treated_Cases)
#################################################################
# COCHRAN-MANTEL-HAENSZEL TEST
# Perform Cochran-Mantel-Haenszel test
cmh <- mantelhaen.test(CDC_Table, alternative = "t")
# Print result to std output if requested
out <- paste("\nResults of Mantel-Haenszel test:\n",
"H0: \t\t %s = %g\n",
"H1: \t\t %s \n",
"Chi2: \t\t %.4g\n",
"p-value: \t %.1g\n",
"OR estimate: \t %.2f\n",
"OR 95%% CI: \t (%.2f, %.2f)",
"\n"
)
# Print summary of Mantel-Haenszel test results
if (answer=="y") cat(sprintf(out,
names(cmh$null.value), cmh$null.value,
cmh$alternative,
cmh$statistic,
cmh$p.value,
cmh$estimate,
cmh$conf.int[1], cmh$conf.int[2]))
# Check that value of common OR in Mantel-Haenszel test is as expected
a <- 0
b <- 0
for (i in 1:7){
a <- a + CDC_Table[1,1,i]*CDC_Table[2,2,i]/sum(CDC_Table[,,i])
b <- b + CDC_Table[1,2,i]*CDC_Table[2,1,i]/sum(CDC_Table[,,i])
}
a/b; rm(a, b)
#################################################################
# ODDS RATIOS FOR INDIVIDUAL CENTRES AND TOTAL ODDS RATIO
# Table (tibble) to be used for forest plot (see Plots.R)
OR_data <- tibble(facility = df16$facility,
mean = rep(0,7),
lower = rep(0,7),
upper = rep(0,7),
Odds16 = rep("a",7),
Odds17 = rep("b",7),
OR = rep(NA, 7),
empty = rep(NA, 7)
)
# Fill values of the above tibble
for (centre in 1:n){
# 2x2 table with CDCs and not CDCs, before and after co-pack
Tb <- CDC_Table[ , , centre]
# Extract each xi and ni
x1 <- Tb[1, "Y"] # CDCs in 2017
n1 <- sum(Tb[, "Y"]) # tot in 2017
x2 <- Tb[1, "N"] # CDCs in 2016
n2 <- sum(Tb[, "N"]) # tot in 2016
# Odds ratio and Conf. Interval
ci <- orscoreci(x1, n1, x2, n2, conf.level = 0.95)
ci <- as.numeric(ci$conf.int)
OR <- (x1/(n1-x1)) / (x2/(n2-x2)) # exp(mean(log(ci)))
# Create strings of CDCs/(Not CDCs) counts, for 2016 and 2017
s1 <- as.character(x1)
s2 <- as.character(n1-x1)
str17 <- paste0(s1, "/", s2)
s1 <- as.character(x2)
s2 <- as.character(n2-x2)
str16 <- paste0(s1, "/", s2)
# Populate the OR_data tibble
OR_data$mean[centre] <- OR
OR_data$lower[centre] <- ci[1]
OR_data$upper[centre] <- ci[2]
OR_data$OR[centre] <- as.character(signif(OR,2))
OR_data$Odds16[centre] <- str16
OR_data$Odds17[centre] <- str17
}
# Fill in information for aggregate data
OR_summary <- OR_data %>% slice(1)
OR_summary$facility <- "Total"
OR_summary$OR <- as.character(round(cmh$estimate,1))
OR_summary$mean <- cmh$estimate
OR_summary$lower <- cmh$conf.int[1]
OR_summary$upper <- cmh$conf.int[2]
Tb <- apply(CDC_Table, c(1,2), sum)
OR_summary$Odds16 <- paste0( as.character(Tb[1,2]), "/", as.character(Tb[2,2]))
OR_summary$Odds17 <- paste0( as.character(Tb[1,1]), "/", as.character(Tb[2,1]))