/
ds.replaceNA.R
152 lines (141 loc) · 6.55 KB
/
ds.replaceNA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#'
#' @title Replaces the missing values in a server-side vector
#' @description This function identifies missing values and replaces them by a value or
#' values specified by the analyst.
#' @details This function is used when the analyst prefers or requires complete vectors.
#' It is then possible the specify one value for each missing value by first returning
#' the number of missing values using the function \code{ds.numNA} but in most cases,
#' it might be more sensible to replace all missing values by one specific value e.g.
#' replace all missing values in a vector by the mean or median value. Once the missing
#' values have been replaced a new vector is created.
#'
#' \strong{Note}: If the vector is within a table structure such as a data frame the new vector is
#' appended to table structure so that the table holds both the vector with and without
#' missing values.
#'
#' Server function called: \code{replaceNaDS}
#' @param x a character string specifying the name of the vector.
#' @param forNA a list or a vector that contains the replacement value(s), for each study.
#' The length of the list or vector must be equal to the number of servers (studies).
#' @param newobj a character string that provides the name for the output object
#' that is stored on the data servers. Default \code{replacena.newobj}.
#' @param datasources a list of \code{\link{DSConnection-class}} objects obtained after login.
#' If the \code{datasources} argument is not specified
#' the default set of connections will be used: see \code{\link{datashield.connections_default}}.
#' @return \code{ds.replaceNA} returns to the server-side a new vector or table structure
#' with the missing values replaced by the specified values.
#' The class of the vector is the same as the initial vector.
#' @author DataSHIELD Development Team
#' @export
#' @examples
#' \dontrun{
#'
#' ## Version 6, for version 5 see the Wiki
#' # Connecting to the Opal servers
#'
#' require('DSI')
#' require('DSOpal')
#' require('dsBaseClient')
#'
#' builder <- DSI::newDSLoginBuilder()
#' builder$append(server = "study1",
#' url = "http://192.168.56.100:8080/",
#' user = "administrator", password = "datashield_test&",
#' table = "CNSIM.CNSIM1", driver = "OpalDriver")
#' builder$append(server = "study2",
#' url = "http://192.168.56.100:8080/",
#' user = "administrator", password = "datashield_test&",
#' table = "CNSIM.CNSIM2", driver = "OpalDriver")
#' builder$append(server = "study3",
#' url = "http://192.168.56.100:8080/",
#' user = "administrator", password = "datashield_test&",
#' table = "CNSIM.CNSIM3", driver = "OpalDriver")
#' logindata <- builder$build()
#'
#' # Log onto the remote Opal training servers
#' connections <- DSI::datashield.login(logins = logindata, assign = TRUE, symbol = "D")
#'
#' # Example 1: Replace missing values in variable 'LAB_HDL' by the mean value
#' # in each study
#'
#' # Get the mean value of 'LAB_HDL' for each study
#' mean <- ds.mean(x = "D$LAB_HDL",
#' type = "split",
#' datasources = connections)
#'
#' # Replace the missing values using the mean for each study
#' ds.replaceNA(x = "D$LAB_HDL",
#' forNA = list(mean[[1]][1], mean[[1]][2], mean[[1]][3]),
#' newobj = "HDL.noNA",
#' datasources = connections)
#'
#' # Example 2: Replace missing values in categorical variable 'PM_BMI_CATEGORICAL'
#' # with 999s
#'
#' # First check how many NAs there are in 'PM_BMI_CATEGORICAL' in each study
#' ds.table(rvar = "D$PM_BMI_CATEGORICAL",
#' useNA = "always")
#'
#' # Replace the missing values with 999s
#' ds.replaceNA(x = "D$PM_BMI_CATEGORICAL",
#' forNA = c(999,999,999),
#' newobj = "bmi999")
#'
#' # Check if the NAs have been replaced correctly
#' ds.table(rvar = "bmi999",
#' useNA = "always")
#'
#' # Clear the Datashield R sessions and logout
#' datashield.logout(connections)
#' }
#'
ds.replaceNA <- function(x=NULL, forNA=NULL, newobj=NULL, datasources=NULL){
# look for DS connections
if(is.null(datasources)){
datasources <- datashield.connections_find()
}
# ensure datasources is a list of DSConnection-class
if(!(is.list(datasources) && all(unlist(lapply(datasources, function(d) {methods::is(d,"DSConnection")}))))){
stop("The 'datasources' were expected to be a list of DSConnection-class objects", call.=FALSE)
}
if(is.null(x)){
stop("Please provide the name of a vector!", call.=FALSE)
}
# check if the input object is defined in all the studies
isDefined(datasources, x)
# check if replacement values have been provided
if(is.null(forNA)){
stop("Please provide a list of replacement values!", call.=FALSE)
}else{
if(length(forNA) != length(datasources)){
stop("'forNA' must be of the same length as the number of datasources/studies!", call.=FALSE)
}
}
for(i in 1:length(datasources)){
# get the number of missing values for each study and if the number of
# replacement values is not 1 and is greater or smaller than the actual
# number of missing values stop the process and tell the analyst
cally <- call("numNaDS", x)
numNAs <- DSI::datashield.aggregate(datasources[i], cally)
if(length(forNA[[i]]) != 1 & length(forNA[[i]]) != numNAs[[1]]){
message("The number of replacement values must be of length 1 or of the same length as the number of missing values.")
stop(paste0("This is not the case in ", names(datasources)[i]), call.=FALSE)
}
}
if(is.null(newobj)){
newobj <- "replacena.newobj"
}
# call the server side function and doo the replacement for each server
for(i in 1:length(datasources)){
message(paste0("--Processing ", names(datasources)[i], "..."))
cally <- paste0("replaceNaDS(", x, paste0(", vectorDS(",paste(forNA[[i]],collapse=","),")"), ")")
DSI::datashield.assign(datasources[i], newobj, as.symbol(cally))
# check that the new object has been created and display a message accordingly
finalcheck <- isAssigned(datasources[i], newobj)
# if the input vector is within a table structure append the new vector to that table
inputElts <- extract(x)
if(!(is.na(inputElts[[1]]))){
ds.dataFrame(c(inputElts[[1]], newobj), newobj = inputElts[[1]], DataSHIELD.checks = FALSE, datasources = datasources[i])
}
}
}