-
Notifications
You must be signed in to change notification settings - Fork 3
/
1-clean-merge.Rmd
207 lines (144 loc) · 5.25 KB
/
1-clean-merge.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
---
output:
  html_document:
    toc: true
    toc_float: true
---
# Initial data import
```{r setup_1, include=FALSE}
# Chunk purpose: configure knitr, then load the project's helper code and packages.
# Set the default chunk option so that chunk source code is echoed in the output.
knitr::opts_chunk$set(echo = TRUE)
# This file provides the startup() function, so it must be sourced before the
# startup() call below.
source("R/_startup.R")
# Load desired packages and report any missing packages that should be installed.
# NOTE(review): auto_install = FALSE presumably means missing packages are only
# reported, not installed -- confirm against R/_startup.R.
startup(auto_install = FALSE, verbose = FALSE)
# Load any additional R files in the R/ directory.
ck37r::load_all_code("R", verbose = TRUE)
```
## Import raw data
```{r import_raw}
# Read the raw heart dataset. The explicit "rio::" prefix documents which
# package supplies import() and guarantees that a same-named function from
# another attached package cannot be picked up by accident.
data <- rio::import("data-raw/heart.csv")

# Quick sanity check: size of the table and the column names as imported.
dim(data)
names(data)

# Standardize on lowercase column names; the outer parentheses print the result.
(names(data) <- tolower(names(data)))
```
## Merge data
```{r merge}
# Merge raw dataframes here.
# dplyr::left_join()
```
## Recoding
```{r recode}
# Recode any values here.
```
## Exclusions
Exclude observations if desired.
```{r exclusions}
# Apply any exclusion criteria here.
```
## Categoricals to factors
Ensure that any categorical variables are specified as factors and not numeric/integer variables.
```{r cat_to_factor}
# Convert the listed categorical columns to factors.
# TODO: treat ordinal variables as ordinal rather than categorical.
data <- ck37r::categoricals_to_factors(
  data,
  categoricals = c("sex", "ca", "cp", "slope", "thal"),
  verbose = TRUE
)

# Inspect the updated data frame.
str(data)
```
## Data structure
Specify the outcome variable name, variables excluded from the analysis, and predictor variables (covariates).
```{r data_structure}
# `vars` records the role of each column in the analysis.
vars <- list(
  # Variables to exclude from the analysis, such as ID fields (none so far).
  exclude = NULL,
  # Outcome variable(s) - one, or several (e.g. for sensitivity analyses).
  outcomes = "target",
  # Placeholder; filled in immediately below.
  predictors = NULL
)

# Every column that is neither excluded nor an outcome is a predictor.
# The outer parentheses print the resulting predictor names.
(vars$predictors <- setdiff(names(data), c(vars$exclude, vars$outcomes)))
```
## Extreme value review
```{r extreme_values}
# Possibly recode certain extreme values, especially after reviewing the
# predictor summary results (below).
```
## Remove constant predictors
We don't expect any constant predictors, but it is good to confirm.
```{r remove_constants}
# Count the distinct non-missing values in each predictor.
# vapply() (rather than sapply()) guarantees a named integer vector, even when
# there are no predictors at all.
unique_vals <- vapply(
  data[, vars$predictors, drop = FALSE],
  # Make sure that we don't count NA as a unique value.
  function(col_vals) length(setdiff(unique(col_vals), NA)),
  FUN.VALUE = integer(1L)
)

# Review the distribution of unique-value counts; a minimum below 2 means that
# at least one predictor is constant (or entirely missing).
summary(unique_vals)

# Predictors with fewer than 2 distinct observed values carry no information.
constant_columns <- vars$predictors[unique_vals < 2L]

# Drop those columns from both the data frame and the predictor list.
if (length(constant_columns) > 0L) {
  data <- data[, !names(data) %in% constant_columns, drop = FALSE]
  vars$predictors <- setdiff(vars$predictors, constant_columns)
}

cat("Removed", length(constant_columns), "constant predictors.\n")

# Clean up the temporary helper variable.
rm(constant_columns)
```
## Tutorial-only: add random missingness
Our example dataset has no missingness, so for this tutorial we are artificially adding missingness.
```{r add_missingness}
# Add missingness to certain predictors.
# Number of (row, predictor) draws to set to missing. Both rows and predictors
# are sampled with replacement, so the same cell can be drawn more than once
# and the number of distinct missing cells may be lower than this value.
missing_cells <- 50L

# Fix the seed so that the injected missingness is reproducible.
set.seed(1)

# Randomly sample one row (patient) for each draw.
miss_rows <- sample(nrow(data), missing_cells, replace = TRUE)

# Randomly sample one predictor for each draw.
miss_preds <- sample(vars$predictors, missing_cells, replace = TRUE)

# Pair each sampled row with its sampled predictor.
miss_df <- data.frame(miss_rows, miss_preds, stringsAsFactors = FALSE)

# Set one cell to NA per pair. seq_len() iterates zero times when there are no
# pairs, whereas seq(0) would yield c(1, 0).
for (row_i in seq_len(nrow(miss_df))) {
  # Named `cell` rather than `row` to avoid masking base::row().
  cell <- miss_df[row_i, , drop = FALSE]
  data[cell$miss_rows, cell$miss_preds] <- NA
}

# Confirm that we now have some missing data.
colSums(is.na(data))
```
## Summarize predictors
We will use this to support:
* Review by the team, such as identifying additional outlier cleaning,
* Informing the loss functions used for GLRM interpretation, and
* A possible table in the manuscript (most likely as supplemental information).
```{r predictor_summary}
# Intended summary columns: variable name, type, # of unique values, mode,
# mean, median, min, max, missingness.
# Example group themes: demographic, biomarker, notes, score, clinical history
# (including family).

# Assign each predictor to a named group for the summary table.
vars$groups <- list(
  demo = c("age", "sex"),
  vitals = c("trestbps"),
  exam = c("cp", "thalach", "exang", "thal"),
  labs = c("chol", "fbs"),
  biomarkers = c("restecg", "oldpeak", "slope", "ca")
)

# Note which predictors we consider to be integers (none specified yet).
(vars$integers <- NULL)

# Note which predictors are ordinal (none specified yet).
(vars$ordinal <- NULL)

# Print the full data-structure definition for review.
vars

# Build the predictor summary.
# Could specify integers and ordinal arguments here.
result <- summarize_vars(data, vars = vars$predictors, groups = vars$groups)

# Keep the summary table for later use.
# TODO: output as a kableExtra latex table.
var_df <- result$table

# Export as a spreadsheet.
# TODO: use prettier variable names for this export and the latex table.
rio::export(var_df, file = "tables/predictor-summary-unimputed.xlsx")

# Continue with the data frame returned by summarize_vars().
data <- result$data
```
## Save unimputed dataset {-}
```{r save_unimputed}
# Persist the cleaned data frame together with the `vars` list that defines
# the data structure and the predictor summary table.
save(
  list = c("data", "vars", "var_df"),
  file = "data/clean-merge-unimputed.RData"
)
```