-
Notifications
You must be signed in to change notification settings - Fork 19
/
diff-data.R
200 lines (179 loc) · 8.31 KB
/
diff-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#' Do a data diff
#'
#' Find differences with a reference data set. The diff can be used to \code{\link{patch_data}}, to store the difference
#' for documentation purposes using \code{\link{write_diff}} or to visualize the difference using \code{\link{render_diff}}
#'
#' @example ./examples/datadiff.R
#' @param data \code{data.frame} to check for changes
#' @param data_ref \code{data.frame} reference data frame
#' @param always_show_header \code{logical}
#' Should we always give a table header in diffs? This defaults
#' to TRUE, and - frankly - you should leave it at TRUE for now.
#' @param always_show_order \code{logical}
#' Diffs for tables where row/column order has been permuted may include
#' an extra row/column specifying the changes in row/column numbers.
#' If you'd like that extra row/column to always be included,
#' turn on this flag, and turn off never_show_order.
#' @param columns_to_ignore \code{character}
#' List of columns to ignore in all calculations. Changes
#' related to these columns are ignored.
#' @param count_like_a_spreadsheet \code{logical}
#' Should column numbers, if present, be rendered spreadsheet-style
#' as A,B,C,...,AA,BB,CC? Defaults to TRUE.
#' @param ids \code{character}
#' List of columns that make up a primary key, if known. Otherwise
#' heuristics are used to find a decent key (or a set of decent keys).
#' @param ignore_whitespace \code{logical}
#' Should whitespace be omitted from comparisons. Defaults to FALSE.
#' @param never_show_order \code{logical}
#' Diffs for tables where row/column order has been permuted may include
#' an extra row/column specifying the changes in row/column numbers.
#' If you'd like to be sure that that row/column is *never
#' included, turn on this flag, and turn off always_show_order.
#' @param ordered \code{logical}
#' Is the order of rows and columns meaningful? Defaults to `TRUE`.
#' @param padding_strategy \code{logical}
#' Strategy to use when padding columns. Valid values are "auto",
#' "smart", "dense", and "sparse". Leave null for a sensible default.
#' @param show_meta \code{logical}
#' Show changes in column properties, not just data, if available. Defaults to TRUE.
#' @param show_unchanged \code{logical}
#' Should we show all rows in diffs? We default to showing
#' just rows that have changes (and some context rows around
#' them, if row order is meaningful), but you can override
#' this here.
#' @param show_unchanged_columns \code{logical}
#' Should we show all columns in diffs? We default to showing
#' just columns that have changes (and some context columns around
#' them, if column order is meaningful), but you can override
#' this here. Irrespective of this flag, you can rely
#' on index/key columns needed to identify rows to be included
#' in the diff.
#' @param show_unchanged_meta \code{logical}
#' Show all column properties, if available, even if unchanged.
#' Defaults to FALSE.
#' @param unchanged_column_context \code{integer}
#' When showing context columns around a changed column, what
#' is the minimum number of such columns we should show?
#' @param unchanged_context \code{integer}
#' When showing context rows around a changed row, what
#' is the minimum number of such rows we should show?
#'
#' @return difference object
#'
#' @seealso differs_from
#'
#' @export
diff_data <- function(data_ref,
data,
always_show_header = TRUE,
always_show_order = FALSE,
columns_to_ignore = c(),
count_like_a_spreadsheet = TRUE,
ids = c(),
ignore_whitespace = FALSE,
never_show_order = FALSE,
ordered = TRUE,
padding_strategy = c("auto", "smart", "dense", "sparse"),
show_meta = TRUE,
show_unchanged = FALSE, # rows
show_unchanged_columns = FALSE,
show_unchanged_meta = FALSE,
unchanged_column_context = 1L,
unchanged_context = 1L # rows
)
{
# force padding_strategy to a valid value
padding_strategy <- match.arg(padding_strategy)
if(padding_strategy=='auto') padding_strategy=NULL
# stash object names
source_name <- deparse(substitute(data_ref))
target_name <- deparse(substitute(data ))
# remove specified columns not present in the actual data
cols.either <- union(colnames(data_ref), colnames(data))
ids <- intersect(ids, cols.either)
columns_to_ignore <- intersect(columns_to_ignore, cols.either)
# check for and correct duplicate column names
if(any(duplicated(colnames(data_ref))))
{
colnames(data_ref) <- make.unique(colnames(data_ref))
warning("Column names have been made unique in ", sQuote(source_name), ".")
}
if(any(duplicated(colnames(data))))
{
colnames(data) <- make.unique(colnames(data))
warning("Column names have been made unique in ", sQuote(target_name), ".")
}
ctx <- get_context()
tv <- TableView(ctx, data)
tv_ref <- TableView(ctx, data_ref)
tv_diff <- TableView(ctx)
# add target classes to diff
tv_diff$mode <- sapply(data, storage.mode)
tv_diff$is_factor <- sapply(data, is.factor)
tv_diff$levels <- lapply(data, levels)
# create object to hold options
ctx$eval("cf = new daff.CompareFlags();")
# add scalar options
cf.assign <- function(name, x)
{
ctx$assign("__temp__", x)
ctx$eval( JS(paste0("cf.", name, "=__temp__")) )
}
# add scalar options
cf.assign("always_show_header", always_show_header )
cf.assign("always_show_order", always_show_order ) #!
cf.assign("count_like_a_spreadsheet", count_like_a_spreadsheet)
cf.assign("ignore_whitespace", ignore_whitespace )
cf.assign("never_show_order", never_show_order ) #!
cf.assign("ordered", ordered )
cf.assign("padding_strategy", padding_strategy ) #!
cf.assign("show_meta", show_meta )
cf.assign("show_unchanged", show_unchanged )
cf.assign("show_unchanged_columns", show_unchanged_columns )
cf.assign("show_unchanged_meta", show_unchanged_meta )
cf.assign("unchanged_column_context", unchanged_column_context)
cf.assign("unchanged_context", unchanged_context )
# add vector options
lapply(ids, function(val) ctx$call("cf.addPrimaryKey", val) )
lapply(columns_to_ignore, function(val) ctx$call("cf.ignoreColumn", val) )
# run the diff
diff <- paste0("diff(",tv_ref$var_name,",",tv$var_name,", cf)")
ctx$assign(tv_diff$var_name, JS(diff))
class(tv_diff) <- c("data_diff", class(tv_diff))
# Store summary and flag information for later
summary <- ctx$get(JS(paste0(tv_diff$var_name, ".summary")))
flags <- ctx$get("cf")
# names of the compared objects
summary$source_name = source_name
summary$target_name = target_name
# Textual description of changes to row and column counts
if(summary$row_count_initial == summary$row_count_final)
summary$row_count_change_text <- summary$row_count_initial
else
summary$row_count_change_text <- paste0(summary$row_count_initial, " --> ", summary$row_count_final)
if(summary$col_count_initial == summary$col_count_final)
summary$col_count_change_text <- summary$col_count_initial
else
summary$col_count_change_text <- paste0(summary$col_count_initial, " --> ", summary$col_count_final)
attr(tv_diff, "summary") <- summary
attr(tv_diff, "flags" ) <- flags
tv_diff
}
#' differs from,
#'
#' This is the same function as \code{\link{diff_data}} but with arguments
#' reversed. This is more useful when using \code{dplyr} and \code{magrittr}
#'
#' @param data \code{data.frame} to check for changes
#' @param data_ref \code{data.frame} reference data frame
#' @param ... not further specified
#'
#' @return difference object
#'
#' @seealso diff_data
#'
#' @export
differs_from <- function(data, data_ref, ...){
diff_data(data_ref=data_ref, data=data, ...)
}