% article_vol3.tex
\documentclass[article,shortnames]{jss}
\usepackage[utf8]{inputenc}
\usepackage{natbib}
\usepackage{pdfpages}
\usepackage{xspace}
\usepackage{array}
\usepackage{tikz}
\usetikzlibrary{shapes.geometric, arrows}
\tikzset{
  io/.style={trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30},
  process/.style={rectangle, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=orange!30},
  decision/.style={diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30},
  arrow/.style={thick,->,>=stealth}
}
\newcommand{\hl}[1]{\textcolor{magenta}{#1}}
\newcommand{\blue}[1]{\textcolor{blue}{#1}}
% Float placement parameters (each was previously set twice; only the
% final, effective values are kept here).
\renewcommand{\topfraction}{0.85}
\renewcommand{\bottomfraction}{0.85}
\renewcommand{\textfraction}{0.15}
\renewcommand{\floatpagefraction}{0.7}
% Does not work:
%\newcommand{\R}{\proglang{R}\xspace}
\newcommand{\R}[1]{\code{#1}}
\newcolumntype{L}[1]{>{\raggedright\let\newline\\\arraybackslash\hspace{0pt}}p{#1}}
\newcolumntype{C}[1]{>{\centering\let\newline\\\arraybackslash\hspace{0pt}}p{#1}}
\newcolumntype{R}[1]{>{\raggedleft\let\newline\\\arraybackslash\hspace{0pt}}p{#1}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% almost as usual
\author{Anne Helby Petersen\\Section of Biostatistics\\Department of Public
Health\\University of Copenhagen \And Claus Thorn Ekstr\o m\\Section of Biostatistics\\Department of Public
Health\\University of Copenhagen}
\title{\pkg{dataMaid}: your assistant for documenting supervised data quality screening in \proglang{R}}
%% for pretty printing and a nice hypersummary also set:
\Plainauthor{Anne Helby Petersen, Claus Thorn Ekstr\o m} %% comma-separated
\Plaintitle{{dataMaid}: your assistant for documenting supervised data quality screening in R} %% without formatting
\Shorttitle{\pkg{dataMaid}: your assistant for data documentation in R} %% a short title (if necessary)
%% an abstract and keywords
\Abstract{Data cleaning and validation are important steps in any
data analysis, as the validity of the conclusions from the analysis
hinges on the quality of the input data. Mistakes in the data can
arise for any number of reasons, including erroneous codings,
malfunctioning measurement equipment, and inconsistent data generation
manuals. Ideally, a human investigator should go
through each variable in the dataset and look for potential errors
--- both in input values and codings --- but that process can be very
time-consuming, expensive and error-prone in itself.
We describe an \proglang{R} package, \pkg{dataMaid}, which
implements an extensive and customizable suite of quality
assessment aids that can be applied to a dataset in order to
identify potential problems in its variables. The results are
presented in an auto-generated, non-technical, stand-alone overview
document intended to be perused by an investigator with an
understanding of the variables in the data, but not necessarily
knowledge of \proglang{R}. Thereby, \pkg{dataMaid} aids the dialogue
between data analysts and field experts, while also providing easy
documentation of reproducible data quality screening. Moreover, the
\pkg{dataMaid} solution changes the data screening process from the
usual ad hoc approach to a systematic, well-documented endeavor.
\pkg{dataMaid} also provides a suite of more typical \proglang{R}
tools for interactive data quality assessment and screening, where
the data inspections are executed directly in the \proglang{R}
console.
% The \pkg{dataMaid} package is designed to be easily extended with
% custom user-created checks that are relevant in particular
% situations. \hl{Already said that above. Either expand on it or
% delete this.}
}
\Keywords{data screening, data cleaning, quality control, \proglang{R}, data documentation}
\Plainkeywords{data screening, data cleaning, quality control, R, data documentation} %% without formatting
%% at least one keyword must be supplied
%% publication information
%% NOTE: Typically, this can be left commented and will be filled out by the technical editor
%% \Volume{50}
%% \Issue{9}
%% \Month{June}
%% \Year{2012}
%% \Submitdate{2012-06-04}
%% \Acceptdate{2012-06-04}
%% The address of (at least) one author should be given
%% in the following format:
\Address{
Claus Thorn Ekstr\o m\\
Section of Biostatistics, Department of Public Health\\
University of Copenhagen\\
Denmark\\
E-mail: \email{ekstrom@sund.ku.dk}\\
URL: \url{http://staff.pubhealth.ku.dk/~ekstrom/}
}
%% It is also possible to add a telephone and fax number
%% before the e-mail in the following format:
%% Telephone: +43/512/507-7103
%% Fax: +43/512/507-2851
%% for those who use Sweave please include the following line (with % symbols):
%% need no \usepackage{Sweave.sty}
%% end of declarations %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\section{Introduction}
Though data cleaning might be regarded as a somewhat tedious activity,
adequate data cleaning is crucial in any data analysis. With
ever-growing dataset sizes and complexities, statisticians and data
analysts find themselves spending a large portion of their time on
data cleaning and data wrangling. While a computer should generally not
make unsupervised decisions on what should be done to potential
errors in a dataset, it can still be an extremely useful tool in the
data cleaning process. Some errors can be tracked down and flagged by a
computer without further ado, while other types of errors need a subject
context in order to be identified. Even in this latter case, well-designed
software can aid the process tremendously by providing the human
investigator with the information needed for identifying issues.
But even when tools are available for identifying problems in a
dataset, the activity of data cleaning still suffers from a challenge
that has received increasing attention in the scientific communities
in recent years: Data cleaning is not very straightforward to
document and, therefore, reproducibility suffers. We present a new
\proglang{R} package, \pkg{dataMaid} \citep{dataMaid}, whose most
central purpose is to facilitate a supervised data quality screening
workflow where documentation is thoroughly integrated rather than an
add-on. This is accomplished by structuring the data screening around
auto-generated data overview reports that summarize and flag
potential problems in the dataset.
But no matter how clever the software tools we make, data cleaning
remains a time-consuming endeavor, which inherently requires human
interaction since every dataset is different and the variables in the
dataset can only be understood in the proper context of their
origin. This often requires a collaborative effort between an expert
in the field and a statistician or data scientist. In many
situations, these errors are discovered in the process of the data
analysis (e.g., a categorical variable with numeric labels for each
category may be wrongly classified as a quantitative variable or a
variable where all values have erroneously been coded to the same
value), but in other cases a human with knowledge about the data
context area is needed to identify possible mistakes in the data
(e.g., if there are 4 categories for a variable that should only have
3).
The \pkg{dataMaid} approach to data screening, quality assessment and
documentation is governed by two fundamental paradigms. First of all,
there is no need for data cleaning to be an ad hoc procedure. Often,
we have a very clear idea of what flags are raisable in a given
dataset before we look at it, as we were the ones to produce it in the
first place. This means that data cleaning can easily be a
well-documented, well-specified procedure. In order to aid this
paradigm, \pkg{dataMaid} provides easy-to-use, automated tools for
data quality assessment in \proglang{R} \citep{R} on which data
cleaning decisions can be made. This quality assessment is presented
in an auto-generated overview document, readable by data analysts and
field experts alike, thereby also contributing to an inter-field
dialogue about the data at hand. Oftentimes, e.g., distinguishing
between faulty codings of a numeric value and unusual, but correct,
values requires problem-specific expertise that might not be held by
the data analyst. Hopefully, having easy access to data summaries
through \pkg{dataMaid} will help this necessary knowledge sharing.
While \pkg{dataMaid}'s primary raison d'être is auto-generating data
quality assessment overview documents, we still wish to emphasize that
it is \emph{not} a tool for unsupervised data cleaning. This qualifies
as the second paradigm of \pkg{dataMaid}: Data cleaning decisions
should always be made by humans. Therefore, \pkg{dataMaid} does not
supply any tools for ``fixing'' errors in the data. However, we do
provide interactive functions that can be used to identify potentially
erroneous entries in a dataset and that can make it easier to solve
data issues, one variable at a time.
A number of \proglang{R} packages made for other pre-analysis steps
are already available, including \pkg{janitor} \citep{janitor},
\pkg{assertive} \citep{assertive}, \pkg{dplyr} \citep{dplyr},
\pkg{tidyr} \citep{tidyr}, \pkg{data.table} \citep{data.table},
\pkg{DataCombine} \citep{DataCombine}, \pkg{validate}
\citep{validate}, and \pkg{assertr} \citep{assertr}. These packages
focus on different stages of the pre-analysis work. \pkg{janitor}
provides tools for data import with a particular emphasis on the
challenges of getting neat data frames from Microsoft Excel data
files. \pkg{dplyr}, \pkg{tidyr}, \pkg{data.table} and
\pkg{DataCombine} go a few steps further by providing a wide array of
extremely powerful tools for data wrangling, including a number of
particularly useful functions for merging and working with very large
datasets. When it comes to actual data cleaning, however, the options
are fewer. \pkg{validate} (and the similar packages \pkg{editrules}
\citep{editrules} and \pkg{deducorrect} \citep{deducorrect} from the
same authors) and \pkg{assertive} offer tools for identifying errors
in a dataset by checking the state of the variable given a set of
pre-specified rules, and their focus is on internal validity rather
than general data screening. In practice, this means that quite elegant
tools for, e.g., linear restraints among the variables in a dataset
can be applied, but looking for potentially miscoded missing values is
not really feasible. The main difference between these two challenges
is the direction in which the data is inspected: While linear
constraints work observation-wise with no ambiguity, determining
whether or not something is a miscoded missing value often requires
knowledge about the full variable (e.g. range or data type), and thus
it should be performed variable-wise. \pkg{validate} does not
currently allow for user-defined extensions of the latter type,
thereby limiting its data cleaning potential. Automatic data
correction functions are also provided by \pkg{validate} which we
consider to be quite a dangerous cocktail: all power is given to
the computer with no human supervision, and investigators are less
likely to make an active, case-specific choice regarding the handling
of the potential errors. Finally, no tools have been made to easily
document exactly which checks and preliminary results were used in the
data cleaning process. The \pkg{assertr} package provides very similar
--- and very nice --- tools to those of \pkg{validate}, but without
any ambitions of conducting auto-cleaning.
%All in all, the large role of data cleaning in any data analysts
%everyday endeavors is hardly matched in the amount of available
%\proglang{R} software solutions. In particular, few packages attempt
%to implement systematic, reproducible data cleaning. And while the
%available tools attempt to alleviate the ubiquitous ad hoc approach to
%data cleaning, they are primarily intended for the data savvy users
%and less so for the general researcher with a knowledge about a
%specific field and the context of the available data. The
%\pkg{dataMaid} package \citep{dataMaid} presented here tries to address this by
%providing a framework that both allows for extendable, systematic,
%reproducible data cleaning, and summarizing findings for researchers
%from other fields such that they can act as human experts when
%tracking down potential errors.
% I deleted this: I think it is covered well enough by the new intro section before the presentation of the other packages, and the introduction has not become longer overall.
One last package that should be mentioned in this context is
\pkg{DataExplorer} \citep{DataExplorer}. While this package does not
address data cleaning issues \emph{per se}, its general strategy is
quite similar to that of \pkg{dataMaid} and to the paradigms presented
above. This package provides a few simple, but practical, tools for
exploratory data analysis, including automated
documentation. Therefore, we find \pkg{DataExplorer} to be a good
candidate for a next-step package after data cleaning is finished.
This manuscript is structured as follows: First, in Section
\ref{sec:usingdataMaid}, we introduce the main representative of the first
paradigm, namely the \code{makeDataReport()} function, which generates data
overview documents. In the \pkg{dataMaid} package, we have
provided a number of default generic checks that cover the data
cleaning challenges we find to be most common and these are also summarized in Section \ref{sec:usingdataMaid}. Next, in Section
\ref{sec:interactiveCleanR}, we present the interactive mode of \pkg{dataMaid}, as motivated
by the second paradigm above. Next, we show step-by-step how the data report mode and the interactive mode of \pkg{dataMaid} can be combined to conduct a well-documented, systematic data cleaning in Section \ref{sec:bigExample}. Here, we assess and clean a dirty dataset with information about the US presidents. At last, in Section \ref{sec:specificExamples}, we discuss a number of examples of
specific data cleaning and documentation challenges and how \pkg{dataMaid} can be used to
solve them.
%Please note that this description of what \pkg{dataMaid} contains is
%usefully supplemented by a tutorial into how it can be extended. Every
%dataset is different, and some datasets might include problems that
%cannot be detected by our data checking functions.
%Therefore,
\pkg{dataMaid} was designed to be easily extended with user-supplied
functions for summarizing, visualizing and checking data. In the package, we have
provided a vignette in which we describe how \pkg{dataMaid} extensions
can be made, such that they integrate with the
\code{makeDataReport()} function and with the other tools available in
\pkg{dataMaid}.
%% include your article here, just as usual
%% Note that you should use the \pkg{}, \proglang{} and \code{}
%% commands.
\section{Creating a data overview report}
\label{sec:usingdataMaid}
The \code{makeDataReport()} function is the primary workhorse of
\pkg{dataMaid} and it is the only function needed to generate a data
report using the standard battery of tests. The data report itself is
an overview document, intended for reading by humans, in either pdf,
html or word (.docx) format. Appendix \ref{sec:appendix1} provides an
example of a data report, produced by calling \code{makeDataReport()}
on the dataset \code{toyData} available in \pkg{dataMaid}. The first
two pages (excluding the front page) of this data report are shown in
Figure~\ref{fig:example1} and the following two pages are shown in
Figure~\ref{fig:example2}. \code{toyData} is a very small ($15$
observations of $6$ variables), artificial dataset which was created
with a lot of potential errors to illustrate the main capabilities of
\pkg{dataMaid}. Section~\ref{sec:bigExample} shows an example of a data screening
process with a real dataset. The following commands load the dataset
and produce the report:
\begin{figure}[tb]
\begin{center}
\frame{\includegraphics[width=7.5cm,page=2]{dataMaid_toyData_appA.pdf}}
\frame{\includegraphics[width=7.5cm,page=3]{dataMaid_toyData_appA.pdf}}
%\includepdf[pages={2}, pagecommand={}]{dataMaid_testData.pdf}
\end{center}
\caption{The first two pages of the report created by running \code{makeDataReport()} on the \code{toyData}
dataset. First, a summary of the full dataset is given along with an overview of what checks were performed. Next, a summary of all the variables and whether or not they are problematic is provided. Larger versions of the pages can be seen in
Appendix~\ref{sec:appendix1}.}
\label{fig:example1}
\end{figure}
\begin{figure}[tb]
\begin{center}
\frame{\includegraphics[width=7.5cm,page=4]{dataMaid_toyData_appA.pdf}}
\frame{\includegraphics[width=7.5cm,page=5]{dataMaid_toyData_appA.pdf}}
%\includepdf[pages={2}, pagecommand={}]{dataMaid_testData.pdf}
\end{center}
\caption{The third and fourth pages of the report created by running \code{makeDataReport()} on the \code{toyData}
dataset. Here, we see a description of each variable in the dataset, consisting of a summary table, a visualization and an indication of what problems were flagged for the variable (if any). At last, a few lines of metadata about the \code{makeDataReport()} are included for enhancing reproducibility. Larger versions of the pages can be seen in
Appendix~\ref{sec:appendix1}.}
\label{fig:example2}
\end{figure}
\begin{Schunk}
\begin{Sinput}
R> library("dataMaid")
R> data("toyData")
R> toyData
\end{Sinput}
\begin{Soutput}
# A tibble: 15 x 6
pill events region change id spotifysong
<fctr> <dbl> <fctr> <dbl> <fctr> <fctr>
1 red 1 a -0.6264538 1 Irrelevant
2 red 1 a 0.1836433 2 Irrelevant
3 red 1 a -0.8356286 3 Irrelevant
4 red 2 a 1.5952808 4 Irrelevant
5 red 2 a 0.3295078 5 Irrelevant
6 red 6 b -0.8204684 6 Irrelevant
7 red 6 b 0.4874291 7 Irrelevant
8 red 6 b 0.7383247 8 Irrelevant
9 red 999 c 0.5757814 9 Irrelevant
10 red NA c -0.3053884 10 Irrelevant
11 blue 4 c 1.5117812 11 Irrelevant
12 blue 82 . 0.3898432 12 Irrelevant
13 blue NA -0.6212406 13 Irrelevant
14 <NA> NaN other -2.2146999 14 Irrelevant
15 <NA> 5 OTHER 1.1249309 15 Irrelevant
\end{Soutput}
\end{Schunk}
\begin{Schunk}
\begin{Sinput}
R> makeDataReport(toyData)
\end{Sinput}
\end{Schunk}
By default, an \proglang{R} markdown file and a rendered pdf, word or html
overview document is produced, saved to the working directory and
opened for immediate inspection. Such a data report consists of three
parts, two of which are presented in Figure~\ref{fig:example1}. First,
an overview of what was done is presented under the title \textit{Data
report overview}. Secondly, an index listing each variable along
with an indication of whether it was found to be problematic or not is
provided. Thirdly, as seen in Figure~\ref{fig:example2}, each variable
in the dataset is presented in turn using (up to) three tools in the
\textit{Variable list}: A table summarizing key features of the
variable, a figure visualizing its distribution when relevant and a list of flagged
issues, if any. For instance, as shown in Figure~\ref{fig:example2},
for the \code{numeric}-type variable \code{events} from
\code{toyData}, \code{makeDataReport()} has identified two values that
are suspected to be miscoded missing values (\code{999} and
\code{NaN}), while two values were also flagged as potential outliers
that should be investigated more carefully.
The arguments to \code{makeDataReport()} can be used to modify the
contents and the look of the data report according to the user's
needs. The most commonly used arguments are summarized in
Table~\ref{table.cleanFormals} and they are grouped according to the
part of the data assessment and report generation they influence. In
order to understand this distinction, a glimpse of the inner structure
of \code{makeDataReport()} is shown in
Figure~\ref{figure:cleanStructure}. Below, we present a few examples
on how to use the arguments from Table \ref{table.cleanFormals} to
influence the output of a \code{makeDataReport()} call.
\begin{table}
\small
\begin{tabular}{p{0.25\linewidth}p{0.45\linewidth}p{0.2\linewidth}}
\hline
Argument & Description & Default value \\
\hline
\smallskip Control input variables, looks and meta information\\
\quad \code{useVar} & What variables should be used? & \code{NULL} (corresponding to all variables) \\
\quad \code{ordering} & Ordering of the variables in the data summary (as is or alphabetical) & \code{"asIs"} \\
\quad \code{onlyProblematic} & Should only variables flagged as problematic be included in the \textit{Variable list}? & \code{FALSE} \\
%\quad \code{listChecks} & Should an overview of what checks were performed be listed in the \textit{Data report overview}? & \code{TRUE} \\
% deleted to make room for new arguments
\quad \code{preChecks} & What check functions should be called to determine whether a variable is suitable for summarization, visualization and checking? & \code{c("isKey", "isSingular", "isSupported")} \\
\quad \code{reportTitle} & What should the title displayed on the front page of the report be? & \code{NULL} (corresponds to the dataset name) \\
\quad \code{twoCol} & Should the summary table and visualizations be placed side-by-side (in two columns)? & \code{TRUE} \\
\smallskip Control summarize, visualize, and check steps \\
\quad \code{summaries} & What summaries should be performed for each variable type? & See Table~\ref{table.SVCfunctions} \\
\quad \code{visuals} & What type of visualization should be provided for each variable type? & See Table~\ref{table.SVCfunctions} \\
\quad \code{checks} & What checks should be applied to each variable type? & See Table~\ref{table.SVCfunctions} \\
\quad \code{mode} & What steps should be performed for each variable (out
of the three possibilities \textit{summarize},
\textit{visualize}, \textit{check})? &
\code{c("summarize", "visualize", "check")} \\
\quad \code{smartNum} & Should numerical values with only a few unique
levels be flagged and treated as a factor variable? & \code{TRUE} \\
\quad \code{maxProbVals} & Maximum number of problematic values to print, if any are found in data checks & \code{10} \\
\quad \code{maxDecimals} & Maximum number of decimals to print for numeric values in the variable list & \code{2} \\
\quad \code{treatXasY} & How should non-supported variable classes be handled? & \code{NULL} (no handling) \\
%Takes a list on the format \code{list(X = Y)} where \code{X} is a non-supported variable class (e.g. \code{complex}) and \code{Y} is a supported one (e.g. \code{numeric}) & \code{NULL} (No rules for handling non-supported variable types) \\
\smallskip Control output and post-processing \\
\quad \code{output} & Type of output file to be produced (html, word (.docx) or pdf) & \code{NULL} (use pdf if \LaTeX\ is found, otherwise Word (if on Windows), or html)\\
\quad \code{render} & Should the output file be rendered from markdown? & \code{TRUE} \\
\quad \code{openResult} & If a pdf/html file is rendered, should it
automatically open afterwards, and if not,
should the \code{rmarkdown} file be opened? & \code{TRUE} \\
\quad \code{replace} & Overwrite an existing file with the same name? & \code{FALSE} \\
\quad \code{vol} & Add a suffix to the file name of the outputted report & \code{""} (no suffix)\\
\hline
\end{tabular}
\caption{A selection of commonly used arguments to \code{makeDataReport()} separated into the parts they control.}
\label{table.cleanFormals}
\end{table}
\begin{figure}[tb]
% Define block styles (scoped to this figure)
\tikzset{
  decision/.style={diamond, draw, fill=blue!20, text width=5.5em, text badly centered, node distance=3cm, inner sep=0pt},
  block/.style={rectangle, draw, fill=blue!20, text width=6em, text centered, rounded corners, minimum height=4em},
  line/.style={draw, -latex'},
  cloud/.style={draw, ellipse, fill=red!20, node distance=3cm, text width=5em, minimum height=2em}
}
\begin{center}
\begin{tikzpicture}[node distance = 2cm, auto,thick,scale=0.75, every node/.style={transform shape}]
% Place nodes
\node [block] (init) {Get next variable and run pre-checks};
\node [cloud, above of=init] (input) {Input \code{data.frame} or \code{tibble}};
% \node [cloud, right of=init] (system) {system};
\node [decision, right of=init, node distance=4cm] (precheck) {Is variable suitable for inclusion};
\node [block, right of=precheck, node distance=4cm] (summarize)
{Run \code{summarize()} to produce summary table};
\node [block, below of=summarize, node distance=3cm] (visualize)
{Run \code{visualize()} to plot variable};
\node [block, below of=visualize, node distance=2.5cm] (check)
{Call \code{check()} to run error checks};
\node [decision, below of=check, node distance=2.7cm] (done) {More
variables?};
\node [block, right of=done, node distance=3.5cm] (stop) {Write
\proglang{R} markdown file};
\node [cloud, below of=stop, node distance=3.5cm] (render) {Render
markdown and possibly open};
% Draw edges
\path [line] (summarize) -- (visualize);
\path [line] (visualize) -- (check);
\path [line] (check) -- (done);
\path [line] (done.south) -- +(0,-10pt) -| node [near start] {yes} (init);
% \path [line] (identify) -- (evaluate);
% \path [line] (evaluate) -- (decide);
% \path [line] (init) -| node [near start] {yes} (precheck);
\path [line] (init) -- (precheck);
\path [line] (precheck) -- node [near start] {yes} (summarize);
\path [line] (precheck) |- node [near start] {no} (done);
\path [line] (done) -- node [near start] {no} (stop);
% \path [line] (update) |- (identify);
% \path [line] (decide) -- node {no}(stop);
\path [line,dashed] (input) -- (init);
\path [line] (stop) -- (render);
% \path [line,dashed] (system) -- (init);
% \path [line,dashed] (system) |- (evaluate);
\end{tikzpicture}
\end{center}
\caption{Schematic illustration of the stages undertaken when running
\code{makeDataReport()}. Each variable is checked for eligibility before
running \code{summarize()}, \code{visualize()}, and \code{check()}, and the
resulting \proglang{R} markdown file may be rendered and opened.}
\label{figure:cleanStructure}
\end{figure}
% \subsection{Controlling [something]}
% \label{subsection:controlSomething}
\subsection{Dusting off the arguments}
We begin with an example that is intended as an illustration of how
\code{makeDataReport()} might be used in the very first stages of data
cleaning, when we are uncertain about the complexities of the
errors and how much time should be allocated to data cleaning. At
this stage, what is really needed, is a very rough idea of the
severity of errors in the dataset. In this scenario, we might wish to
obtain a summary document in html format that only contains the
variables with potential problems, and with a limit of, say, maximum
2 printed potential problematic values per check for each variable. Also, we can add the
argument \code{replace = TRUE} in order to force \code{makeDataReport()} to
overwrite any existing files produced by \code{makeDataReport()}. Using the
\code{toyData} dataset as a guinea pig, we type:
\begin{Schunk}
\begin{Sinput}
R> makeDataReport(toyData, output = "html", onlyProblematic = TRUE,
+ maxProbVals = 2, replace = TRUE)
\end{Sinput}
\end{Schunk}
The final rendering of the generated markdown file is controlled by
the \code{render} and \code{openResult} arguments, which both default to
\code{TRUE}. \code{render} determines if the \proglang{R} markdown file
produced should be rendered using the \pkg{rmarkdown} \citep{rmarkdown} package and
\code{openResult} decides whether the outputted file should be
opened. The following command produces an \proglang{R} markdown file
containing the information needed for generating a data report, but without
rendering nor opening the markdown file:
\begin{Schunk}
\begin{Sinput}
R> makeDataReport(toyData, output="html", render=FALSE,
+ openResult=FALSE, replace=TRUE)
\end{Sinput}
\end{Schunk}
\begin{table}
\centering
\begin{tabular}{p{0.35\linewidth} p{0.3\linewidth} p{0.01\linewidth} p{0.01\linewidth} p{0.01\linewidth} p{0.01\linewidth} p{0.01\linewidth}
p{0.01\linewidth} p{0.01\linewidth}}
\hline
& Description & \multicolumn{7}{c}{Variable classes} \\ \smallskip
& & C & F & I & L & B & N & D\\
\hline \smallskip
\textbf{\code{summaryFunction}s} \smallskip \\
\quad \code{centralValue} & Compute median for numeric variables, mode for categorical variables & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} \\
\quad \code{countMissing} & Compute proportion of missing observations & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} \\
\quad \code{minMax} & Find minimum and maximum values & & & \blue{$\times$} & & & \blue{$\times$} & \blue{$\times$} \\
\quad \code{quartiles} & Compute 1st and 3rd quartiles & & & \blue{$\times$} & & & \blue{$\times$} & \blue{$\times$} \\
\quad \code{uniqueValues} & Count number of unique values & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} \\
\quad \code{variableType} & Data class of variable & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} \\
\smallskip \\
\textbf{\code{visualFunction}s} \smallskip \\
\quad \code{basicVisual} & Histograms and barplots using base \proglang{R} graphics & $\times$ & $\times$ & $\times$ & $\times$ & $\times$ & $\times$ & $\times$ \\
\quad \code{standardVisual} & Histograms and barplots using \pkg{ggplot2} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} \\
\smallskip \\
\textbf{\code{checkFunction}s} \smallskip \\
\quad \code{identifyCaseIssues} & Identify case issues & \blue{$\times$} & \blue{$\times$} & & \blue{$\times$} & & & \\
\quad \code{identifyLoners} & Identify levels with $<$ 6 obs. & \blue{$\times$} & \blue{$\times$} & & \blue{$\times$} & & & \\
\quad \code{identifyMissing} & Identify miscoded missing values & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \\
\quad \code{identifyNums} & Identify misclassified numeric or integer variables & \blue{$\times$} & \blue{$\times$} & & \blue{$\times$} & & & \\
\quad \code{identifyOutliers} & Identify outliers & & & \blue{$\times$} & & \blue{$\times$} & \blue{$\times$} \\
\quad \code{identifyOutliersTBStyle} & Identify outliers (Tukey boxplot style) & & & $\times$ & & $\times$ & $\times$ \\
\quad \code{identifyWhitespace} & Identify prefixed and suffixed white space & \blue{$\times$} & \blue{$\times$} & & \blue{$\times$} & & & \\
\quad \code{isCPR} & Identify Danish CPR numbers & $\times$ & $\times$ & $\times$ & $\times$ & $\times$ & $\times$ &$\times$ \\
\quad \code{isSingular} & Check if the variable contains only a single value & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} \\
\quad \code{isKey} & Check if the variable is a key & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} \smallskip \\
\quad \code{isSupported} & Check if the variable is among the supported variable types & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} & \blue{$\times$} \smallskip \\
\hline
\end{tabular}
\caption{Overview of all summary-, visual- and check functions currently implemented in \pkg{dataMaid}. The variable
classes C, F, I, L, B, N, and D, refer to \code{character}, \code{factor},
\code{integer}, \code{labelled}, \code{logical} (boolean), \code{numeric}, and \code{Date}, respectively. The default settings of \code{makeDataReport()} are marked in blue.}
\label{table.SVCfunctions}
\end{table}
\subsection{Controlling contents through summaries, visualizations and checks}
\pkg{dataMaid} works through three different steps --- summarize,
visualize, and check (SVC) --- for each variable in the dataset
(illustrated in Figure \ref{figure:cleanStructure}). Three different
types of functions are used to perform these steps, namely
\code{summaryFunction}s, \code{visualFunction}s and
\code{checkFunction}s. By default, \code{makeDataReport()} runs
selected summary, visualization and check functions on each variable
in the dataset, and the exact choice of these functions depends on the
classes of the variables. For instance, detection of outlier values
might be interesting for numerical variables, but it holds little
meaning for factor variables, and therefore, numerical and factor
variables need different checks. Table~\ref{table.SVCfunctions} lists
all available summarize/visualize/check functions, but we can also use
the \code{allSummaryFunctions()}, \code{allVisualFunctions()}, and
\code{allCheckFunctions()} functions in \pkg{dataMaid} to print
overview lists in \proglang{R}. For example, the implemented
\code{summaryFunction}s are:
%\subsection{Something about what check, visual and summary functions are available}
\begin{Schunk}
\begin{Sinput}
R> allSummaryFunctions()
\end{Sinput}
\begin{Soutput}
----------------------------------------------------------------------------
name description classes
-------------- ------------------------------- -----------------------------
centralValue Compute median for numeric character, Date, factor,
variables, mode for integer, labelled, logical,
categorical variables numeric
countMissing Compute proportion of missing character, Date, factor,
observations integer, labelled, logical,
numeric
minMax Find minimum and maximum integer, numeric, Date
values
quartiles Compute 1st and 3rd quartiles Date, integer, numeric
uniqueValues Count number of unique values character, Date, factor,
integer, labelled, logical,
numeric
variableType Data class of variable character, Date, factor,
integer, labelled, logical,
numeric
----------------------------------------------------------------------------
\end{Soutput}
\end{Schunk}
Thus we can see, for example, that for \code{numeric}, \code{integer},
and \code{Date} variables, \pkg{dataMaid} provides functions for
adding summary information about the minimum and maximum values, while
all seven variable classes dealt with in \pkg{dataMaid} have functions
for central tendency summaries (i.e., mode or median).
We can control what summaries and checks are applied for each variable type
through the \code{summaries}, \code{visuals} and \code{checks} arguments of \code{makeDataReport()}. Each of these arguments takes a list with one entry for each variable type and a number of function names for each such entry. The easiest way to specify the arguments is by use of the built-in helper functions \code{setSummaries()}, \code{setVisuals()} and \code{setChecks()} that contain the default settings of \code{makeDataReport()} and simple syntaxes for making small alterations of these default settings. We can inspect the default settings for summaries by calling:
\begin{Schunk}
\begin{Sinput}
R> setSummaries()
\end{Sinput}
\begin{Soutput}
$character
[1] "variableType" "countMissing" "uniqueValues" "centralValue"
$factor
[1] "variableType" "countMissing" "uniqueValues" "centralValue"
$labelled
[1] "variableType" "countMissing" "uniqueValues" "centralValue"
$numeric
[1] "variableType" "countMissing" "uniqueValues" "centralValue"
[5] "quartiles" "minMax"
$integer
[1] "variableType" "countMissing" "uniqueValues" "centralValue"
[5] "quartiles" "minMax"
$logical
[1] "variableType" "countMissing" "uniqueValues" "centralValue"
$Date
[1] "variableType" "countMissing" "uniqueValues" "centralValue"
[5] "minMax" "quartiles"
\end{Soutput}
\end{Schunk}
This helper function really just calls several other helper functions, namely the \\
\code{defaultXXXSummaries()} functions, where \code{XXX} refers to a variable class. For instance, we can see the default character summaries by calling \code{defaultCharacterSummaries()}:
\begin{Schunk}
\begin{Sinput}
R> defaultCharacterSummaries()
\end{Sinput}
\begin{Soutput}
[1] "variableType" "countMissing" "uniqueValues" "centralValue"
\end{Soutput}
\end{Schunk}
We can change the choice of summaries (and similarly the checks and visual functions) by setting the
corresponding arguments when calling \code{makeDataReport()}. For example, to get
only the variable type and the central tendency listed in the summary
table for numeric and integer variables, we write
\begin{Schunk}
\begin{Sinput}
R> makeDataReport(toyData, replace=TRUE,
+ summaries = setSummaries(numeric = c("variableType","centralValue"),
+ integer = c("variableType", "centralValue")))
\end{Sinput}
\end{Schunk}
In the case where we specify the same set of summary
functions for each variable type, we can use a simpler argument for \code{setSummaries} which overrides the summary functions for all
variable types:
\begin{Schunk}
\begin{Sinput}
R> makeDataReport(toyData, replace=TRUE,
+ summaries = setSummaries(all = c("variableType", "centralValue")))
\end{Sinput}
\end{Schunk}
Similarly, the checks applied are set with the \code{checks} argument and the \code{setChecks} function. The default checks being applied to a factor are
\begin{Schunk}
\begin{Sinput}
R> defaultFactorChecks()
\end{Sinput}
\begin{Soutput}
[1] "identifyMissing" "identifyWhitespace" "identifyLoners"
[4] "identifyCaseIssues" "identifyNums"
\end{Soutput}
\end{Schunk}
Now, if we only wanted to apply the function to identify white space
for factor variables, then we would need to provide this information to \code{setChecks()}:
\begin{Schunk}
\begin{Sinput}
R> makeDataReport(toyData, replace=TRUE,
+ checks = setChecks(factor = "identifyWhitespace"))
\end{Sinput}
\end{Schunk}
or we could remove checks for factors altogether by setting the
corresponding argument to \code{NULL}, in which case factor variables will
not be checked for any potential errors:
\begin{Schunk}
\begin{Sinput}
R> makeDataReport(toyData, checks = setChecks(factor = NULL), replace=TRUE)
\end{Sinput}
\end{Schunk}
As with \code{summaryFunction}s, a complete list of available
\code{checkFunction}s is obtained by calling
\code{allCheckFunctions()}. Note however, that \code{checkFunction}s have a
usage beyond the \code{checks} arguments, namely in the
\textit{pre-check} stage. In this stage, it is determined whether or
not each variable is suitable for the summarize/visualize/check (SVC)
steps. The functions used in the pre-check stage should be
\code{checkFunction}s that are applicable to all variable classes. The
default pre-checks, the functions \code{isKey()}, \code{isSingular()} and \code{isSupported()}, check
whether a variable has unique values for all observations, only a
single value for all observations, and is not among the variable types supported by \pkg{dataMaid}, respectively. If one of these
statements is true, the variable will not be subjected to the SVC
steps. We can allow singular variables to move on to the SVC step by
only checking for keys and non-supported variables in the pre-check step:
\begin{Schunk}
\begin{Sinput}
R> makeDataReport(toyData, preChecks = c("isKey", "isSupported"),
+ replace=TRUE)
\end{Sinput}
\end{Schunk}
Note that the data visualizations in the report are also
controllable, though only a single function can be provided for each variable type. If, for instance, we wish to change the visualizations
from the default \pkg{ggplot2} \citep{ggplot2} style histograms and barplots to base
\proglang{R} histograms and barplots, we type
\begin{Schunk}
\begin{Sinput}
R> makeDataReport(toyData, visuals = setVisuals(all = "basicVisual"),
+ replace=TRUE)
\end{Sinput}
\end{Schunk}
In summary, and as indicated in Figure~\ref{figure:cleanStructure}, there are two stages
where \code{makeDataReport()} applies functions to each of the variables:
\begin{enumerate}
\item In the pre-check stage.
\item As part of the summarize/visualize/check (SVC) steps.
\end{enumerate}
Each of these stages is controllable using appropriate function
arguments in \code{makeDataReport()}, and above we have shown examples
of how to tweak them to modify the data cleaning outputs. However, if
the dataset at hand requires new, additional checks, then more control
is needed. The package contains a vignette that explains the details of how to modify and
expand the possibilities by producing new summary, visual, and check
functions.
One might also encounter datasets with variables that are not among the 7 classes mentioned in the above (\code{character}, \code{Date}, \code{factor}, \code{integer}, \code{labelled}, \code{logical} and \code{numeric}), for instance variables of type \code{complex} or user-defined classes. It is possible to tell \code{makeDataReport()} how to handle such variables by use of the argument \code{treatXasY}. This argument takes a list where the names correspond to ``new'' variable types (\code{X}), while the entries must be supported variable types (\code{Y}). For instance, we can instruct \code{dataMaid} to treat complex variables as numerics and generate a data report for a type \code{complex} variable like this:
\begin{Schunk}
\begin{Sinput}
R> complexData <- data.frame(complexVar = complex(100, real = 1:100,
+ imaginary = 3), numericVar = 1:100)
R> makeDataReport(complexData, treatXasY=list(complex = "numeric"),
+ replace=TRUE)
\end{Sinput}
\end{Schunk}
In this report, we will find that the two variables, \code{complexVar} and \code{numericVar}, will have identical presentations in the variable list, as treating a \code{complex} variable as a \code{numeric} means dropping the imaginary part of the complex numbers, which was the only thing setting the two variables apart in the first place.
\section[Using dataMaid interactively]{Using \pkg{dataMaid} interactively}
\label{sec:interactiveCleanR}
While overview documents are great for presenting and documenting the data at various stages
of the data cleaning process, it may be useful to be able to work more
interactively when performing actual data cleaning. Aside from
the \code{makeDataReport()} function presented above, \pkg{dataMaid} also
provides more standard \proglang{R} interactive tools, such as
functions that print results to the console or return the information
as an object for later use. This section describes how to use the
functions \code{check()}, \code{summarize()} and \code{visualize()} to
work interactively with \pkg{dataMaid}.
\subsection{Data cleaning by hand: An example}
Assume that we wish to look further into a certain variable from
\code{toyData}, namely \code{events}. The data cleaning summary found some
issues in this variable, and we would like to recall what these issues
were. This can be done using the \code{check()} command
\begin{Schunk}
\begin{Sinput}
R> check(toyData$events)
\end{Sinput}
\begin{Soutput}
$identifyMissing
The following suspected missing value codes enter as regular values: 999, NaN.
$identifyOutliers
Note that the following possible outlier values were detected: 82, 999.
\end{Soutput}
\end{Schunk}
Note that the arguments specifying which checks to perform, as
described in the previous section, are in fact passed to \code{check()},
and thus they can also be used here. For instance, if we only want to
check for potentially miscoded missing values, we can use the \code{checks} argument and the \code{setChecks()} helper function to specify this.
Recall that Table~\ref{table.SVCfunctions} or an \code{allCheckFunctions()} call provide
overviews of the available check functions.
Moving forward, we limit the numeric checks to only identify miscoded
missing values:
\begin{Schunk}
\begin{Sinput}
R> check(toyData$events, checks = setChecks(numeric = "identifyMissing"))
\end{Sinput}
\begin{Soutput}
$identifyMissing
The following suspected missing value codes enter as regular values: 999, NaN.
\end{Soutput}
\end{Schunk}
An equivalent way to call only a single, specific \code{checkFunction},
such as \code{identifyMissing}, is by using it directly on the variable,
e.g.,
\begin{Schunk}
\begin{Sinput}
R> identifyMissing(toyData$events)
\end{Sinput}
\begin{Soutput}
The following suspected missing value codes enter as regular values: 999, NaN.
\end{Soutput}
\end{Schunk}
The result of a \code{checkFunction} is an object of class
\code{checkResult}. By using the structure function, \code{str()}, we can
look further into its components:
\begin{Schunk}
\begin{Sinput}
R> missEvents <- identifyMissing(toyData$events)
R> str(missEvents)
\end{Sinput}
\begin{Soutput}
List of 3
$ problem : logi TRUE
$ message : chr "The following suspected missing value codes enter as regular values: \\\"999\\\", \\\"NaN\\\"."
$ problemValues: num [1:2] 999 NaN
- attr(*, "class")= chr "checkResult"
\end{Soutput}
\end{Schunk}
The most important thing to note here is that while the printed
message is made for easy reading, the actual values of the variable
causing the issue are still obtainable in the entry
\code{problemValues}. If we decide that the values \code{999}
and \code{NaN} in \code{events} are in fact miscoded missing values, we can
easily replace them with \code{NA}s:
\begin{Schunk}
\begin{Sinput}
R> toyData$events[toyData$events %in% missEvents$problemValues] <- NA
R> identifyMissing(toyData$events)
\end{Sinput}
\begin{Soutput}
No problems found.
\end{Soutput}
\end{Schunk}
Similarly, the \code{visualize()} and \code{summarize()} functions can be
used to run the corresponding visualizations and summaries for each
variable. See Figure~\ref{fig:example3} for the visualization output.
\begin{Schunk}
\begin{Sinput}
R> visualize(toyData$events)
R> summarize(toyData$events)
\end{Sinput}
\begin{Soutput}
$variableType
Variable type: numeric
$countMissing
Number of missing obs.: 4 (26.67 %)
$uniqueValues
Number of unique values: 6
$centralValue
Median: 4
$quartiles
1st and 3rd quartiles: 1.5; 6
$minMax
Min. and max.: 1; 82
\end{Soutput}
\end{Schunk}
\begin{figure}[tb]
\begin{center}
\includegraphics[width=7.5cm]{article_vol3-toydataevents.pdf}
\end{center}
\caption{Output from running \code{visualize()} on variable \code{events} from the
\code{toyData} dataset.}
\label{fig:example3}
\end{figure}
As we saw with the \code{check()} function, the summary can be modified
by using the \code{summaries} argument and the \code{setSummaries()} helper function. If we want to remove the default summaries \code{variableType} and \code{countMissing} for numeric variables, we can use the function \code{defaultNumericSummaries()} and its argument \code{remove} that excludes a vector of summaries from the usual default summaries:
\begin{Schunk}
\begin{Sinput}
R> summarize(toyData$events,
+ summaries = setSummaries(
+ numeric = defaultNumericSummaries(remove = c("variableType",
+ "countMissing"))))
\end{Sinput}
\begin{Soutput}
$uniqueValues
Number of unique values: 6
$centralValue
Median: 4
$quartiles
1st and 3rd quartiles: 1.5; 6
$minMax
Min. and max.: 1; 82
\end{Soutput}
\end{Schunk}
The syntax in this code chunk can be read as follows: ``Summarize \code{events} in \code{toyData}, and for \code{numeric} variables, set the summaries to be the default summary functions, except \code{variableType} and \code{countMissing}.''
Similar \code{defaultXXXSummaries()} functions are available for the other supported variable classes. For checks, the same syntax can also be used, but the helper functions are now named \code{defaultXXXChecks} with \code{XXX} as a placeholder for a supported variable class.
Note that the \code{summarize()}, \code{check()} and \code{visualize()} functions are also available interactively for full datasets by calling e.g., \code{summarize(toyData)}. However, this produces an extensive amount of output in the console, and therefore, we generally do not recommend it, unless working with very small datasets or subsets of datasets.
\section{A worked example: Dirty presidents}
\label{sec:bigExample}
\begin{figure}[tb]
\begin{center}
\frame{\includegraphics[width=7.5cm,page=1]{dataMaid_presidentData_appB.pdf}}
\frame{\includegraphics[width=7.5cm,page=2]{dataMaid_presidentData_appB.pdf}}\\
\end{center}
\caption{The front page and the first page of the data overview report for the \code{presidentData} dataset. Note that the report title has been customized (front page), \code{identifyLoners} has been removed from the checks performed on character variables (``Identify levels with $<$ 6 obs.'' is not checked for character variables in the table on page 1) and that variables of class \code{Name} have been set to be treated like \code{character} variables (page 1). Larger versions of the pages can be seen in
Appendix~\ref{sec:appendix2}.}
\label{fig:bigExampleP01}
\end{figure}
\begin{figure}[tb]
\begin{center}
\frame{\includegraphics[width=7.5cm,page=3]{dataMaid_presidentData_appB.pdf}}
\frame{\includegraphics[width=7.5cm,page=4]{dataMaid_presidentData_appB.pdf}}
\end{center}
\caption{The second and third pages of the \code{presidentData} data report. We see that there are two \code{Name} variables in the overview on page 2 and see that these variables are indeed treated as \code{character} variables on page 3, as specified in the \code{makeDataReport} call by use of the \code{treatXasY} argument. Larger versions of the pages can be seen in
Appendix~\ref{sec:appendix2}.}
\label{fig:bigExampleP23}