This repository has been archived by the owner on Aug 27, 2019. It is now read-only.
/
vizRguide.tex
1866 lines (1602 loc) · 67.1 KB
/
vizRguide.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% Time-stamp: <2015-04-24 11:56:50 chl>
%
% Here is a temptative textbook on R graphics with the lattice package.
%
\documentclass[a4paper,twoside]{book}
\let\newfloat\undefined
% Some packages I need: mdframed (colored frames with rounded corners)
% because using tikz would simply be... too much; graphicx for
% handling images (of course!); url (at some point, I'll need to
% decide whether to rely on href or not); and xspace to handle leading
% space after \R. The lipsum package should be removed when I don't
% need it anymore.
\usepackage[framemethod=1,skipbelow=\topskip,skipabove=\topskip]{mdframed}
\usepackage{graphicx}
\usepackage{url}
\usepackage{lipsum}
\usepackage{xspace}
% Override default headings and chapter style
\usepackage{fancyhdr}
\usepackage[Sonny]{fncychap}
\usepackage[eventxtindent=3pt,oddtxtexdent=3pt]{thumbs}
% Use margin note
\usepackage{marginnote}
% Hmisc Tables
\usepackage{ctable,setspace,relsize}
% To handle illustrated code chunks, I'm using a combination of the
% following two packages:
\usepackage{caption}
\usepackage{subfig}
\usepackage{floatrow}
% \makeatletter\@mparswitchfalse\makeatother
% \DeclareMarginSet{hangleft}%
% {\setfloatmargins
% {\hskip-\marginparwidth\hskip-\marginparsep}{\hfil}}
% \DeclareColorBox{framedfigure}{\fcolorbox{gray}{white}}
\floatsetup[figure]{facing=yes,capbesideposition={top,inside},capbesidesep=quad}
\floatsetup[widefloat]{margins=hangoutside,
%captionskip=3pt,
%framestyle=colorbox,framefit=yes,
%colorframeset=framedfigure,
%frameset={\fboxrule3pt\fboxsep8pt},
%framestyle=fbox,framearound=row,
%frameset={\fboxrule1pt\fboxsep10pt},
style=plaintop}
\captionsetup[widefloat]{justification=RaggedRight,name=Box,skip=15pt,
font={sf},labelfont={normalsize,sf,bf},
labelsep=newline,strut=no}
% and this is to avoid leaving too much blank between adjacent floats
\setlength{\floatsep}{1.25\baselineskip plus 3pt minus 2pt}
\setlength{\intextsep}{1.2ex}
% Some extra symbols (to mimic R's pch=0,1)
\usepackage{wasysym}
% Here is for the index
\usepackage{makeidx}
\usepackage{multicol}
\makeatletter
\renewenvironment{theindex}
{\if@twocolumn
\@restonecolfalse
\else
\@restonecoltrue
\fi
\setlength{\columnseprule}{0pt}
\setlength{\columnsep}{35pt}
\begin{multicols}{3}[\section*{\indexname}]
\markboth{\MakeUppercase\indexname}%
{\MakeUppercase\indexname}%
\thispagestyle{plain}\footnotesize
\setlength{\parindent}{0pt}
\setlength{\parskip}{0pt plus 0.3pt}
\relax
\let\item\@idxitem}%
{\end{multicols}\if@restonecol\onecolumn\else\clearpage\fi}
\makeatother
% I'm collecting pictures from a single PDF file generated by R.
% The macro takes one argument, although it should not be necessary
% because I'm using a single PDF file. However, I thought of extending
% this textbook to ggplot, in which case I would need to handle two
% PDF files of external figures.
\usepackage{pdfpages}
\newcounter{fig}
\setcounter{fig}{1}
\newcommand{\img}[1]{\includegraphics[page={\value{fig}},width=2in]{#1}\stepcounter{fig}}
\newcommand{\simg}[1]{\includegraphics[page={\value{fig}},width=1.5in]{#1}\stepcounter{fig}}
% Version control
\immediate\write18{sh ./vc}
\input{vc}
% Bibliography management
\usepackage[style=verbose-trad2,natbib=true]{biblatex}
\bibliography{refs}
\usepackage[hang,flushmargin]{footmisc}
% Setup for XeLaTeX
\usepackage{fontspec,xltxtra,xunicode}
\usepackage{unicode-math}
\defaultfontfeatures{Scale=MatchLowercase}
\setromanfont[Mapping=tex-text]{Minion Pro}
\setmathfont{Asana Math}
\setsansfont[Mapping=tex-text]{Myriad Pro}
\setmonofont[Scale=0.91]{Inconsolata}
\newfontfamily{\titlefont}[Scale=1.4]{Fertigo Pro}
% Some custom colors (not all needed)
\definecolor{grey30}{rgb}{.3,.3,.3}
\definecolor{grey70}{rgb}{.7,.7,.7}
\definecolor{mybrown}{rgb}{.5,.4,.3}
\definecolor{myred}{rgb}{.75,.19,.19}
\definecolor{myred2}{rgb}{.87,.38,.38}
\definecolor{myred3}{rgb}{.65,.38,.38}
\definecolor{myred4}{rgb}{.83,.64,.64}
\definecolor{shadecolor}{rgb}{.87,.93,.89}
\definecolor{cornflowerblue}{rgb}{.39,.58,.93}
\definecolor{seagreen}{rgb}{.18,.54,.34}
\definecolor{dandelion}{rgb}{.94,.88,.19}
\definecolor{peach}{rgb}{.996,.851,.723}
\definecolor{tan}{rgb}{.82,.70,.55}
\definecolor{periwinkle}{rgb}{.80,.80,1}
% Here is how I customized a listings environment for pretty printing
% R code. Keywords are highlighted and automatically appended to the
% index file.
\usepackage[formats]{listings}
\lstdefineformat{tilde}{\~=\textcolor{myred3}{$\mathtt{\sim}$}}
\lstloadlanguages{R}
\lstdefinelanguage{Renhanced}[]{R}{%
morekeywords={data.frame,within,as.numeric,cut,xyplot,histogram,%
groups,breaks,include.lowest,with,pch,panel.xyplot,%
replicate,alpha,colorRampPalette,jitter.x,amount,%
panel.rug,brewer.pal,rgb,jitter.y,describe,%
summary.formula,type,scales,xscale.components,%
yscale.components,xscale.components.log10.3,%
yscale.components.log10.3,box.ratio,limits,at,%
panel.bwplot,panel.points,cex,zoo,plot.points,%
ref,panel.histogram,panel.mathdensity,dmath,border,%
densityplot,nint,tck,alternating,boxplot.stats,%
na.rm,span,jitter.data,horizontal,draw,stripchart,%
aspect,seg.col,seg.lwd,xlab,ylab,varwidth,fill,bw,%
par.settings,simpleTheme,lwd,panel.xyarea,lag,%
panel.superpose,superpose,group.number,panel.groups,%
overlap,ts.union,layer,panel.tskernel,panel.lines,%
rollmean,number,strip,strip.left,filter,panel.xblocks,%
method,auto.key,columns,horizonplot,colorkey,qqmath,%
ecdfplot,dist,f.value,FUN,barchart,lty,panel.average,%
fun,col.regions,rot,scale,relevel,cut2,as.character,%
adj,panel.text,bystats,bystats2,summarize,summary.formula,%
colMeans,mApply,cut2,as.data.frame,matrix2dataFrame,%
impute,transcan,aregImpute,select},
deletekeywords={*,/,gaussian,count}}
\lstset{language=Renhanced,
format=tilde,
keepspaces=true,
%escapeinside={\%*}{*)},
extendedchars=true,
alsoletter={.},
upquote=true,
basicstyle=\small\ttfamily,
commentstyle=\color{myred4},
keywordstyle=\color{myred3},
showstringspaces=false,
index=[1][keywords],
indexstyle=\indexfoo}
\newcommand{\indexfoo}[1]{\index{#1@\textcolor{grey30}{\tt #1}}}
%\lstalias{Rcap}{Renhanced}
\makeatletter
\lst@AddToHook{TextStyle}{\let\lst@basicstyle\normalsize\ttfamily}
\makeatother
% I also redefine verbatim text to be handled by listings. This is
% weird because every verbatim is going to be treated as R code, but
% it is handy at some point. The rest is purely to avoid typing the
% same text repetitively.
\renewcommand{\texttt}[1]{\lstinline{#1}}
%\makeatletter
%\lst@AddToHook{Rcap}{\let\lst@basicstyle\small}
%\makeatother
\newcommand{\R}{\textsf{R}\xspace}
\newcommand{\mytilde}{\textcolor{myred3}{$\sim$}\xspace}
% R code chunk are embedded in a smooth color frame. (That's also
% weird because I cannot use lstlistings for any other purpose, whih
% is why I'll be using standard verbatim environment for R session or
% other stuff.)
\mdfsetup{innerbottommargin=0pt,leftmargin=0pt,rightmargin=0pt,%
backgroundcolor=shadecolor,linecolor=none,roundcorner=5}
\BeforeBeginEnvironment{lstlisting}{\begin{mdframed}\vskip-.5\baselineskip}
\AfterEndEnvironment{lstlisting}{\end{mdframed}}
% I'm using temporarily this example title page, roughly copy/pasted from the
% Memoir class package.
\definecolor{Medium}{gray}{.6}
\newlength{\tpheight}\setlength{\tpheight}{0.9\textheight}
\newlength{\txtheight}\setlength{\txtheight}{0.9\tpheight}
\renewcommand*{\marginfont}{\color{grey70}\sffamily\footnotesize}
% I don't want indentation for new paragraph, but just increase
% vertical separation between them.
\setlength{\parindent}{0pt}
\setlength{\parskip}{1ex plus 0.5ex minus 0.2ex}
% That's the magic command to tell LaTeX to start indexing the document.
\makeindex
%
% End of preamble.
%
\begin{document}
\pagenumbering{arabic}
\pagestyle{empty}
% This is merely a copy/paste from the memoir class, see T&H
% Typography.
% I should rework it when I have time, though I find it pretty nice
% actually...
\begingroup%
\raggedleft
\vspace*{\baselineskip}
{\Large\textsf{Christophe Lalanne}}\\[0.167\txtheight]
{\Large\titlefont A Visual Guide to}\\[\baselineskip]
{\textcolor{Medium}{\Huge\titlefont R Graphics and}}\\[\baselineskip]
{\textcolor{Medium}{\Huge\titlefont Data Munging}}\\[\baselineskip]
{\tiny\titlefont With 65 illustrations}\par
\vfill
{\includegraphics[scale=.75]{logo}\\{\small \tt \GITAbrHash}\\{\small %
\tt \VCDateTEX}}\par
\vspace*{3\baselineskip}
\endgroup
\pagestyle{empty}
\cleardoublepage
\frontmatter
\pagestyle{empty}
\vspace*{\fill}
\centerline{\includegraphics[width=.6\textwidth]{wcindex}}
\vspace*{3\baselineskip}
\hfill{\footnotesize \emph{The greatest value of a picture is when it forces us to notice what we
never expected to see}. --- John Tukey}
\hfill{\footnotesize \emph{At their best, graphics are instruments for reasoning}. --- Edward Tufte}
\vspace*{\fill}
\cleardoublepage
\pagenumbering{roman}
\tableofcontents
\mainmatter
\pagenumbering{arabic}
% --------------------------------------------------------------- Chapter 1 --
\chapter{Getting started with \R graphics}
\section{Why \R?}
In the ``GNU world'', most of the plotting program expect data from
text file (tab-delimited or csv) arranged by columns, with extra
grouping levels denoted by string or integer codes. This is the case
with \textsf{gnuplot}\autocite{janert09} (\url{http://www.gnuplot.info/}) or
\textsf{plotutils} (\url{http://www.gnu.org/software/plotutils/}), for
example.
Here is how we could create an histogram in gnuplot, from a series of
500 gaussian variates generate using \R as follows:
\begin{verbatim}
$ Rscript -e 'cat(rnorm(500), sep="\\n")' > rnd.dat
\end{verbatim}
Then, in gnuplot, we can run
\begin{verbatim}
bw=0.2
n=500
bin(x,width)=width*int(x/width)
tstr(n)=sprintf("Binwidth = %1.1f\n", n)
set xrange [-3:3]
set yrange [0:1]
set boxwidth bw
plot 'rnd.dat' using (bin($1,bw)):(1./(bw*n)) smooth frequency \
with boxes title tstr(bw)
\end{verbatim}
to get the picture shown below. (Note that we didn't try to customize
anything, except the title.)
\centerline{\includegraphics[width=.75\textwidth]{gnuplot}}
The above example shows one important aspect of using a dedicated
statistical package: \textsf{Gnuplot} has no function to draw an
histogram, and we have to write some code to perform additional tasks,
like binning in this case. The same applies for
\textsf{plotutils}. We could make use of external programs to do that,
like the \textsf{GSL} library (see example~22.11 from the manual,
\url{http://www.gnu.org/software/gsl/manual/}), but this two-stage
approach is rather likely to be cumbersome for repetitive tasks.
The author found that \textsf{Stata} is one of the only great
alternative to \R, but it has its cost. In fact, this textbook is
largely inspired from one of Stata Press book on \textsf{Stata}
graphical capabilities\autocite{mitchell08}.
\section{The \R graphical model}
\section{Base vs. {\tt lattice} graphics}
\R comes with several graphical system, including so-called ``Base''
graphics, which rely on \texttt{grDevices} and ??. While these plotting
functions are quite useful, they lack several interesting aspects that are
available through \texttt{grid}-based packages, including \texttt{lattice}
and \texttt{ggplot2}, and in particular the \texttt{update} and
\texttt{print} function.
\section{The grammar of graphics}
% --------------------------------------------------------------- Chapter 2 --
\chapter{Data management}
\addthumb{Data munging}{\Large\textsf{Tidying}}{white}{cornflowerblue}
\section{Structuring data}
In \R, the most common structure used to store a data set with
mixed-type variables is a \texttt{data.frame}. Such an \R object
presents several characteristics that makes it most appropriate for
managing statistical data structure, with few exceptions (e.g., when
one only has to work with aggregated data or two-way tables). It
should be noted that other data structures might be more appropriate,
for example when one is interested in time series analysis, but see
the \texttt{zoo} package\autocite{zeilis05}.
Many \R functions accept \texttt{data.frame} as input, and further
allow to subset or index it for computation or visualization
purpose. In addition to receiving a \texttt{data.frame}, some \R
commands allow to use a \emph{formula} notation, where the right and
left-hand side are separated by the \mytilde (tilde) operator. The
use of \index{formula} together with a \texttt{data.frame} simplify the
accession of variable in a given environment.
This is especially true when using the \texttt{lattice} package which
is entirely based on formula, even if this is not apparent at
first sight.
\section{Managing data}
Consider, for example, the \emph{low birth study} which is discussed
at length in Hosmer and Lemeshow's textbook on logistic regression
\autocite{hosmer89}. A quick look at the variables should make it
clear that they won't be treated the way we like them to be
considered: mother's ethnicity status (\texttt{race}) takes three
integer values, without any explicit meaning.
\begin{verbatim}
data(birthwt, package="MASS")
str(birthwt)
\end{verbatim}
\subsection{Variable recoding and annotation}
The \texttt{Hmisc} package includes numerous \R functions that will
facilitate the task of data checking (\texttt{describe} provides
``codebook'' facilities), summarizing (\texttt{summary.formula}) or
aggregating data, (\texttt{summary.formula}).
Here are some examples of use with the \texttt{birthwt} data. For
illustration purpose, we set some observations as missing on two variables
(\texttt{age} and \texttt{ftv}).
\begin{verbatim}
set.seed(101)
birthwt$age[5] <- NA
birthwt$ftv[sample(1:nrow(birthwt), 5)] <- NA
yesno <- c("No", "Yes")
birthwt <- within(birthwt, {
smoke <- factor(smoke, labels = yesno)
low <- factor(low, labels = yesno)
ht <- factor(ht, labels = yesno)
ui <- factor(ui, labels = yesno)
race <- factor(race, levels = 1:3, labels = c("White", "Black", "Other"))
lwt <- lwt/2.2 ## weight in kg
})
\end{verbatim}
It often helps to keep variable names short and informative, and to have
separated labels and/or units in case of continuous measurements. This is
available via the \texttt{label} and \texttt{units} functions. Note that
\texttt{label} allows to annotate the data frame as well.
\begin{verbatim}
library(Hmisc)
label(birthwt$age) <- "Mother age"
units(birthwt$age) <- "years"
label(birthwt$bwt) <- "Baby weight"
units(birthwt$bwt) <- "grams"
label(birthwt, self = TRUE) <- "Hosmer & Lemeshow's low birth weight study."
\end{verbatim}
These labels can then be used in almost every \texttt{Hmisc} functions, even
when using \texttt{list.tree} in place of \texttt{str}. However, we will see
that there mostly useful when generating Tables or Figures.
The \texttt{contents} command offers a quick summary of data format and
missing values, and it provides a list of labels associated to variables
treated as factor by R.
Once we have a working data frame, the functions \texttt{contents} and
\texttt{describe} provide two quick summary of the data.
\begin{verbatim}
> contents(birthwt)
Data frame:birthwt 189 observations and 10 variables Maximum # NAs:5
Labels Units Levels Class Storage NAs
low 2 integer 0
age Mother age years integer integer 1
lwt double 0
race 3 integer 0
smoke 2 integer 0
ptl integer 0
ht 2 integer 0
ui 2 integer 0
ftv integer 5
bwt Baby weight grams integer integer 0
+--------+-----------------+
|Variable|Levels |
+--------+-----------------+
| low |No,Yes |
| smoke | |
| ht | |
| ui | |
+--------+-----------------+
| race |White,Black,Other|
+--------+-----------------+
\end{verbatim}
As can be seen, \texttt{contents} displays storage mode and class of R
variables that are present in the data frame. The number of missing values
is also reported for each variable. The levels of each \texttt{factor}-type
variable is also printed at the end of the output.
\begin{verbatim}
> describe(subset(birthwt, select = c(low, age, race, ftv)), digits = 3)
4 Variables 189 Observations
--------------------------------------------------------------------------------
low
n missing unique
189 0 2
No (130, 69%), Yes (59, 31%)
--------------------------------------------------------------------------------
age : Mother age [years]
n missing unique Info Mean .05 .10 .25 .50 .75
188 1 24 1 23.3 16 17 19 23 26
.90 .95
31 32
lowest : 14 15 16 17 18, highest: 33 34 35 36 45
--------------------------------------------------------------------------------
race
n missing unique
189 0 3
White (96, 51%), Black (26, 14%), Other (67, 35%)
--------------------------------------------------------------------------------
ftv
n missing unique Info Mean
184 5 6 0.83 0.783
0 1 2 3 4 6
Frequency 98 45 30 7 3 1
% 53 24 16 4 2 1
--------------------------------------------------------------------------------
\end{verbatim}
The \texttt{describe} function gives more details, and, in particular,
provides useful summary statistics for each variable. Here, we only
considered four variables: a binary variable (\texttt{low}), a continuous
measure (\texttt{age}), a three-level factor (\texttt{race}), and a count
variable (\texttt{ftv}). The output will be different depending on the type
of variable, and for all but continuous measures (defined as variable taking
at least 10 distinct values) \texttt{describe} will display a table of
counts and frequencies.
\subsection{Variable transformation}
In case we would like to consider one of the above factors as a
numerical variable, we can now use \texttt{as.numeric} and \R will
take care of attributing the lowest integer score to the baseline
category. Of course, there might be occasion where we would like to
change that reference level; or, we might want to collapse two
discrete categories. Again, there are simple commands to do that, for
example:
\begin{verbatim}
birthwt$low <- relevel(birthwt$low, "Yes")
levels(birthwt$race)[2:3] <- "Black+Other"
\end{verbatim}
Another common task consists in transforming some predictors, either
for visualization purpose or when building an explantory or predictive
model. As a simple example, we can imagine centering some of the
predictors of interest, like \texttt{age}, in the above example. The
\texttt{within} or \texttt{transform} command can be used to append
the centered variable to the list of variables present in the
\texttt{data.frame}:
\begin{verbatim}
birthwt <- transform(birthwt, age.c=scale(age, scale=FALSE))
\end{verbatim}
Likewise, we may want to recode previous premature labours
(\texttt{ptl}) as yes/no and number of physician visits during the
first trimester (\texttt{ftv}) as one/more than one, like shown below
(we show two different syntax that basically perform the same task by
relying on \R indexing):
\begin{verbatim}
birthwt <- transform(birthwt, ptl.yn=factor(ptl > 0, labels=c("No","Yes")),
ftv.c=factor(ifelse(ftv < 2, "1", "2+")))
\end{verbatim}
\texttt{Hmisc} provides a replacement for R's \texttt{cut} function with
better default options (especially the infamous \texttt{include.lowest =
FALSE}) to discretize a continuous variable. The \texttt{cut2} function
has many useful options, including \texttt{g=} and \texttt{levels.mean=} to
return $g$ classes and report center of each class instead of class
intervals:
\begin{verbatim}
table(cut2(birthwt$age, g = 3, levels.mean = TRUE, digits = 3))
\end{verbatim}
If there is some reason to treat \texttt{ftv} as an ordered factor,
a command like
\begin{verbatim}
as.ordered(cut2(birthwt$ftv, c(0, 1, 2, 6)))
\end{verbatim}
might do the job.
\section{Indexing, subsetting, conditioning}
A lot of statistical operations that practictioners use to apply on a
given dataset are mostly variations around the idea of indexing or
subsetting. By comparison, SQL-like operations would be selection and
projection.
% FIXME: add more details here
The \texttt{subset} command offers a simple and elegant way to combine both
operations: for a given data frame, the \texttt{subset =} option is used to
filter rows according to logical expression or simple row indexes while the
\texttt{select =} option is used to return only selected variables based on
their index (e.g., column 1 and 3) or an unquoted name (e.g.,
\texttt{c(low,lwt)}).
The following instruction will print the age of hypertensive mothers whose
baby was underweight:
\begin{verbatim}
subset(birthwt, low == "Yes" & ht == "Yes", age)
\end{verbatim}
It should be noted that \texttt{subset} returns a data frame.
Some people might prefer to use their favorite SQL-like language, so
something like this would perfectly fit their needs:
\begin{verbatim}
library(sqldf)
sqldf("SELECT age FROM birthwt WHERE low = 0 AND ht = 1 LIMIT 3", row.names = TRUE)
\end{verbatim}
Unfortunately, this doesn't work with ``labelled'' objects, as typically
returned by \texttt{Hmisc}, although a solution is readily available at \url{http://stackoverflow.com/q/2394902/420055}.
% FIXME: should be moved in a dedicated subsection?
There are also a bunch of command dedicated to variables clustering,
analysis of missing patterns, or simple (\texttt{impute}) or multiple
(\texttt{aregImpute}, \texttt{transcan}) imputation methods. Here is how we
would impute missing values with the median in the case of a continuous
variable:
\begin{verbatim}
lwt <- birthwt$lwt
lwt[sample(length(lwt), 10)] <- NA
lwt.i <- impute(lwt)
summary(lwt.i)
\end{verbatim}
Missing observations will be marked with an asterisk when we print the whole
object in R. To use the mean instead of the median, we just have to add the
\texttt{fun = mean} option.
\section{Summarizing data}
Statisticians generally spend a great part of their time in data
cleansing, data transformation or re-expression\autocite{hoaglin83},
and data visualization.
\subsection{Base R functions}
Both the \texttt{by} and \texttt{tapply} function allow to apply a builtin
or custom function to help summarizing a numeric variable by a categorical
variable. Most of the time, these two functions are less handy than
\texttt{aggregate} since the latter offers a formula interface and returns a
data frame.
\begin{verbatim}
aggregate(bwt ~ ui + I(ftv > 1), data = birthwt, mean)
\end{verbatim}
There is, however, one caveat when using \texttt{aggregate}: even if you can pass a
custom function that returns multiple values that can be printed on screen,
the resulting data frame will still have only one column for its output. For
exemple, the dimensions of the data frame created by the following call to
\texttt{aggregate} is 4 by 3, even if it looks like there two separate
columns for \texttt{bwt} mean and SD.
\begin{verbatim}
f <- function(x) c(mean = mean(x), sd = sd(x))
aggregate(bwt ~ ui + I(ftv > 1), data = birthwt, f)
\end{verbatim}
\subsection{The \texttt{Hmisc} package}
There are three useful commands that provide summary statistics for a list
of variables. They implement the ``split-apply-combine''
strategy\autocite{wickham11} in the spirit of R's built-in functions (unlike
\texttt{plyr}).
The first one, \texttt{summarize}, can be seen as an equivalent to R's
\texttt{aggregate} command. Given a response variable and one or more
classification factors, it applies a specific function to all data chunk,
where each chunk is defined based on factor levels. The results are stored
in a matrix, which can easily be coerced to a data frame
using, e.g., \texttt{as.data.frame} or \texttt{Hmisc::matrix2dataFrame}.
Note that enhanced results can be printed on the console using the
\texttt{prn} command.
Here is a first example, using a dedicated function to print the mean and
standard deviation (SD) of a numerical variable:
\begin{verbatim}
f <- function(x, na.opts = TRUE) c(mean = mean(x, na.rm = na.opts), sd = sd(x,
na.rm = na.opts))
out <- with(birthwt, summarize(bwt, race, f))
\end{verbatim}
Here is the output produced by R:
\begin{verbatim}
Average baby weight by ethnicity out
race bwt sd
3 White 3103 727.9
1 Black 2720 638.7
2 Other 2805 722.2
\end{verbatim}
Contrary to \texttt{aggregate}, this command provides multiway data
structure in case we ask to compute more than one quantity. In this case,
the dimension of \texttt{out} is a 3 by 3 data frame.
Summarizing multivariate responses or predictors is also possible, with either \texttt{summarize} or \texttt{mApply}. Of course, any built-in functions, such as \texttt{colMeans} could be used in place of our custom summary command.
\begin{verbatim}
with(birthwt, summarize(bwt, llist(race, smoke), f))
\end{verbatim}
The second command, \texttt{bystats}, (or \texttt{bystats2} for two-way
tabular output) allows to describe with any custom or built-in function one
or multiple outcome by two explanatory variables, or even more. Sample size
and the number of missing values are also printed.
With the following instruction,
\begin{verbatim}
with(birthwt, bystats(cbind(bwt, lwt), smoke, race))
\end{verbatim}
we would get
\begin{verbatim}
Mean of cbind(bwt, lwt) by smoke
N bwt lwt
No White 44 3429 63.11
Yes White 52 2827 57.41
No Black 16 2854 67.93
Yes Black 10 2504 64.82
No Other 55 2816 54.16
Yes Other 12 2757 56.36
ALL 189 2945 59.01
\end{verbatim}
whereas
\begin{verbatim}
with(birthwt, bystats2(lwt, smoke, race))
\end{verbatim}
gives
\begin{verbatim}
Mean of lwt by smoke
+----+
|N |
|Mean|
+----+
+---+-----+-----+-----+-----+
|No | 44 | 16 | 55 |115 |
| |63.11|67.93|54.16|59.50|
+---+-----+-----+-----+-----+
|Yes| 52 | 10 | 12 | 74 |
| |57.41|64.82|56.36|58.24|
+---+-----+-----+-----+-----+
|ALL| 96 | 26 | 67 |189 |
| |60.02|66.73|54.55|59.01|
+---+-----+-----+-----+-----+
\end{verbatim}
The third and last command is \texttt{summary.formula}, which can be
abbreviated as \texttt{summary} as long as formula is used to describe
variables relationships. There are three main configurations
(\texttt{method=}): "response", where a numerical variable is summarized for
each level of one or more variables (numerical variables will be discretized
in 4 classes), as \texttt{summarize} does; "cross", to compute conditional
and marginal means of several response variables described by at most 3
explanatory variables (again, continuous predictors are represented as
quartiles); "reverse", to summarize univariate distribution of a set of
variables for each level of a classification variable (which appears on the
left-hand side of the formula). Variables are viewed as continuous as long
as they have more than 10 distinct values, but this can be changed by
setting, e.g., \texttt{continuous = 5}. When \texttt{method = "reverse"}, it
is possible to add \texttt{overall = TRUE}, \texttt{test = TRUE} to add
overall statistics and corresponding statistical tests of null effect
between the groups.
Here are some examples of use, with their outputs.
\begin{verbatim}
> summary(bwt ~ race + ht + lwt, data = birthwt)
Baby weight N=189
+-------+------------+---+----+
| | |N |bwt |
+-------+------------+---+----+
|race |White | 96|3103|
| |Black | 26|2720|
| |Other | 67|2805|
+-------+------------+---+----+
|ht |No |177|2972|
| |Yes | 12|2537|
+-------+------------+---+----+
|lwt |[36.4, 50.9)| 53|2656|
| |[50.9, 55.5)| 43|3059|
| |[55.5, 64.1)| 46|3075|
| |[64.1,113.6]| 47|3038|
+-------+------------+---+----+
|Overall| |189|2945|
+-------+------------+---+----+
\end{verbatim}
\begin{verbatim}
> summary(cbind(lwt, age) ~ race + bwt, data = birthwt, method = "cross")
mean by race, bwt
+-------+
|N |
|Missing|
|lwt |
|age |
+-------+
+-----+-----------+-----------+-----------+-----------+-----+
| race|[ 709,2424)|[2424,3005)|[3005,3544)|[3544,4990]| ALL |
+-----+-----------+-----------+-----------+-----------+-----+
|White| 19 | 23 | 20 | 33 | 95 |
| | 0 | 1 | 0 | 0 |1 |
| | 55.55 | 57.67 | 62.23 | 63.25 |60.14|
| | 22.74 | 24.78 | 24.50 | 24.91 |24.36|
+-----+-----------+-----------+-----------+-----------+-----+
|Black| 9 | 9 | 6 | 2 | 26 |
| | 0 | 0 | 0 | 0 |0 |
| | 65.10 | 59.70 | 70.83 | 93.41 |66.73|
| | 23.44 | 20.89 | 20.00 | 20.50 |21.54|
+-----+-----------+-----------+-----------+-----------+-----+
|Other| 20 | 16 | 19 | 12 | 67 |
| | 0 | 0 | 0 | 0 |0 |
| | 51.23 | 52.95 | 58.90 | 55.34 |54.55|
| | 22.20 | 22.69 | 22.26 | 22.50 |22.39|
+-----+-----------+-----------+-----------+-----------+-----+
|ALL | 48 | 48 | 45 | 47 |188 |
| | 0 | 1 | 0 | 0 |1 |
| | 55.54 | 56.48 | 61.97 | 62.51 |59.06|
| | 22.65 | 23.35 | 22.96 | 24.11 |23.27|
+-----+-----------+-----------+-----------+-----------+-----+
\end{verbatim}
\begin{verbatim}
> summary(low ~ race + ht, data = birthwt, fun = table)
low N=189
+-------+-----+---+---+---+
| | |N |No |Yes|
+-------+-----+---+---+---+
|race |White| 96| 73|23 |
| |Black| 26| 15|11 |
| |Other| 67| 42|25 |
+-------+-----+---+---+---+
|ht |No |177|125|52 |
| |Yes | 12| 5| 7 |
+-------+-----+---+---+---+
|Overall| |189|130|59 |
+-------+-----+---+---+---+
\end{verbatim}
Finally, here is a more complex Table produced by \texttt{summary.formula}
with \texttt{method = "reverse"}.
\begin{verbatim}
out <- summary(low ~ race + age + ui, data = birthwt, method = "reverse", overall = TRUE,
test = TRUE)
print(out, prmsd = TRUE, digits = 2)
\end{verbatim}
Although we display the output on the console directly, it is also possible
to build a $\LaTeX$ Table using the \texttt{latex} function.
\begin{verbatim}
latex(out, file = "tab-summary.tex", ctable = TRUE, digits = 1, exclude1 = FALSE, caption = NULL)
\end{verbatim}
Here is the final output
\input{tab-summary.tex}
\subsection{The \texttt{plyr} package}
The \texttt{plyr} package\autocite{wickham11} offers a general
solution to those kind of tasks.
% --------------------------------------------------------------- Chapter 3 --
\chapter{Univariate distributions}
\addthumb{Lattice graphics}{\Large\textsf{Lattice}}{white}{seagreen}
\lipsum[1]
\section{Stripchart}
Stripchart aims at showing the distribution of a series of continuous
measurements (much like scatterplot for 2D data discussed in
\S~\ref{sec:scatterplot}). They are useful for small to moderate
dataset. With large $N$ it is proably better to switch to alternative
displays, see next sections.
\begin{figure}[H]
\begin{lstlisting}
x <- sample(1:30, 25, replace=TRUE)
stripplot(~ x, jitter.data=TRUE, factor=.8, aspect=.5)
\end{lstlisting}
\fcapside[\FBwidth] {\img{figs_lattice-crop}}
{\caption*{With this synthetic dataset where several observations
can take the same value, jittering point locations on the
horizontal and vertical axes ensures a better representation.}}
\end{figure}
\begin{figure}[H]
\begin{lstlisting}
stripplot(~ x, jitter.data=TRUE, factor=.8, aspect="xy")
\end{lstlisting}
\fcapside[\FBwidth] {\img{figs_lattice-crop}}
{\caption*{A better way of flattening the display is to use an
``\texttt{xy}'' aspect.}}
\end{figure}
\begin{figure}[H]
\begin{lstlisting}
stripplot(x ~ 1, horizontal=FALSE, jitter.data=TRUE, aspect=1.2,
scales=list(x=list(draw=F)), xlab="", pch=15, alpha=.5)
\end{lstlisting}
\fcapside[\FBwidth] {\img{figs_lattice-crop}}
{\caption*{This is basically the same picture but the $x$ and $y$
axis have been transposed. We used a different symbol and
transparency to highlight where replication occurs in the
data. Obviously, that won't work so nicely with larger sample
size or a higher density of replication.}}
\end{figure}
\begin{figure}[H]
\begin{lstlisting}
stripplot(~ x, panel=HH::panel.dotplot.tb, cex=1.2, factor=.2)
\end{lstlisting}
\fcapside[\FBwidth] {\img{figs_lattice-crop}} {\caption*{In contrast
to the base \texttt{stripchart} function, there is no way of
imposing a stacked display in \texttt{lattice}. However, there
is some convenient panel function in the \texttt{HH} package.}}
\end{figure}
\begin{figure}[H]
\begin{lstlisting}
x <- sample(seq(1, 60, by=2), 75, replace=TRUE)
stripplot(1 ~ x, panel=panel.sunflowerplot, col="black",
seg.col="black", seg.lwd=1, size=.08)
\end{lstlisting}
\fcapside[\FBwidth] {\img{figs_lattice-crop}} {\caption*{With
possible replicates, it is also interesting to use
``sunflowers'' where multiple leaves are used for each
duplicate. The custom panel function mimics the base
\texttt{sunflowerplot} function. For an alternative way of
embedding ``sunflowers'' into a \texttt{lattice} display, see
the following thread on R-help: \url{http://bit.ly/Ig4RTq}. Note
that we remove $y$-axis annotation using commands presented
before (i.e., using \texttt{scales=}).}}
\end{figure}
\section{Histograms}
\lipsum[1]
\begin{figure}[H]
\begin{lstlisting}
histogram(~ waiting, data=faithful)
\end{lstlisting}
\fcapside[\FBwidth] {\img{figs_lattice-crop}}
{\caption*{\marginnote{\textbf{faithful}. Waiting time between
eruptions and the duration of the eruption for the Old
Faithful geyser in Yellowstone National Park, Wyoming, USA.}A
simple histogram of waiting time expressed as density. Note that
forgetting the \mytilde operator will raise an error message.}}
\end{figure}
Box~\ref{box:histogram} shows some custom settings with the
\texttt{faithful} dataset.
\lipsum[1]
\begin{figure*}[h]\fboxsep-.4pt
\ffigbox{}{\CommonHeightRow{\begin{subfloatrow}[4]
\ffigbox[\FBwidth]
{\simg{figs_lattice-crop}}{\caption*{\texttt{type="count"}}}
\ffigbox[\FBwidth]
{\simg{figs_lattice-crop}}{\caption*{\texttt{col="steelblue"}}}
\ffigbox[\FBwidth]
{\simg{figs_lattice-crop}}{\caption*{\texttt{border=NA,
nint=12}}}
\ffigbox[\FBwidth]
{\simg{figs_lattice-crop}}{\caption*{\texttt{nint=nrow(dataset)}}}
\end{subfloatrow}}
{\caption{Common options for \texttt{histogram} include displaying
counts instead of percents, or varying bar color. It is also
possible to change default bin size. When
\texttt{nint=nrow(dataset)}, we have a so-called high-density
vertical lines, much like when using \texttt{plot(...,
type="h")}.}\label{box:histogram}}}
\end{figure*}
\begin{figure}[H]
\begin{lstlisting}
p <- histogram(~ waiting, data=faithful, lwd=5, type="percent",
border="white")
x.breaks <- p$panel.args.common$breaks
y.values <- table(cut(faithful$waiting, breaks=x.breaks))
update(p, panel=function(...) {
panel.histogram(...)
panel.text(x.breaks+diff(x.breaks)[1]/2,
y.values/nrow(faithful)*100,
as.character(y.values), cex=.8, adj=c(.5,0))
})
\end{lstlisting}
\fcapside[\FBwidth] {\img{figs_lattice-crop}}
{\caption*{The following example demonstrates how a default
histogram displaying percent data can be annotated with counts
data. This is intended to show how we can steal away default
setting to display the distribution of discrete values.)}}
\end{figure}
\begin{figure}[H]
\begin{lstlisting}
x <- rnorm(80, mean=12, sd=2)
histogram(~ x, type="density", border=NA,
panel=function(x, ...) {
panel.histogram(x, ...)
panel.mathdensity(dmath=dnorm, col="#BF3030",
args=list(mean=mean(x),sd=sd(x)))
})
\end{lstlisting}
\fcapside[\FBwidth] {\img{figs_lattice-crop}}
{\caption*{An example where we superimposed a normal density with
parameters estimated from the sample.)}}
\end{figure}
\section{Density plots}
\lipsum[1]\autocite{silverman86}.
\begin{figure}[H]
\begin{lstlisting}
densityplot(~ waiting, data=faithful)
\end{lstlisting}
\fcapside[\FBwidth] {\img{figs_lattice-crop}} {\caption*{The default
panel relies on \R's \texttt{density} function. As such, the
default kernel is \texttt{gaussian} with $n=512$ equally spaced
at which the density is estimated. The choice of the bandwith
follows Silverman's rule of thumb, namely
$\min(0.9\textrm{SD},\textrm{IQR}/1.34n)$. An alternative
bandwith can be selected using \texttt{bw="SJ"}.}}
\end{figure}
\begin{figure}[H]