-
Notifications
You must be signed in to change notification settings - Fork 1
/
data.R
294 lines (281 loc) · 9.86 KB
/
data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#' Acetylene Data
#'
#' The data consist of measures of \code{yield} of a chemical manufacturing
#' process for acetylene in relation to numeric parameters.
#'
#' Marquardt and Snee (1975) used these data to illustrate ridge regression in
#' a model containing quadratic and interaction terms, particularly the need to
#' center and standardize variables appearing in high-order terms.
#'
#' Typical models for these data include the \code{temp:ratio} interaction
#' and a squared term in \code{temp}.
#'
#' @name Acetylene
#' @docType data
#' @format A data frame with 16 observations on the following 4 variables.
#' \describe{
#' \item{\code{yield}}{conversion percentage yield of acetylene}
#' \item{\code{temp}}{reactor temperature (Celsius)}
#' \item{\code{ratio}}{H2 to n-heptane ratio}
#' \item{\code{time}}{contact time (sec)}
#' }
#' @references
#' Marquardt, D.W., and Snee, R.D. (1975), "Ridge Regression in
#' Practice," \emph{The American Statistician}, \bold{29}, 3-20.
#'
#' Marquardt, D.W. (1980), "A Critique of Some Ridge Regression Methods:
#' Comment," \emph{Journal of the American Statistical Association},
#' \bold{75}(369), 87-91.
#'
#' @source SAS documentation example for \code{PROC REG}, \emph{Ridge
#' Regression for Acetylene Data}.
#' @keywords datasets
#' @examples
#'
#' data(Acetylene)
#'
#' # naive model, not using centering
#' amod0 <- lm(yield ~ temp + ratio + time + I(time^2) + temp:time, data=Acetylene)
#'
#' y <- Acetylene[,"yield"]
#' X0 <- model.matrix(amod0)[,-1]
#'
#' lambda <- c(0, 0.0005, 0.001, 0.002, 0.005, 0.01)
#' aridge0 <- ridge(y, X0, lambda=lambda)
#'
#' traceplot(aridge0)
#' traceplot(aridge0, X="df")
#' pairs(aridge0, radius=0.2)
#'
#'
#'
NULL
#' Detroit Homicide Data for 1961-1973
#'
#' @description
#' The data set \code{Detroit} was used extensively in the book by Miller
#' (2002) on subset regression. The data are unusual in that a subset of three
#' predictors can be found which gives a very much better fit to the data than
#' the subsets found from the Efroymson stepwise algorithm, or from forward
#' selection or backward elimination. They are also unusual in that, as time
#' series data, the assumption of independence is patently violated, and the
#' data suffer from problems of high collinearity.
#'
#' As well, ridge regression reveals somewhat paradoxical paths of shrinkage in
#' univariate ridge trace plots, that are more comprehensible in multivariate
#' views.
#'
#' @details
#' The data were originally collected and discussed by Fisher (1976) but the
#' complete dataset first appeared in Gunst and Mason (1980, Appendix A).
#' Miller (2002) discusses this dataset throughout his book, but doesn't state
#' clearly which variables he used as predictors and which is the dependent
#' variable. (\code{Homicide} was the dependent variable, and the predictors
#' were \code{Police} \dots{} \code{WkEarn}.) The data were obtained from
#' StatLib.
#'
#' A similar version of this data set, with different variable names, appears
#' in the \code{bestglm} package.
#'
#' @name Detroit
#' @docType data
#' @format A data frame with 13 observations on the following 14 variables.
#' \describe{
#' \item{\code{Police}}{Full-time police per 100,000 population}
#' \item{\code{Unemp}}{Percent unemployed in the population}
#' \item{\code{MfgWrk}}{Number of manufacturing workers in thousands}
#' \item{\code{GunLic}}{Number of handgun licences per 100,000 population}
#' \item{\code{GunReg}}{Number of handgun registrations per 100,000 population}
#' \item{\code{HClear}}{Percent of homicides cleared by arrests}
#' \item{\code{WhMale}}{Number of white males in the population}
#' \item{\code{NmfgWrk}}{Number of non-manufacturing workers in thousands}
#' \item{\code{GovWrk}}{Number of government workers in thousands}
#' \item{\code{HrEarn}}{Average hourly earnings}
#' \item{\code{WkEarn}}{Average weekly earnings}
#' \item{\code{Accident}}{Death rate in accidents per 100,000 population}
#' \item{\code{Assaults}}{Number of assaults per 100,000 population}
#' \item{\code{Homicide}}{Number of homicides per 100,000 of population}
#' }
#'
#' @references
#' Fisher, J.C. (1976). Homicide in Detroit: The Role of Firearms.
#' \emph{Criminology}, \bold{14}, 387--400.
#'
#' Gunst, R.F. and Mason, R.L. (1980). \emph{Regression analysis and its
#' application: A data-oriented approach}. Marcel Dekker.
#'
#' Miller, A. J. (2002). \emph{Subset Selection in Regression}. 2nd Ed. Chapman
#' & Hall/CRC. Boca Raton.
#'
#' @source \url{http://lib.stat.cmu.edu/datasets/detroit}
#' @keywords datasets
#' @examples
#'
#' data(Detroit)
#'
#' # Work with a subset of predictors, from Miller (2002, Table 3.14),
#' # the "best" 6 variable model
#' # Variables: Police, Unemp, GunLic, HClear, WhMale, WkEarn
#' # Scale these for comparison with other methods
#'
#' Det <- as.data.frame(scale(Detroit[,c(1,2,4,6,7,11)]))
#' Det <- cbind(Det, Homicide=Detroit[,"Homicide"])
#'
#' # use the formula interface; specify ridge constants in terms
#' # of equivalent degrees of freedom
#' dridge <- ridge(Homicide ~ ., data=Det, df=seq(6,4,-.5))
#'
#' # univariate trace plots are seemingly paradoxical in that
#' # some coefficients "shrink" *away* from 0
#' traceplot(dridge, X="df")
#' vif(dridge)
#' pairs(dridge, radius=0.5)
#'
#' \donttest{
#' plot3d(dridge, radius=0.5, labels=dridge$df)
#'
#' # transform to PCA/SVD space
#' dpridge <- pca(dridge)
#'
#' # not so paradoxical in PCA space
#' traceplot(dpridge, X="df")
#' biplot(dpridge, radius=0.5, labels=dpridge$df)
#'
#' # show PCA vectors in variable space
#' biplot(dridge, radius=0.5, labels=dridge$df)
#' }
#'
#'
NULL
#' Hospital manpower data
#'
#' @description
#' The hospital manpower data, taken from Myers (1990), table 3.8, are a
#' well-known example of highly collinear data to which ridge regression and
#' various shrinkage and selection methods are often applied.
#'
#' The data consist of measures taken at 17 U.S. Naval Hospitals and the goal
#' is to predict the required monthly man hours for staffing purposes.
#'
#' @details
#' Myers (1990) indicates his source was "Procedures and Analysis for Staffing
#' Standards Development: Data/Regression Analysis Handbook", Navy Manpower and
#' Material Analysis Center, San Diego, 1979.
#'
#' @name Manpower
#' @docType data
#' @format A data frame with 17 observations on the following 6 variables.
#' \describe{
#' \item{\code{Hours}}{monthly man hours (response variable)}
#' \item{\code{Load}}{average daily patient load}
#' \item{\code{Xray}}{monthly X-ray exposures}
#' \item{\code{BedDays}}{monthly occupied bed days}
#' \item{\code{AreaPop}}{eligible population in the area in thousands}
#' \item{\code{Stay}}{average length of patient's stay in days}
#' }
#' @seealso \code{\link[bestglm]{manpower}} for the same data, and other
#' analyses
#'
#' @references
#' Donald R. Jensen and Donald E. Ramirez (2012). Variations on
#' Ridge Traces in Regression, \emph{Communications in Statistics - Simulation
#' and Computation}, 41 (2), 265-278.
#'
#' @source
#' Raymond H. Myers (1990). \emph{Classical and Modern Regression with
#' Applications}, 2nd ed., PWS-Kent, pp. 130-133.
#'
#' @keywords datasets
#' @examples
#'
#' data(Manpower)
#' mmod <- lm(Hours ~ ., data=Manpower)
#' vif(mmod)
#' # ridge regression models, specified in terms of equivalent df
#' mridge <- ridge(Hours ~ ., data=Manpower, df=seq(5, 3.75, -.25))
#' vif(mridge)
#'
#' # univariate ridge trace plots
#' traceplot(mridge)
#' traceplot(mridge, X="df")
#'
#' # bivariate ridge trace plots
#' plot(mridge, radius=0.25, labels=mridge$df)
#' pairs(mridge, radius=0.25)
#'
#' \donttest{
#' # 3D views
#' # ellipsoids for Load, Xray & BedDays are nearly 2D
#' plot3d(mridge, radius=0.2, labels=mridge$df)
#' # variables in model selected by AIC & BIC
#' plot3d(mridge, variables=c(2,3,5), radius=0.2, labels=mridge$df)
#'
#' # plots in PCA/SVD space
#' mpridge <- pca(mridge)
#' traceplot(mpridge, X="df")
#' biplot(mpridge, radius=0.25)
#' }
#'
#'
NULL
#' Prostate Cancer Data
#'
#' @description
#' Data to examine the correlation between the level of prostate-specific
#' antigen and a number of clinical measures in men who were about to receive a
#' radical prostatectomy.
#'
#' @details
#' This data set came originally from the (now defunct) ElemStatLearn package,
#' which accompanied the book \emph{The Elements of Statistical Learning} by
#' Hastie, Tibshirani and Friedman.
#'
#' The last column indicates which 67 observations were used as the "training
#' set" and which 30 as the test set, as described on page 48 in the book.
#'
#' @name prostate
#' @docType data
#' @format A data frame with 97 observations on the following 10 variables.
#' \describe{
#' \item{lcavol}{log cancer volume}
#' \item{lweight}{log prostate weight}
#' \item{age}{in years}
#' \item{lbph}{log of the amount of benign prostatic hyperplasia}
#' \item{svi}{seminal vesicle invasion}
#' \item{lcp}{log of capsular penetration}
#' \item{gleason}{Gleason score}
#' \item{pgg45}{percent of Gleason score 4 or 5}
#' \item{lpsa}{log of prostate-specific antigen (the response)}
#' \item{train}{logical; \code{TRUE} for observations in the training set}
#' }
#' @note There was an error in this dataset in earlier versions of the
#' ElemStatLearn package, as indicated in a footnote on page 3 of the second
#' edition of \emph{The Elements of Statistical Learning}. This was corrected
#' as of ElemStatLearn version 2012.04-0.
#'
#' @source
#' Stamey, T., Kabalin, J., McNeal, J., Johnstone, I., Freiha, F.,
#' Redwine, E. and Yang, N. (1989). Prostate specific antigen in the diagnosis
#' and treatment of adenocarcinoma of the prostate. II. Radical prostatectomy
#' treated patients, \emph{Journal of Urology}, \bold{141}(5), 1076--1083.
#' @keywords datasets
#' @examples
#'
#' data(prostate)
#' str( prostate )
#' cor( prostate[,1:8] )
#' prostate <- prostate[, -10]
#'
#' prostate.mod <- lm(lpsa ~ ., data=prostate)
#' vif(prostate.mod)
#'
#' py <- prostate[, "lpsa"]
#' pX <- data.matrix(prostate[, 1:8])
#' pridge <- ridge(py, pX, df=8:1)
#' pridge
#'
#' plot(pridge)
#' pairs(pridge)
#' traceplot(pridge)
#' traceplot(pridge, X="df")
#'
#'
#'
NULL