/
mgcv.r
executable file
·4853 lines (4361 loc) · 201 KB
/
mgcv.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
## R routines for the package mgcv (c) Simon Wood 2000-2016
## With contributions from Henric Nilsson
Rrank <- function(R,tol=.Machine$double.eps^.9) {
## Estimate the rank of the upper triangular matrix R (assumed pivoted),
## by computing the condition number of the leading rank-by-rank block
## and shrinking the trial rank until that condition number is acceptable.
  m <- nrow(R)
  r <- min(m,ncol(R)) ## start from maximum possible rank
  repeat {
    ## condition number of leading r by r block, via compiled code...
    cond.est <- .C(C_R_cond,R=as.double(R),r=as.integer(m),c=as.integer(r),
                   work=as.double(rep(0,4*m)),Rcond=as.double(1))$Rcond
    if (cond.est*tol<1) break ## block acceptably well conditioned at rank r
    r <- r - 1 ## otherwise reduce trial rank and try again
  }
  r
} ## Rrank
slanczos <- function(A,k=10,kl=-1,tol=.Machine$double.eps^.5,nt=1) {
## Truncated eigen decomposition of symmetric A by Lanczos iteration,
## using nt threads. If kl < 0 then the k largest magnitude eigenvalues
## are returned, otherwise the k highest and kl lowest.
## Eigenvectors are always returned too.
## Example check:
## set.seed(1);n <- 1000;A <- matrix(runif(n*n),n,n);A <- A+t(A);er <- slanczos(A,10)
## um <- eigen(A,symmetric=TRUE);ind <- c(1:5,(n-5+1):n)
## range(er$values-um$values[ind]);range(abs(er$vectors)-abs(um$vectors[,ind]))
## Note: once k (or k+kl) exceeds roughly 10-15% of n you might as well use
## eigen(A,symmetric=TRUE) -- the extra cost here is the expensive
## accumulation of eigenvectors. Should re-write the whole thing using
## LAPACK routines for eigenvectors.
  if (tol<=0||tol>.01) stop("silly tolerance supplied")
  k <- round(k); kl <- round(kl)
  if (k<0) stop("argument k must be positive.")
  nev <- k + max(0,kl) ## total number of eigen-pairs requested
  n <- nrow(A)
  ## trivial request -- return empty decomposition...
  if (nev<1) return(list(values=rep(0,0),vectors=matrix(0,n,0),iter=0))
  if (n != ncol(A)) stop("A not square")
  if (nev>n) stop("Can not have more eigenvalues than nrow(A)")
  ## all the real work happens in compiled code...
  res <- .C(C_Rlanczos,A=as.double(A),U=as.double(rep(0,n*nev)),
            D=as.double(rep(0,nev)),n=as.integer(n),m=as.integer(k),
            ml=as.integer(kl),tol=as.double(tol),nt=as.integer(nt))
  ## res$n returns the iteration count on exit...
  list(values=res$D,vectors=matrix(res$U,n,nev),iter=res$n)
} ## slanczos
rig <- function(n,mean,scale) {
## Inverse Gaussian deviates generated by algorithm 5.7 of Gentle (2003).
## scale = 1/lambda. On exit E(x) = mean and var(x) = scale*mean^3.
  if (length(n)>1) n <- length(n) ## rnorm-style handling of vector 'n'
  y <- rnorm(n)^2                 ## chi-squared(1) deviates
  x <- y
  mys <- mean*scale*y
  mu <- mean + 0*y                ## recycle 'mean' to a full length vector
  mu2 <- mu^2
  ## Split the root computation at a threshold to control cancellation
  ## error in the tail (large mys)...
  main <- mys < .Machine$double.eps^-.5
  x[main] <- mu[main]*(1 + 0.5*(mys[main] - sqrt(mys[main]*4+mys[main]^2)))
  ## tail term, derived from Taylor expansion of sqrt(1+eps) etc...
  x[!main] <- mu[!main]/mys[!main]
  ## accept x, or reflect to mu^2/x, with the appropriate probability...
  flip <- runif(n) > mean/(mean+x)
  x[flip] <- mu2[flip]/x[flip]
  x
} ## rig
strip.offset <- function(x)
# sole purpose is to take a model frame and rename any "offset(a.name)"
# columns "a.name". Returns the frame with names adjusted; all other
# columns are untouched.
{ na <- names(x)
  ## vectorized test for an "offset(" prefix: unlike the previous
  ## 'for (i in 1:length(na))' loop this is also safe for a zero-column
  ## frame, where 1:0 would iterate and substr(NA,...) break the 'if'
  ind <- substr(na,1,7)=="offset("
  ## strip the leading "offset(" and the trailing ")"...
  na[ind] <- substr(na[ind],8,nchar(na[ind])-1)
  names(x) <- na
  x
} ## strip.offset
pcls <- function(M)
# Function to perform penalized constrained least squares.
# Problem to be solved is:
#
#  minimise      ||W^0.5 (y - Xp)||^2 + p'Bp
#  subject to    Ain p >= b  & C p = "constant"
#
# where B = \sum_{i=1}^m \theta_i S_i and W=diag(w)
# on entry this routine requires a list M, with the following elements:
# M$X   - the design matrix for this problem.
# M$p   - a feasible initial parameter vector - note that this should not be set up to
#         lie exactly on all the inequality constraints - which can easily happen if M$p=0!
# M$y   - response variable
# M$w   - weight vector: W= diag(M$w)
# M$Ain - matrix of inequality constraints
# M$bin - b above
# M$C   - fixed constraint matrix
# M$S   - List of (minimal) penalty matrices
# M$off - used for unpacking M$S
# M$sp  - array of theta_i's
# Ain, bin and p are not in the object needed to call mgcv....
# Returns the fitted coefficient vector p.
{ nar<-c(length(M$y),length(M$p),dim(M$Ain)[1],dim(M$C)[1])
  ## sanity checking ...
  if (nrow(M$X)!=nar[1]) stop("nrow(M$X) != length(M$y)")
  if (ncol(M$X)!=nar[2]) stop("ncol(M$X) != length(M$p)")
  if (length(M$w)!=nar[1]) stop("length(M$w) != length(M$y)")
  if (nar[3]!=length(M$bin)) stop("nrow(M$Ain) != length(M$bin)")
  if (nrow(M$Ain)>0) {
    ## message corrected: the condition tests the COLUMN count of Ain
    ## against length(M$p), not the row count...
    if (ncol(M$Ain)!=nar[2]) stop("ncol(M$Ain) != length(M$p)")
    res <- as.numeric(M$Ain%*%M$p) - as.numeric(M$bin)
    if (sum(res<0)>0) stop("initial parameters not feasible")
    res <- abs(res)
    if (sum(res<.Machine$double.eps^.5)>0)
      warning("initial point very close to some inequality constraints")
    res <- mean(res)
    if (res<.Machine$double.eps^.5)
      warning("initial parameters very close to inequality constraints")
  }
  if (nrow(M$C)>0) if (ncol(M$C)!=nar[2]) stop("ncol(M$C) != length(M$p)")
  if (length(M$S)!=length(M$off)) stop("M$S and M$off have different lengths")
  if (length(M$S)!=length(M$sp)) stop("M$sp has different length to M$S and M$off")

  # pack the S array for mgcv call
  m<-length(M$S)
  Sa<-array(0,0);df<-0
  if (m>0) for (i in 1:m)
  { Sa<-c(Sa,M$S[[i]])
    df[i]<-nrow(M$S[[i]])
    if (M$off[i]+df[i]-1>nar[2]) stop(gettextf("M$S[%d] is too large given M$off[%d]", i, i))
  }
  qra.exist <- FALSE
  if (ncol(M$X)>nrow(M$X)) { ## over-parameterized problem
    if (m>0) stop("Penalized model matrix must have no more columns than rows")
    else { ## absorb M$C constraints into the problem by QR of t(C)
      qra <- qr(t(M$C))
      j <- nrow(M$C);k <- ncol(M$X)
      ## re-express X and Ain in the null space of the constraints...
      M$X <- t(qr.qty(qra,t(M$X))[(j+1):k,])
      M$Ain <- t(qr.qty(qra,t(M$Ain))[(j+1):k,])
      M$C <- matrix(0,0,0)
      M$p <- rep(0,ncol(M$X))
      nar[2] <- length(M$p)
      qra.exist <- TRUE ## must map the solution back at the end
      if (ncol(M$X)>nrow(M$X)) stop("Model matrix not full column rank")
    }
  }
  ## the quadratic programming itself happens in compiled code...
  o<-.C(C_RPCLS,as.double(M$X),as.double(M$p),as.double(M$y),as.double(M$w),as.double(M$Ain),as.double(M$bin)
        ,as.double(M$C),as.double(Sa),as.integer(M$off),as.integer(df),as.double(M$sp),
        as.integer(length(M$off)),as.integer(nar))
  p <- array(o[[2]],length(M$p))
  ## if constraints were absorbed above, transform solution back to the
  ## original parameterization...
  if (qra.exist) p <- qr.qy(qra,c(rep(0,j),p))
  p
} ## pcls
all_vars1 <- function(form) {
## version of all.vars that doesn't split up terms like x$y into x and y;
## e.g. for ~x$y this returns "x$y" where all.vars would give c("x","y").
## Works by scanning the all.names output of the formula for '$' tokens
## and re-assembling the component names that follow each one.
  vars <- all.vars(form)
  vn <- all.names(form)
  vn <- vn[vn%in%c(vars,"$","[[")] ## actual variable related names
  if ("[["%in%vn) stop("can't handle [[ in formula")
  ii <- which(vn%in%"$") ## index of '$'
  if (length(ii)) { ## assemble variable names
    vn1 <- if (ii[1]>1) vn[1:(ii[1]-1)] ## plain names preceding the first '$'
    go <- TRUE
    k <- 1
    while (go) {
      n <- 2; ## number of name components in the current '$' expression
      ## consecutive '$' tokens signal deeper nesting, e.g. x$y$z...
      while(k<length(ii) && ii[k]==ii[k+1]-1) { k <- k + 1;n <- n + 1 }
      ## glue the n component names back together with '$'...
      vn1 <- c(vn1,paste(vn[ii[k]+1:n],collapse="$"))
      if (k==length(ii)) { ## final '$' expression dealt with
        go <- FALSE
        ## any remaining plain names after the last component...
        ind <- if (ii[k]+n<length(vn)) (ii[k]+n+1):length(vn) else rep(0,0)
      } else { ## plain names between this '$' expression and the next
        k <- k + 1
        ind <- if (ii[k-1]+n<ii[k]-1) (ii[k-1]+n+1):(ii[k]-1) else rep(0,0)
      }
      vn1 <- c(vn1,vn[ind])
    }
  } else vn1 <- vn ## no '$' present -- nothing to re-assemble
  vn1
} ## all_vars1
interpret.gam0 <- function(gf,textra=NULL,extra.special=NULL)
# interprets a gam formula of the generic form:
#   y~x0+x1+x3*x4 + s(x5)+ s(x6,x7) ....
# and returns:
# 1. a model formula for the parametric part: pf (and pfok indicating whether it has terms)
# 2. a list of descriptors for the smooths: smooth.spec
# This function does the work, and is called by interpret.gam.
# 'textra' is optional text to add to term labels
# 'extra.special' is label of extra smooth within formula.
{ p.env <- environment(gf) # environment of formula
  tf <- terms.formula(gf,specials=c("s","te","ti","t2",extra.special)) # specials attribute indicates which terms are smooth
  terms <- attr(tf,"term.labels") # labels of the model terms
  nt <- length(terms) # how many terms?
  if (attr(tf,"response") > 0) {  # start the replacement formulae
    response <- as.character(attr(tf,"variables")[2])
  } else {
    response <- NULL
  }
  sp <- attr(tf,"specials")$s     # array of indices of smooth terms
  tp <- attr(tf,"specials")$te    # indices of tensor product terms
  tip <- attr(tf,"specials")$ti   # indices of tensor product pure interaction terms
  t2p <- attr(tf,"specials")$t2   # indices of type 2 tensor product terms
  zp <- if (is.null(extra.special)) NULL else attr(tf,"specials")[[extra.special]]
  off <- attr(tf,"offset") # location of offset in formula
  ## have to translate sp, tp, tip, t2p (zp) so that they relate to terms,
  ## rather than elements of the formula...
  vtab <- attr(tf,"factors") # cross tabulation of vars to terms
  if (length(sp)>0) for (i in 1:length(sp)) {
    ind <- (1:nt)[as.logical(vtab[sp[i],])]
    sp[i] <- ind # the term that smooth relates to
  }
  if (length(tp)>0) for (i in 1:length(tp)) {
    ind <- (1:nt)[as.logical(vtab[tp[i],])]
    tp[i] <- ind # the term that smooth relates to
  }
  if (length(tip)>0) for (i in 1:length(tip)) {
    ind <- (1:nt)[as.logical(vtab[tip[i],])]
    tip[i] <- ind # the term that smooth relates to
  }
  if (length(t2p)>0) for (i in 1:length(t2p)) {
    ind <- (1:nt)[as.logical(vtab[t2p[i],])]
    t2p[i] <- ind # the term that smooth relates to
  }
  if (length(zp)>0) for (i in 1:length(zp)) {
    ind <- (1:nt)[as.logical(vtab[zp[i],])]
    zp[i] <- ind # the term that smooth relates to
  } ## re-referencing is complete
  k <- kt <- kti <- kt2 <- ks <- kz <- kp <- 1 # counters for terms in the 2 formulae
  len.sp <- length(sp)
  len.tp <- length(tp)
  len.tip <- length(tip)
  len.t2p <- length(t2p)
  len.zp <- length(zp)
  ns <- len.sp + len.tp + len.tip + len.t2p + len.zp# number of smooths
  pav <- av <- rep("",0)
  smooth.spec <- list()
  #mgcvat <- "package:mgcv" %in% search() ## is mgcv in search path?
  mgcvns <- loadNamespace('mgcv')
  if (nt) for (i in 1:nt) { # work through all terms
    ## is term i the next smooth of any of the five special types?...
    if (k <= ns&&((ks<=len.sp&&sp[ks]==i)||(kt<=len.tp&&tp[kt]==i)||(kz<=len.zp&&zp[kz]==i)||
        (kti<=len.tip&&tip[kti]==i)||(kt2<=len.t2p&&t2p[kt2]==i))) { # it's a smooth
      ## have to evaluate in the environment of the formula or you can't find variables
      ## supplied as smooth arguments, e.g. k <- 5;gam(y~s(x,k=k)), fails,
      ## but if you don't specify namespace of mgcv then stuff like
      ## loadNamespace('mgcv'); k <- 10; mgcv::interpret.gam(y~s(x,k=k)) fails (can't find s)
      ## eval(parse(text=terms[i]),envir=p.env,enclos=loadNamespace('mgcv')) fails??
      ## following may supply namespace of mgcv explicitly if not on search path...
      ## If 's' etc are masked then we can fail even if mgcv on search path, hence paste
      ## of explicit mgcv reference into first attempt...
      st <- try(eval(parse(text=paste("mgcv::",terms[i],sep="")),envir=p.env),silent=TRUE)
      if (inherits(st,"try-error")) st <-
        eval(parse(text=terms[i]),enclos=p.env,envir=mgcvns)
      if (!is.null(textra)) { ## modify the labels on smooths with textra
        ## NOTE(review): 'st$lab' relies on R's partial matching to reach
        ## 'st$label' -- appears intentional but worth confirming...
        pos <- regexpr("(",st$lab,fixed=TRUE)[1]
        st$label <- paste(substr(st$label,start=1,stop=pos-1),textra,
                          substr(st$label,start=pos,stop=nchar(st$label)),sep="")
      }
      smooth.spec[[k]] <- smooth.info(st) ## smooth.info supplies any extra specification info for class
      ## advance whichever special-type counter matched this term...
      if (ks<=len.sp&&sp[ks]==i) ks <- ks + 1 else # counts s() terms
      if (kt<=len.tp&&tp[kt]==i) kt <- kt + 1 else # counts te() terms
      if (kti<=len.tip&&tip[kti]==i) kti <- kti + 1 else # counts ti() terms
      if (kt2<=len.t2p&&t2p[kt2]==i) kt2 <- kt2 + 1 # counts t2() terms
      else kz <- kz + 1 # counts extra.special terms
      k <- k + 1      # counts smooth terms
    } else {          # parametric
      av[kp] <- terms[i] ## element kp on rhs of parametric
      kp <- kp+1    # counts parametric terms
    }
  }
  if (!is.null(off)) { ## deal with offset
    av[kp] <- as.character(attr(tf,"variables")[1+off])
    kp <- kp+1
  }
  pf <- paste(response,"~",paste(av,collapse=" + ")) ## parametric formula text
  if (attr(tf,"intercept")==0) { ## formula drops the intercept
    pf <- paste(pf,"-1",sep="")
    if (kp>1) pfok <- 1 else pfok <- 0
  } else {
    pfok <- 1;if (kp==1) {
      pf <- paste(pf,"1");
    }
  }
  ## fake.formula contains all terms and all smooth arguments as bare
  ## variables -- used for model frame construction...
  fake.formula <- pf
  if (length(smooth.spec)>0)
  for (i in 1:length(smooth.spec)) {
    nt <- length(smooth.spec[[i]]$term)
    ff1 <- paste(smooth.spec[[i]]$term[1:nt],collapse="+")
    fake.formula <- paste(fake.formula,"+",ff1)
    if (smooth.spec[[i]]$by!="NA") { ## by variables must be collected too
      fake.formula <- paste(fake.formula,"+",smooth.spec[[i]]$by)
      av <- c(av,smooth.spec[[i]]$term,smooth.spec[[i]]$by)
    } else av <- c(av,smooth.spec[[i]]$term)
  }
  fake.formula <- as.formula(fake.formula,p.env)
  if (length(av)) {
    pred.formula <- as.formula(paste("~",paste(av,collapse="+")))
    pav <- all.vars(pred.formula) ## trick to strip out 'offset(x)' etc...
    pred.formula <- reformulate(pav,env=p.env)
  } else pred.formula <- ~1
  ret <- list(pf=as.formula(pf,p.env),pfok=pfok,smooth.spec=smooth.spec,
              fake.formula=fake.formula,response=response,fake.names=av,
              pred.names=pav,pred.formula=pred.formula)
  #environment(ret$fake.formula) <- environment(ret$pred.formula) <- p.env
  class(ret) <- "split.gam.formula"
  ret
} ## interpret.gam0
interpret.gam <- function(gf,extra.special=NULL) {
## wrapper to allow gf to be a list of formulae or
## a single formula. This facilitates general penalized
## likelihood models in which several linear predictors
## may be involved...
##
## The list syntax is as follows. The first formula must have a response on
## the lhs, rather than labels. For m linear predictors, there
## must be m 'base formulae' in linear predictor order. lhs labels will
## be ignored in a base formula. Empty base formulae have '-1' on rhs.
## Further formulae have labels up to m labels 1,...,m on the lhs, in a
## syntax like this: 3 + 5 ~ s(x), which indicates that the same s(x)
## should be added to both linear predictors 3 and 5.
## e.g. A bivariate normal model with common expected values might be
## list(y1~-1,y2~-1,1+2~s(x)), whereas if the second component was contaminated
## by something else we might have list(y1~-1,y2~s(v)-1,1+2~s(x))
##
## For a list argument, this routine returns a list of split.formula objects
## with an extra field "lpi" indicating the linear predictors to which each
## contributes...
  if (is.list(gf)) {
    d <- length(gf) ## number of formulae supplied
    p.env <- environment(gf[[1]])
    ## make sure all formulae have a response, to avoid
    ## problems with parametric sub formulae of the form ~1
    #if (length(gf[[1]])<3) stop("first formula must specify a response")
    resp <- gf[[1]][2] ## response of the first (base) formula
    ret <- list()
    pav <- av <- rep("",0)
    nlp <- 0 ## count number of linear predictors (may be different from number of formulae)
    for (i in 1:d) {
      textra <- if (i==1) NULL else paste(".",i-1,sep="") ## modify smooth labels to identify to predictor
      lpi <- getNumericResponse(gf[[i]]) ## get linear predictors to which this applies, if explicit
      if (length(lpi)==1) warning("single linear predictor indices are ignored")
      if (length(lpi)>0) gf[[i]][[2]] <- NULL else { ## delete l.p. labels from formula response
        nlp <- nlp + 1;lpi <- nlp ## this is base formula for l.p. number nlp
      }
      ret[[i]] <- interpret.gam0(gf[[i]],textra,extra.special=extra.special)
      ret[[i]]$lpi <- lpi ## record of the linear predictors to which this applies
      ## make sure all parametric formulae have a response, to avoid
      ## problems with parametric sub formulae of the form ~1
      respi <- rep("",0) ## no extra response terms
      if (length(ret[[i]]$pf)==2) { ## one-sided pf -- splice the base response in
        ret[[i]]$pf[3] <- ret[[i]]$pf[2];ret[[i]]$pf[2] <- resp
        respi <- rep("",0)
      } else if (i>1) respi <- ret[[i]]$response ## extra response terms
      av <- c(av,ret[[i]]$fake.names,respi) ## accumulate all required variable names
      pav <- c(pav,ret[[i]]$pred.names) ## predictors only
    }
    av <- unique(av) ## strip out duplicate variable names
    pav <- unique(pav)
    if (length(av)>0) {
      ## work around - reformulate with response = "log(x)" will treat log(x) as a name,
      ## not the call it should be...
      fff <- formula(paste(ret[[1]]$response,"~ ."))
      ret$fake.formula <- reformulate(av,response=ret[[1]]$response,env=p.env)
      ret$fake.formula[[2]] <- fff[[2]] ## fix messed up response
    } else ret$fake.formula <- ret[[1]]$fake.formula ## create fake formula containing all variables
    ret$pred.formula <- if (length(pav)>0) reformulate(pav) else ~1 ## predictor only formula
    environment(ret$pred.formula) <- p.env
    ret$response <- ret[[1]]$response
    ret$nlp <- nlp ## number of linear predictors
    ## any explicit l.p. labels must refer to existing base formulae...
    for (i in 1:d) if (max(ret[[i]]$lpi)>nlp||min(ret[[i]]$lpi)<1) stop("linear predictor labels out of range")
    class(ret) <- "split.gam.formula"
    return(ret)
  } else interpret.gam0(gf,extra.special=extra.special) ## single formula case
} ## interpret.gam
fixDependence <- function(X1,X2,tol=.Machine$double.eps^.5,rank.def=0,strict=FALSE)
# model matrix X2 may be linearly dependent on X1. This
# routine finds which columns of X2 should be zeroed to
# fix this. If rank.def>0 then it is taken as the known degree
# of dependence of X2 on X1 and tol is ignored.
# Returns the column indices of X2 to drop, or NULL if none need dropping.
{ qr1 <- qr(X1,LAPACK=TRUE)
  R11 <- abs(qr.R(qr1)[1,1]) ## largest magnitude R diagonal -- used as scale reference
  r<-ncol(X1);n<-nrow(X1)
  if (strict) { ## only delete columns of X2 individually dependent on X1
    ## Project columns of X2 into space of X1 and look at difference
    ## to original X2 to check for deficiency...
    QtX2 <- qr.qty(qr1,X2)
    QtX2[-(1:r),] <- 0 ## zero components outside the span of X1
    mdiff <- colMeans(abs(X2 - qr.qy(qr1,QtX2))) ## per-column projection residual
    if (rank.def>0) ind <- (1:ncol(X2))[rank(mdiff) <= rank.def] else
    ind <- (1:ncol(X2))[mdiff < R11*tol] ## columns (near) fully within span of X1
    if (length(ind)<1) ind <- NULL
  } else { ## make X2 full rank given X1
    QtX2 <- qr.qty(qr1,X2)[(r+1):n,] # Q'X2
    qr2 <- qr(QtX2,LAPACK=TRUE)
    R <- qr.R(qr2)
    # now final diagonal block of R may be zero, indicating rank
    # deficiency.
    r0 <- r <- nrow(R)
    if (rank.def > 0 && rank.def <= nrow(R)) r0 <- r - rank.def else ## degree of rank def known
    while (r0>0 && mean(abs(R[r0:r,r0:r]))< R11*tol) r0 <- r0 -1 ## compute rank def
    r0 <- r0 + 1 ## first rank-deficient position in pivoted order
    if (r0>r) return(NULL) else
    ind <- qr2$pivot[r0:r] # the columns of X2 to zero in order to get independence
  }
  ind
} ## fixDependence
augment.smX <- function(sm,nobs,np) {
## Augment a smooth's model matrix with a square root penalty matrix, for
## identifiability constraint purposes. 'nobs' is the number of data rows
## in sm$X; 'np' is the total number of penalized parameters, so the
## returned matrix always has nobs+np rows.
  ns <- length(sm$S) ## number of penalty matrices
  p <- ncol(sm$X)
  ## unpenalized smooth: just pad with zero rows...
  if (ns==0) return(rbind(sm$X,matrix(0,np,p)))
  ## scale each penalty to be commensurate with the model matrix,
  ## then sum them into a single total penalty St...
  nz <- colMeans(abs(sm$S[[1]]))!=0 ## columns actually penalized
  sqrmaX <- mean(abs(sm$X[,nz]))^2
  St <- sm$S[[1]]*(sqrmaX/mean(abs(sm$S[[1]][nz,nz])))
  if (ns>1) for (j in 2:ns) {
    nz <- colMeans(abs(sm$S[[j]]))!=0
    St <- St + sm$S[[j]]*(sqrmaX/mean(abs(sm$S[[j]][nz,nz])))
  }
  rS <- mroot(St,rank=ncol(St)) ## square root of total penalty
  Xa <- rbind(sm$X,matrix(0,np,p)) ## create augmented model matrix
  Xa[nobs+sm$p.ind,] <- t(rS) ## splice penalty root into this smooth's rows
  Xa ## scaled augmented model matrix
} ## augment.smX
gam.side <- function(sm,Xp,tol=.Machine$double.eps^.5,with.pen=FALSE)
# works through a list of smooths, sm, aiming to identify nested or partially
# nested terms, and impose identifiability constraints on them.
# Xp is the parametric model matrix. It is needed in order to check whether
# there is a constant (or equivalent) in the model. If there is, then this needs
# to be included when working out side constraints, otherwise dependencies can be
# missed.
# Note that with.pen is quite extreme, since you then pretty much only pick
# up dependencies in the null spaces
# Returns the (possibly modified) smooth list, with constrained model
# matrices, penalties and "del.index" attributes recording dropped columns.
{ if (!with.pen) { ## check that's possible and reset if not!
    with.pen <- nrow(Xp) < ncol(Xp) + sum(unlist(lapply(sm,function(x) ncol(x$X))))
  }
  m <- length(sm)
  if (m==0) return(sm)
  v.names<-array("",0);maxDim<-1
  for (i in 1:m) { ## collect all term names and max smooth `dim'
    vn <- sm[[i]]$term
    ## need to include by variables in names
    if (sm[[i]]$by!="NA") vn <- paste(vn,sm[[i]]$by,sep="")
    ## need to distinguish levels of factor by variables...
    if (!is.null(sm[[i]]$by.level)) vn <- paste(vn,sm[[i]]$by.level,sep="")
    sm[[i]]$vn <- vn ## use this record to identify variables from now
    v.names <- c(v.names,vn)
    if (sm[[i]]$dim > maxDim) maxDim <- sm[[i]]$dim
  }
  lv <- length(v.names)
  v.names <- unique(v.names)
  if (lv == length(v.names)) return(sm) ## no repeats => no nesting
  ## Only get this far if there is nesting.
  ## Need to test for intercept or equivalent in Xp
  intercept <- FALSE
  if (ncol(Xp)) {
    ## first check columns directly...
    if (sum(apply(Xp,2,sd)<.Machine$double.eps^.75)>0) intercept <- TRUE else {
      ## no constant column, so need to check span of Xp...
      f <- rep(1,nrow(Xp))
      ff <- qr.fitted(qr(Xp),f)
      if (max(abs(ff-f))<.Machine$double.eps^.75) intercept <- TRUE
    }
  }
  ## set up per-variable records: for each unique variable name, which
  ## smooths use it and what dimension are they...
  sm.id <- as.list(v.names)
  names(sm.id) <- v.names
  for (i in 1:length(sm.id)) sm.id[[i]]<-array(0,0)
  sm.dim <- sm.id
  for (d in 1:maxDim) {
    for (i in 1:m) {
      if (sm[[i]]$dim==d&&sm[[i]]$side.constrain) for (j in 1:d) { ## work through terms
        term<-sm[[i]]$vn[j]
        a <- sm.id[[term]]
        la <- length(a)+1
        sm.id[[term]][la] <- i ## record smooth i.d. for this variable
        sm.dim[[term]][la] <- d ## ... and smooth dim.
      }
    }
  }
  ## so now each unique variable name has an associated array of
  ## the smooths of which it is an argument, arranged in ascending
  ## order of dimension. Smooths for which side.constrain==FALSE are excluded.
  if (maxDim==1) warning("model has repeated 1-d smooths of same variable.")
  ## Now set things up to enable term specific model matrices to be
  ## augmented with square root penalties, on the fly...
  if (with.pen) {
    k <- 1
    for (i in 1:m) { ## create parameter indices for each term
      k1 <- k + ncol(sm[[i]]$X) - 1
      sm[[i]]$p.ind <- k:k1
      k <- k1 + 1
    }
    np <- k-1 ## number of penalized parameters
  }
  nobs <- nrow(sm[[1]]$X) ## number of observations
  for (d in 1:maxDim) { ## work up through dimensions
    for (i in 1:m) { ## work through smooths
      if (sm[[i]]$dim == d&&sm[[i]]$side.constrain) { ## check for nesting
        ## X1 collects the intercept (if present) plus columns of all
        ## lower-dimensional smooths sharing variables with smooth i...
        if (with.pen) X1 <- matrix(c(rep(1,nobs),rep(0,np)),nobs+np,as.integer(intercept)) else
        X1 <- matrix(1,nobs,as.integer(intercept))
        X1comp <- rep(0,0) ## list of components of X1 to avoid duplication
        for (j in 1:d) { ## work through variables
          b <- sm.id[[sm[[i]]$vn[j]]] # list of smooths dependent on this variable
          k <- (1:length(b))[b==i] ## locate current smooth in list
          if (k>1) for (l in 1:(k-1)) if (!b[l] %in% X1comp) { ## collect X columns
            X1comp <- c(X1comp,b[l]) ## keep track of components to avoid adding same one twice
            if (with.pen) { ## need to use augmented model matrix in testing
              if (is.null(sm[[b[l]]]$Xa)) sm[[b[l]]]$Xa <- augment.smX(sm[[b[l]]],nobs,np)
              X1 <- cbind(X1,sm[[b[l]]]$Xa)
            } else X1 <- cbind(X1,sm[[b[l]]]$X) ## penalties not considered
          }
        } ## Now X1 contains columns for all lower dimensional terms
        if (ncol(X1)==as.integer(intercept)) ind <- NULL else {
          if (with.pen) {
            if (is.null(sm[[i]]$Xa)) sm[[i]]$Xa <- augment.smX(sm[[i]],nobs,np)
            ind <- fixDependence(X1,sm[[i]]$Xa,tol=tol)
          } else ind <- fixDependence(X1,sm[[i]]$X,tol=tol)
        }
        ## ... the columns to zero to ensure independence
        if (!is.null(ind)) {
          sm[[i]]$X <- sm[[i]]$X[,-ind]
          ## work through list of penalty matrices, applying constraints...
          nsmS <- length(sm[[i]]$S)
          if (nsmS>0) for (j in nsmS:1) { ## working down so that dropping is painless
            sm[[i]]$S[[j]] <- sm[[i]]$S[[j]][-ind,-ind]
            if (sum(sm[[i]]$S[[j]]!=0)==0) rank <- 0 else
            rank <- qr(sm[[i]]$S[[j]],tol=tol,LAPACK=FALSE)$rank
            sm[[i]]$rank[j] <- rank ## replace previous rank with new rank
            if (rank == 0) { ## drop the penalty
              sm[[i]]$rank <- sm[[i]]$rank[-j]
              sm[[i]]$S[[j]] <- NULL
              sm[[i]]$S.scale <- sm[[i]]$S.scale[-j]
              if (!is.null(sm[[i]]$L)) sm[[i]]$L <- sm[[i]]$L[-j,,drop=FALSE]
            }
          } ## penalty matrices finished
          ## Now we need to establish null space rank for the term
          mi <- length(sm[[i]]$S)
          if (mi>0) {
            St <- sm[[i]]$S[[1]]/norm(sm[[i]]$S[[1]],type="F")
            if (mi>1) for (j in 1:mi) St <- St +
              sm[[i]]$S[[j]]/norm(sm[[i]]$S[[j]],type="F")
            es <- eigen(St,symmetric=TRUE,only.values=TRUE)
            sm[[i]]$null.space.dim <- sum(es$values<max(es$values)*.Machine$double.eps^.75)
          } ## rank found
          if (!is.null(sm[[i]]$L)) {
            ind <- as.numeric(colSums(sm[[i]]$L!=0))!=0
            sm[[i]]$L <- sm[[i]]$L[,ind,drop=FALSE] ## retain only those sps that influence something!
          }
          sm[[i]]$df <- ncol(sm[[i]]$X)
          attr(sm[[i]],"del.index") <- ind
          ## Now deal with case in which prediction constraints differ from fit constraints
          if (!is.null(sm[[i]]$Xp)) { ## need to get deletion indices under prediction parameterization
            ## Note that: i) it doesn't matter what the identifiability con on X1 is
            ##           ii) the degree of rank deficiency can't be changed by an identifiability con
            if (with.pen) {
              smi <- sm[[i]] ## clone smooth
              smi$X <- smi$Xp ## copy prediction Xp to X slot.
              smi$S <- smi$Sp ## and make sure penalty parameterization matches.
              Xpa <- augment.smX(smi,nobs,np)
              ind <- fixDependence(X1,Xpa,rank.def=length(ind))
            } else ind <- fixDependence(X1,sm[[i]]$Xp,rank.def=length(ind))
            sm[[i]]$Xp <- sm[[i]]$Xp[,-ind,drop=FALSE]
            attr(sm[[i]],"del.index") <- ind ## over-writes original
          }
        }
        sm[[i]]$vn <- NULL ## working field no longer needed
      } ## end if (sm[[i]]$dim == d)
    } ## end i in 1:m loop
  }
  if (with.pen) for (i in 1:m) { ## remove working variables that were added
    sm[[i]]$p.ind <- NULL
    if (!is.null(sm[[i]]$Xa)) sm[[i]]$Xa <- NULL
  }
  sm
} ## gam.side
clone.smooth.spec <- function(specb,spec) {
## produces a version of base smooth.spec, `specb', but with
## the variables relating to `spec'. Used by `gam.setup' in handling
## of linked smooths.
## check dimensions same...
  if (specb$dim!=spec$dim) stop("`id' linked smooths must have same number of arguments")
  ## fields copied for every smooth.spec type...
  specb$term <- spec$term
  specb$label <- spec$label
  specb$by <- spec$by
  if (inherits(specb,c("tensor.smooth.spec","t2.smooth.spec"))) {
    ## `te' or `t2' generated base smooth.spec: margins need cloning too
    if (is.null(spec$margin)) {
      ## sloppy user -- `spec' has no margins, so reconstruct margin info
      ## by doling out spec$term across specb's margins in order...
      pos <- 0
      for (i in seq_along(specb$margin)) {
        nterm <- length(specb$margin[[i]]$term)
        specb$margin[[i]]$term <- spec$term[pos + seq_len(nterm)]
        pos <- pos + nterm
        specb$margin[[i]]$label <- ""
      }
    } else {
      ## second term was at least `te'/`t2', so margin cloning is easy
      for (i in seq_along(specb$margin)) {
        specb$margin[[i]]$term <- spec$margin[[i]]$term
        specb$margin[[i]]$label <- spec$margin[[i]]$label
        specb$margin[[i]]$xt <- spec$margin[[i]]$xt
      }
    }
  } else {
    ## `s' generated case
    specb$xt <- spec$xt ## don't generally know what's in here => don't clone
  }
  specb ## return clone
} ## clone.smooth.spec
parametricPenalty <- function(pterms,assign,paraPen,sp0) {
## routine to process any penalties on the parametric part of the model.
## paraPen is a list whose items have names corresponding to the
## term.labels in pterms. Each list item may have named elements
## L, rank and sp. All other elements should be penalty coefficient matrices.
## 'assign' maps model matrix columns to terms; 'sp0' optionally supplies
## initial smoothing parameters. Returns NULL if no penalties found,
## otherwise a list with S, off, sp, L, rank and full.sp.names.
  S <- list() ## penalty matrix list
  off <- rep(0,0) ## offset array
  rank <- rep(0,0) ## rank array
  sp <- rep(0,0)  ## smoothing param array
  full.sp.names <- rep("",0) ## names for sp's multiplying penalties (not underlying)
  L <- matrix(0,0,0)
  k <- 0 ## counts penalties accumulated so far
  tind <- unique(assign) ## unique term indices
  n.t <- length(tind)
  if (n.t>0) for (j in 1:n.t) if (tind[j]>0) {
    term.label <- attr(pterms[tind[j]],"term.label")
    P <- paraPen[[term.label]] ## get any penalty information for this term
    if (!is.null(P)) { ## then there is information
      ind <- (1:length(assign))[assign==tind[j]] ## index of coefs involved here
      ## pull out the special elements, leaving only penalty matrices in P...
      Li <- P$L;P$L <- NULL
      spi <- P$sp;P$sp <- NULL
      ranki <- P$rank;P$rank <- NULL
      ## remaining terms should be penalty matrices...
      np <- length(P)
      if (!is.null(ranki)&&length(ranki)!=np) stop("`rank' has wrong length in `paraPen'")
      if (np) for (i in 1:np) { ## unpack penalty matrices, offsets and ranks
        k <- k + 1
        S[[k]] <- P[[i]]
        off[k] <- min(ind) ## index of first coef penalized by this term
        if ( ncol(P[[i]])!=nrow(P[[i]])||nrow(P[[i]])!=length(ind)) stop(" a parametric penalty has wrong dimension")
        if (is.null(ranki)) {
          ev <- eigen(S[[k]],symmetric=TRUE,only.values=TRUE)$values
          rank[k] <- sum(ev>max(ev)*.Machine$double.eps*10) ## estimate rank
        } else rank[k] <- ranki[i]
      }
      ## now deal with L matrices
      if (np) { ## only do this stuff if there are any penalties!
        if (is.null(Li)) Li <- diag(np) ## default: one sp per penalty
        if (nrow(Li)!=np) stop("L has wrong dimension in `paraPen'")
        ## grow the overall L block-diagonally...
        L <- rbind(cbind(L,matrix(0,nrow(L),ncol(Li))),
                   cbind(matrix(0,nrow(Li),ncol(L)),Li))
        ind <- (length(sp)+1):(length(sp)+ncol(Li))
        ind2 <- (length(sp)+1):(length(sp)+nrow(Li)) ## used to produce names for full sp array
        if (is.null(spi)) {
          sp[ind] <- -1 ## auto-initialize
        } else {
          if (length(spi)!=ncol(Li)) stop("`sp' dimension wrong in `paraPen'")
          sp[ind] <- spi
        }
        ## add smoothing parameter names....
        if (length(ind)>1) names(sp)[ind] <- paste(term.label,ind-ind[1]+1,sep="")
        else names(sp)[ind] <- term.label
        if (length(ind2)>1) full.sp.names[ind2] <- paste(term.label,ind2-ind2[1]+1,sep="")
        else full.sp.names[ind2] <- term.label
      }
    } ## end !is.null(P)
  } ## looped through all terms
  if (k==0) return(NULL)
  if (!is.null(sp0)) { ## substitute any supplied initial sp values
    if (length(sp0)<length(sp)) stop("`sp' too short")
    sp0 <- sp0[1:length(sp)]
    sp[sp<0] <- sp0[sp<0]
  }
  ## S is list of penalty matrices, off[i] is index of first coefficient penalized by each S[[i]]
  ## sp is array of underlying smoothing parameter (-ve to estimate), L is matrix mapping log
  ## underlying smoothing parameters to log smoothing parameters, rank[i] is the rank of S[[i]].
  list(S=S,off=off,sp=sp,L=L,rank=rank,full.sp.names=full.sp.names)
} ## parametricPenalty
getNumericResponse <- function(form) {
## takes a formula for which the lhs contains numbers, but no variable
## names, and returns an array of those numbers. For example if 'form'
## is '1+26~s(x)', this will return the numeric vector c(1,26).
## A zero length vector is returned if the lhs contains no numbers,
## or contains variable names.
## This is useful for setting up models in which
## multiple linear predictors share terms. The lhs numbers can then
## indicate which linear predictors the rhs will contribute to.
  ## no response at all, or response containing variables => nothing...
  if (length(form)==2||length(all.vars(form[[2]]))>0) return(rep(0,0))
  lhs.txt <- deparse(form[[2]]) ## lhs as text; may span several strings
  digit.runs <- strsplit(lhs.txt, "[^0-9]+") ## keep only runs of digits
  nums <- as.numeric(unlist(digit.runs)) ## "" fragments become NA here
  round(unique(na.omit(nums))) ## drop NAs, de-duplicate, force integer values
} ## getNumericResponse
olid <- function(X,nsdf,pstart,flpi,lpi) {
## X is a model matrix, made up of nf=length(nsdf) column blocks.
## The ith block starts at column pstart[i] and its first nsdf[i]
## columns are unpenalized. X is used to define nlp=length(lpi)
## linear predictors. lpi[[i]] gives the columns of X used in the
## ith linear predictor. flpi[[j]] gives the linear predictor(s)
## to which the jth block of X belongs. The problem is that the
## unpenalized blocks need not be identifiable when used in combination.
## This function returns a vector dind of columns of X to drop for
## identifiability, along with a correspondingly modified lpi list.
  nlp <- length(lpi) ## number of linear predictors
  n <- nrow(X)
  nf <- length(nsdf) ## number of formulae blocks
  Xp <- matrix(0,n*nlp,sum(nsdf))
  start <- 1
  ii <- 1:n
  tind <- rep(0,0) ## complete index of all parametric columns in X
  ## create a block matrix, Xp, with the same identifiability properties as
  ## the unpenalized part of the model...
  for (i in 1:nf) {
    stop <- start - 1 + nsdf[i]
    if (stop>=start) { ## block i has unpenalized columns
      ind <- pstart[i] + 1:nsdf[i] - 1
      for (k in flpi[[i]]) { ## copy the block into the rows of each l.p. it serves
        Xp[ii+(k-1)*n,start:stop] <- X[,ind]
      }
      tind <- c(tind,ind)
      start <- start + nsdf[i]
    }
  }
  ## rank deficiency of Xp will reveal number of redundant parametric
  ## terms, and a pivoted QR will reveal which to drop to restore
  ## full rank...
  qrx <- qr(Xp,LAPACK=TRUE,tol=0.0) ## unidentifiable columns get pivoted to final cols
  r <- Rrank(qr.R(qrx)) ## get rank from R factor of pivoted QR
  if (r==ncol(Xp)) { ## full rank, all fine, drop nothing
    dind <- rep(0,0)
  } else { ## reduced rank, drop some columns
    ## bug fix: index the pivot by ncol(Xp) -- its actual length -- not
    ## ncol(X), which is generally larger and read past the end of
    ## qrx$pivot, producing NAs that only sort() quietly discarded...
    dind <- tind[sort(qrx$pivot[(r+1):ncol(Xp)],decreasing=TRUE)] ## columns to drop
    ## now we need to adjust lpi for the dropped columns
    for (d in dind) { ## working down through drop indices
      ## following commented out code is useful should it ever prove necessary to
      ## adjust pstart and nsdf, but at present these are only used in prediction,
      ## and it is cleaner to leave them unchanged, and simply drop using dind during prediction.
      #k <- if (d>=pstart[nf]) nlp else which(d >= pstart[1:(nf-1)] & d < pstart[2:nf])
      #nsdf[k] <- nsdf[k] - 1 ## one less unpenalized column in this block
      #if (k<nf) pstart[(k+1):nf] <- pstart[(k+1):nf] - 1 ## later block starts move down 1
      for (i in 1:nlp) {
        k <- which(d == lpi[[i]])
        if (length(k)>0) lpi[[i]] <- lpi[[i]][-k] ## drop row
        k <- which(lpi[[i]]>d)
        if (length(k)>0) lpi[[i]][k] <- lpi[[i]][k] - 1 ## close up
      }
    } ## end of drop index loop
  }
  list(dind=dind,lpi=lpi) ##,pstart=pstart,nsdf=nsdf)
} ## olid
gam.setup.list <- function(formula,pterms,
                    data=stop("No data supplied to gam.setup"),knots=NULL,sp=NULL,
                    min.sp=NULL,H=NULL,absorb.cons=TRUE,sparse.cons=0,select=FALSE,idLinksBases=TRUE,
                    scale.penalty=TRUE,paraPen=NULL,gamm.call=FALSE,drop.intercept=NULL,apply.by=TRUE,modCon=0) {
## version of gam.setup for when gam is called with a list of formulae,
## specifying several linear predictors...
## key difference to gam.setup is an attribute to the model matrix, "lpi", which is a list
## of column indices for each linear predictor
## Strategy: call gam.setup on the first formula to create the base G, then
## call it once per remaining formula and append each result (model matrix
## columns, smooths, penalties, offsets, etc.) onto G, shifting all column
## indices by pof (the number of columns accumulated so far).
  if (!is.null(paraPen)) stop("paraPen not supported for multi-formula models")
  if (!absorb.cons) stop("absorb.cons must be TRUE for multi-formula models")
  d <- length(pterms) ## number of formulae
  if (is.null(drop.intercept)) drop.intercept <- rep(FALSE, d)
  if (length(drop.intercept) != d) stop("length(drop.intercept) should be equal to number of model formulas")
  lp.overlap <- if (formula$nlp<d) TRUE else FALSE ## predictors share terms
  ## set up G for the first (base) formula...
  G <- gam.setup(formula[[1]],pterms[[1]],
              data,knots,sp,min.sp,H,absorb.cons,sparse.cons,select,
              idLinksBases,scale.penalty,paraPen,gamm.call,drop.intercept[1],apply.by=apply.by,list.call=TRUE,modCon=mod
Con)
  G$pterms <- pterms
  ## per-formula quantities become lists, one element per formula...
  G$offset <- list(G$offset)
  #G$contrasts <- list(G$contrasts)
  G$xlevels <- list(G$xlevels)
  G$assign <- list(G$assign)
  ## sp and min.sp are supplied as single vectors covering all formulae;
  ## each call to gam.setup consumes a leading chunk, which is stripped
  ## off before the next call...
  used.sp <- length(G$lsp0)
  if (!is.null(sp)&&used.sp>0) sp <- sp[-(1:used.sp)] ## need to strip off already used sp's
  if (!is.null(min.sp)&&nrow(G$L)>0) min.sp <- min.sp[-(1:nrow(G$L))]
  ## formula[[1]] always relates to the base formula of the first linear predictor...
  flpi <- lpi <- list()
  for (i in 1:formula$nlp) lpi[[i]] <- rep(0,0)
  lpi[[1]] <- 1:ncol(G$X) ## lpi[[j]] is index of cols for jth linear predictor
  flpi[[1]] <- formula[[1]]$lpi ## used in identifiability testing by olid, later
  pof <- ncol(G$X) ## counts the model matrix columns produced so far
  pstart <- rep(0,d) ## indexes where parameteric columns start in each formula block of X
  pstart[1] <- 1
  if (d>1) for (i in 2:d) { ## loop over the remaining formulae, appending to G
    if (is.null(formula[[i]]$response)) { ## keep gam.setup happy
      formula[[i]]$response <- formula$response
      mv.response <- FALSE
    } else mv.response <- TRUE ## this formula supplies its own response column
    #spind <- if (is.null(sp)) 1 else (length(G$S)+1):length(sp)
    formula[[i]]$pfok <- 1 ## empty formulae OK here!
    um <- gam.setup(formula[[i]],pterms[[i]],
              data,knots,sp,min.sp,#sp[spind],min.sp[spind],
              H,absorb.cons,sparse.cons,select,
              idLinksBases,scale.penalty,paraPen,gamm.call,drop.intercept[i],apply.by=apply.by,list.call=TRUE,modCon=modCon)
    ## strip off the sp/min.sp entries consumed by this formula...
    used.sp <- length(um$lsp0)
    if (!is.null(sp)&&used.sp>0) sp <- sp[-(1:used.sp)] ## need to strip off already used sp's
    if (!is.null(min.sp)&&nrow(um$L)>0) min.sp <- min.sp[-(1:nrow(um$L))]
    flpi[[i]] <- formula[[i]]$lpi
    for (j in formula[[i]]$lpi) { ## loop through all l.p.s to which this term contributes
      lpi[[j]] <- c(lpi[[j]],pof + 1:ncol(um$X)) ## add these cols to lpi[[j]]
      ##lpi[[i]] <- pof + 1:ncol(um$X) ## old code
    }
    if (mv.response) G$y <- cbind(G$y,um$y)
    if (i>formula$nlp&&!is.null(um$offset)) { ## offsets in shared-term formulae make no sense
      stop("shared offsets not allowed")
    }
    G$offset[[i]] <- um$offset
    #G$contrasts[[i]] <- um$contrasts
    if (!is.null(um$contrasts)) G$contrasts <- c(G$contrasts,um$contrasts)
    G$xlevels[[i]] <- um$xlevels
    G$assign[[i]] <- um$assign
    G$rank <- c(G$rank,um$rank)
    pstart[i] <- pof+1
    G$X <- cbind(G$X,um$X) ## extend model matrix
    ## deal with the smooths: shift their coefficient indices by pof and
    ## append them to G$smooth...
    k <- G$m
    if (um$m) for (j in 1:um$m) {
      um$smooth[[j]]$first.para <- um$smooth[[j]]$first.para + pof
      um$smooth[[j]]$last.para <- um$smooth[[j]]$last.para + pof
      k <- k + 1
      G$smooth[[k]] <- um$smooth[[j]]
    }
    ## L, S and off: penalty matrices are appended, and any sp-mapping
    ## matrix L grows block-diagonally (an identity block is synthesized
    ## for whichever side lacked an L)...
    ks <- length(G$S)
    M <- length(um$S)
    if (!is.null(um$L)||!is.null(G$L)) {
      if (is.null(G$L)) G$L <- diag(1,nrow=ks)
      if (is.null(um$L)) um$L <- diag(1,nrow=M)
      G$L <- rbind(cbind(G$L,matrix(0,nrow(G$L),ncol(um$L))),cbind(matrix(0,nrow(um$L),ncol(G$L)),um$L))
    }
    G$off <- c(G$off,um$off+pof) ## penalty start columns, shifted into place
    if (M) for (j in 1:M) {
      ks <- ks + 1
      G$S[[ks]] <- um$S[[j]]
    }
    G$m <- G$m + um$m ## number of smooths
    ##G$nsdf <- G$nsdf + um$nsdf ## or list??
    G$nsdf[i] <- um$nsdf ## per-formula count of strictly parametric coefs
    ## P maps fit to prediction parameterization; grow it block-diagonally,
    ## again synthesizing identity blocks where needed...
    if (!is.null(um$P)||!is.null(G$P)) {
      if (is.null(G$P)) G$P <- diag(1,nrow=pof)
      k <- ncol(um$X)
      if (is.null(um$P)) um$P <- diag(1,nrow=k)
      G$P <- rbind(cbind(G$P,matrix(0,pof,k)),cbind(matrix(0,k,pof),um$P))
    }
    G$cmX <- c(G$cmX,um$cmX)
    ## disambiguate parametric coef names by suffixing the formula index...
    if (um$nsdf>0) um$term.names[1:um$nsdf] <- paste(um$term.names[1:um$nsdf],i-1,sep=".")
    G$term.names <- c(G$term.names,um$term.names)
    G$lsp0 <- c(G$lsp0,um$lsp0)
    G$sp <- c(G$sp,um$sp)
    pof <- ncol(G$X) ## update the running column count
  } ## formula loop end
  ## If there is overlap then there is a danger of lack of identifiability of the
  ## parameteric terms, especially if there are factors present in shared components.
  ## The following code deals with this possibility...
  if (lp.overlap) {
    rt <- olid(G$X,G$nsdf,pstart,flpi,lpi)
    if (length(rt$dind)>0) { ## then columns have to be dropped
      warning("dropping unidentifiable parametric terms from model",call.=FALSE)
      G$X <- G$X[,-rt$dind] ## drop cols
      G$cmX <- G$cmX[-rt$dind]
      G$term.names <- G$term.names[-rt$dind]
      ## adjust indexing in smooth list, noting that coefs of smooths
      ## are never dropped by dind
      for (i in 1:length(G$smooth)) {
        k <- sum(rt$dind < G$smooth[[i]]$first.para) ## number of dropped cols before this smooth
        G$smooth[[i]]$first.para <- G$smooth[[i]]$first.para - k
        G$smooth[[i]]$last.para <- G$smooth[[i]]$last.para - k
      }
      for (i in 1:length(G$off)) G$off[i] <- G$off[i] - sum(rt$dind < G$off[i])
      ## replace various indices with updated versions...
      # pstart <- rt$pstart; G$nsdf <- rt$nsdf ## these two only needed in predict.gam - cleaner to leave unchanged
      lpi <- rt$lpi
      attr(G$nsdf,"drop.ind") <- rt$dind ## store drop index
    }
  }
  attr(lpi,"overlap") <- lp.overlap
  attr(G$X,"lpi") <- lpi
  attr(G$nsdf,"pstart") <- pstart ##unlist(lapply(lpi,min))
  ## assemble a global indicator array for non-linear parameters, and record
  ## each smooth's range of smoothing parameters (first.sp/last.sp)...
  G$g.index <- rep(FALSE,ncol(G$X))
  n.sp0 <- 0 ## running count of smoothing parameters seen so far
  if (length(G$smooth)) for (i in 1:length(G$smooth)) {
    if (!is.null(G$smooth[[i]]$g.index)) G$g.index[G$smooth[[i]]$first.para:G$smooth[[i]]$last.para] <- G$smooth[[i]]$g.index
    n.sp <- length(G$smooth[[i]]$S)
    if (n.sp) {
      G$smooth[[i]]$first.sp <- n.sp0 + 1
      n.sp0 <- G$smooth[[i]]$last.sp <- n.sp0 + n.sp
    }
  }
  if (!any(G$g.index)) G$g.index <- NULL ## no non-linear parameters present
  G
} ## gam.setup.list
gam.setup <- function(formula,pterms,
data=stop("No data supplied to gam.setup"),knots=NULL,sp=NULL,
min.sp=NULL,H=NULL,absorb.cons=TRUE,sparse.cons=0,select=FALSE,idLinksBases=TRUE,
scale.penalty=TRUE,paraPen=NULL,gamm.call=FALSE,drop.intercept=FALSE,
diagonal.penalty=FALSE,apply.by=TRUE,list.call=FALSE,modCon=0)
## set up the model matrix, penalty matrices and auxilliary information about the smoothing bases
## needed for a gam fit.
## elements of returned object:
## * m - number of smooths
## * min.sp - minimum smoothing parameters
## * H supplied H matrix
## * pearson.extra, dev.extra, n.true --- entries to hold these quantities
## * pterms - terms object for parametric terms
## * intercept TRUE if intercept present
## * offset - the model offset
## * nsdf - number of strictly parameteric coefs
## * contrasts
## * xlevels - records levels of factors
## * assign - indexes which parametric model matrix columns map to which term in pterms
## * smooth - list of smooths
## * S - penalties (non-zero block only)
## * off - first coef penalized by each element of S
## * cmX - col mean of X
## * P - maps parameters in fit constraint parameterization to those in prediction parameterization
## * X - model matrix
## * sp
## * rank
## * n.paraPen
## * L
## * lsp0
## * y - response
## * C - constraint matrix - only if absorb.cons==FALSE
## * n - dim(y)
## * w - weights
## * term.names
## * nP
{ # split the formula if the object being passed is a formula, otherwise it's already split
if (inherits(formula,"split.gam.formula")) split <- formula else
if (inherits(formula,"formula")) split <- interpret.gam(formula)
else stop("First argument is no sort of formula!")
if (length(split$smooth.spec)==0) {
if (split$pfok==0) stop("You've got no model....")
m <- 0
} else m <- length(split$smooth.spec) # number of smooth terms