diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..07ea26e
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,14 @@
+Package: greyzoneSurv
+Type: Package
+Title: Fit a Grey-Zone Model with Survival Data
+Version: 1.0
+Date: 2015-05-18
+Author: Pingping Qu and John Crowley
+Maintainer: Pingping Qu
+Description: Allows one to classify patients into low, intermediate, and high risk groups for disease progression based on a continuous marker that is associated with progression-free survival. It uses a latent class model to link the marker and survival outcome and produces two cutoffs for the marker to divide patients into three groups. See the References section for more details.
+License: GPL-3
+Depends: stats4, survival, Hmisc, survAUC
+NeedsCompilation: no
+Packaged: 2015-05-19 06:40:41 UTC; pingpingq
+Repository: CRAN
+Date/Publication: 2015-05-19 09:25:40
diff --git a/MD5 b/MD5
new file mode 100644
index 0000000..a840e38
--- /dev/null
+++ b/MD5
@@ -0,0 +1,11 @@
+d26a852fd000d99e114b7b96423e7275 *DESCRIPTION
+3f96d85ea4c670cad90d36a94236be1d *NAMESPACE
+413355ac08f62c2d3b4f8ff6aabe1342 *R/greyzoneSurvpackage.R
+348574957b97c49564b4c72e23db7085 *build/partial.rdb
+33e9866b86d76e8ff1e8f83cecece3dd *data/mydata.rda
+a345176d7b3c556820b111cac7f1332b *man/all.greyzone.funcs.Rd
+3c5d56eab903f2e565847d2be684307f *man/bestcut2.Rd
+5362413efff6b388acc978123714519b *man/cox.summary.Rd
+5f973f94fb00e35886dd15a97a02d17d *man/genSurvData.Rd
+28027143a08983f60a5b589675b04fdf *man/greyzoneSurv-package.Rd
+cc70c7ec7cf9f81d72c5adbfb3043d0d *man/mydata.Rd
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..8687641
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,8 @@
+#exportPattern("^[[:alpha:]]+")
+export(em.func)
+export(cov.func)
+export(greyzone.func)
+export(genSurvData)
+export(bestcut2)
+export(cox.summary)
+import(stats4, survival, Hmisc, survAUC)
\ No newline at end of file
diff --git a/R/greyzoneSurvpackage.R b/R/greyzoneSurvpackage.R
new file mode 100644
index 0000000..ebd97a0
--- /dev/null
+++ b/R/greyzoneSurvpackage.R
@@ -0,0 +1,530 @@
+
+# packages=c('stats4', 'survival', 'Hmisc', 'survAUC')
+# for (package in packages)
+# {
+#   if (package %in% rownames(installed.packages()))
+#     do.call('library', list(package))
+#
+#   # if package is not installed locally, download, then load
+#   else {
+#     install.packages(package)
+#     do.call("library", list(package))
+#   }
+#
+# }
+
+genSurvData=function(n, recruitment.yrs=2, baseline.hazard=365.25*5, shape=1, censoring.rate=0, beta.continuous, beta.binary=0, x, xhigh, ran.seed)
+{
+  #### This function generates Weibull survival times with a fixed censoring rate.
+  #### In the R rweibull function, the scale parameter is different from the usual lambda scale parameter.
+  #### In the usual parameterization, the bigger the lambda, the smaller the mean of the survival time.
+  #### Here in rweibull, the bigger the scale, the bigger the mean of the survival time.
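+  #### (For shape k and scale s, rweibull times have mean s*gamma(1+1/k), so with
+  #### shape=1 the scale lambdaT*exp(-beta.continuous*x) used below is exactly the
+  #### mean survival time for a patient with marker value x.)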
+
+
+  ###### simulate exponential survival times
+  lambdaT=baseline.hazard
+
+  #true event time for low risk group, note that lambdaT is the mean of survival times when covariate=0
+  set.seed(ran.seed)
+  T0 = floor(rweibull(sum(xhigh==0), shape=shape, scale=lambdaT*exp(-beta.continuous*x[xhigh==0])))
+  #true event time for high risk group
+  set.seed(ran.seed)
+  T1=floor(rweibull(sum(xhigh==1), shape=shape, scale=lambdaT*exp(-beta.continuous*x[xhigh==1]-beta.binary)))
+  T=c(T0, T1); T
+
+  if (censoring.rate==0)
+  {
+    event=rep(1, n)
+    time=T
+
+  } else
+  {
+    #simulate the dates when patients enter the study
+    set.seed(ran.seed)
+    ondt=sample.int(ceiling(recruitment.yrs*365.25), size=n); ondt
+
+    #number of events at minimum (when there are ties there might be more events and therefore fewer censored)
+    n.events0=n-floor(n*censoring.rate); n.events0
+    #which patients to censor? Note we need to get at least n.events0 events (it could be more because of tied survival times)
+    tot=ondt+T
+    tot.ordered=sort(tot)
+    #want to censor after tot.ordered[n.events0], so need to randomly choose a day as the study-stopping date
+    #between tot.ordered[n.events0] and tot.ordered[n.events0+1]-1
+    #set.seed(ran.seed)
+    tot.ordered[n.events0]
+    tot.ordered[n.events0+1]
+    num.dup=sum(tot==tot.ordered[n.events0]); num.dup #the number of duplicated values at where we want to stop the study
+    days.to.choose=seq(tot.ordered[n.events0], tot.ordered[n.events0+num.dup]-1, by=1)
+    if (length(days.to.choose)>1) stop.dt=sample(days.to.choose, size=1) else stop.dt=tot.ordered[n.events0] #this is the study-stopping date
+    stop.dt
+
+    event=(T+ondt<=stop.dt)*1
+    time=pmin(T+ondt, stop.dt)-ondt
+
+  }
+
+  return(out=list(data=data.frame(time, event, x, xhigh), censoring.rate=mean(event==0)))
+
+}
+
+
+em.func=function(error=.001, max.iter=300, initial.values=NULL,
+                 y, delta, x)
+{
+
+  #############################################
+  # EM to get parameter estimates from the joint
+  # Weibull and logistic likelihood function
+  #############################################
+
+  # Meanings of the function arguments:
+  # z is a vector of whether a patient is high risk (=1) or low risk (=0)
+  # lamda is a value
+  # beta=c(beta0, beta1)
+  # gamma=c(gamma0, gamma1)
+  # y is a vector of survival time
+  # delta is a vector of censoring indicator
+  # x is a nx2 matrix, with the 1st column being 1's and the 2nd column the score
+
+  ##--------------- define sub functions
+
+  hazard.given.z=function(t, z, lamda, beta)
+  {
+    # t is survival time (either censored time or event time)
+    # z is the latent variable indicating whether patient is in group 1 (high risk) or 0 (low risk)
+    # lamda is the shape parameter for the Weibull distribution
+    # beta=c(beta0, beta1), where beta0 is for the intercept and beta1 is for z
+
+    return(lamda*t^(lamda-1)*exp(beta[1]+beta[2]*z))
+
+  }
+
+  surv.given.z=function(t, z, lamda, beta)
+  {
+    # t is survival time (either censored time or event time)
+    # z is the latent variable indicating whether patient is in group 1 or 0
+    # lamda is the shape parameter for the Weibull distribution
+    # beta=c(beta0, beta1)
+
+    return(exp(-t^lamda*exp(beta[1]+beta[2]*z)))
+
+  }
+
+  z.func=function(z, x, gamma)
+  {
+    # this gives the probability of z, given x and parameter gamma
+    # z can be 1, indicating high risk, or 0, indicating low risk
+    # x=c(1, s), where s is the score value
+    # gamma=c(gamma0, gamma1)
+
+    return(exp(x%*%gamma)^z/(1+exp(x%*%gamma)))
+
+  }
+
+
+  like.given.z=function(y, delta, z, lamda, beta)
+  {
+
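+    # likelihood contribution of one subject under right censoring:
+    # f(y|z)^delta * S(y|z)^(1-delta), which equals h(y|z)^delta * S(y|z)
+    # because the density factors as f = h*S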
+    return(hazard.given.z(y, z, lamda, beta)^delta*surv.given.z(y, z, lamda, beta))
+
+  }
+
+
+  jointlike.func=function(y, delta, lamda, beta, z, x, gamma)
+  {
+    # this is the joint likelihood of y, delta, and z, where y has a Weibull and z a binomial distribution
+    # z is the latent variable indicating whether patient is in group 1 or 0
+    # y is survival time (either censored time or event time)
+    # delta is the censoring variable indicating whether the patient with survival time y had an event (=1) or was censored (=0)
+
+    return(like.given.z(y, delta, z, lamda, beta)*z.func(z, x, gamma))
+
+  }
+
+  nQ.func=function(lam, b0, b1, g0, g1)
+  {
+    # this function calculates the negative Q function for all the patients
+    # the unknown variables are lam, b0, b1, g0, g1
+    # fixed variables include y, delta, x and exp.p, where exp.p is the expected probability
+    # of high risk given the data and the current parameter estimates
+
+    tmp=cbind(1, exp.p)%*%c(b0,b1)
+
+    tmp1=delta%*%(log(lam)+(lam-1)*log(y)+tmp)
+    tmp2=-exp(b0)*y^lam%*%as.numeric(exp(b1)*exp.p+(1-exp.p))
+    tmp3=as.numeric(exp.p)%*%as.numeric(log(z.func(1, x, c(g0, g1))))+as.numeric(1-exp.p)%*%as.numeric(log(1-z.func(1, x, c(g0, g1))))
+
+    mysum=(tmp1+tmp2+tmp3)*(-1)
+    return(mysum)
+
+  }
+
+  cat('\n....calculating parameter estimates using EM')
+
+  ##extract initial values from the function arguments
+  n=length(y)
+  if (!is.null(initial.values))
+  {
+    z=initial.values[[1]]
+    lamda=initial.values[[2]]
+    beta=initial.values[[3]]
+    gamma=initial.values[[4]]
+  } else
+  {
+    z=rep(0,n)
+    z[x[,2]>=quantile(x[,2], 0.75)]=1
+    lamda=1
+    beta=c(0, 0.5)
+    gamma=c(0, 0.5)
+
+  }
+  p.lamda=1
+  p.beta=length(beta)
+  p.gamma=length(gamma)
+  p=p.lamda+p.beta+p.gamma #total dimension of unknown parameters
+
+  k=0
+  diff=.1
+  loglike=sum(log(jointlike.func(y, delta, lamda, beta, 1, x, gamma)+jointlike.func(y, delta, lamda, beta, 0, x, gamma)))
+  # both num and denom are vectors
+  num=jointlike.func(y, delta, lamda, beta, 1, x, gamma)
+  denom=jointlike.func(y, delta, lamda, beta, 1, x, gamma)+jointlike.func(y, delta, lamda, beta, 0, x, gamma)
+  exp.p=num/denom
+  # a vector that tells the probability of high risk for each patient
+  summary(exp.p)
+  iter.results=c(k, lamda, beta, gamma, loglike, diff)
+  iter.results
+  em.error=NA
+
+  while ((diff>error) & (k<max.iter))
+  {
+    k=k+1
+    estimates.old=c(lamda, beta, gamma)
+
+    #M-step: minimize the negative Q function over (lam, b0, b1, g0, g1) with
+    #exp.p held fixed; stats4::mle (the reason for the stats4 dependency) also
+    #supplies the covariance matrix of the estimates from the Hessian of nQ
+    fit=stats4::mle(nQ.func, start=list(lam=as.numeric(lamda),
+                                        b0=as.numeric(beta[1]), b1=as.numeric(beta[2]),
+                                        g0=as.numeric(gamma[1]), g1=as.numeric(gamma[2])))
+    est=stats4::coef(fit)
+    vcov=stats4::vcov(fit)
+    lamda=as.numeric(est['lam'])
+    beta=as.numeric(est[c('b0', 'b1')])
+    gamma=as.numeric(est[c('g0', 'g1')])
+
+    #E-step: update the expected probability of high risk under the new estimates
+    num=jointlike.func(y, delta, lamda, beta, 1, x, gamma)
+    denom=jointlike.func(y, delta, lamda, beta, 1, x, gamma)+jointlike.func(y, delta, lamda, beta, 0, x, gamma)
+    exp.p=num/denom
+
+    loglike=sum(log(denom))
+    diff=max(abs(c(lamda, beta, gamma)-estimates.old))
+    iter.results=rbind(iter.results, c(k, lamda, beta, gamma, loglike, diff))
+  }
+  #flag an EM error when the algorithm stopped at max.iter with the estimates still moving
+  if ((k==max.iter) & (diff>0.01)) em.error='yes'
+
+  ##above produced the final estimates: lamda, gamma, beta, loglike, and
+  ##vcov as the final covariance matrix for the final estimates based on the nQ function;
+  ##it also produced exp.p, which is from the last iteration
+
+  colnames(iter.results)=c("iteration", "lamda",
+                           paste("b", 0:(p.beta-1), sep=''),
+                           paste("g", 0:(p.gamma-1), sep=''), "loglike", "diff")
+  iter.results[1, 'diff']=NA
+
+  if (is.na(em.error))
+  {
+    par.names=c('lamda', 'b0', 'b1', 'g0', 'g1')
+    rownames(vcov)=colnames(vcov)=par.names
+
+  } else if (em.error=='yes')
+  {
+    vcov=NA
+  }
+
+  out=list(em.iterations=iter.results, estimates.p=as.numeric(exp.p), nQ.vcov=vcov, em.error=em.error)
+  return(out)
+}
+
+
+cov.func=function(em.results, y, delta, x)
+{
+  #################################
+  # get covariance matrix using
+  # Louis' formula with simulations.
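+  # Louis (1982): the observed information equals the conditional expectation
+  # of the complete-data information minus the variance of the complete-data
+  # score given the observed data; that second term is simulated below.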
+  # need to use the EM estimates
+  #################################
+
+  cat('\n....calculating covariance matrix using Louis method and simulations')
+
+  n=length(y); n
+  estimates=(em.results$em.iterations)[nrow(em.results$em.iterations), c('lamda', 'b0', 'b1', 'g0', 'g1')]
+  estimates
+  exp.p=em.results$estimates.p
+  vcov=em.results$nQ.vcov
+  vcov
+
+  lamda=estimates['lamda']; p.lamda=length(lamda)
+  beta=estimates[c('b0', 'b1')]; p.beta=length(beta)
+  gamma=estimates[c('g0', 'g1')]; p.gamma=length(gamma)
+  b0=beta[1]
+  b1=beta[2]
+  g0=gamma[1]
+  g1=gamma[2]
+
+  ## now having got vcov as the covariance matrix from the nQ function
+  ## (minus complete loglikelihood), which is the first term in Louis'
+  ## formula, we need to calculate the 2nd term, that is, the covariance
+  ## matrix of the H function, with simulations
+  ## (see Tanner and Wang, page 77, using the simulation method)
+  ## first generate z1,..., z10000 from bernoulli with probability exp.p
+  ## remember exp.p is a vector of length n
+  num.iter=10000
+  z=matrix(NA, nrow=n, ncol=num.iter)
+  for (i in 1:n)
+  {
+    set.seed(i*1234)
+    z[i,]=rbinom(n=num.iter, size=1, prob=exp.p[i])
+  }
+
+  ## Below are partial derivatives of the log complete likelihood.
+  ## we want to simulate them wrt z, with probability exp.p from the last estimates
+  p1.lamda=rep(NA, num.iter)
+  p1.beta=matrix(NA, nrow=p.beta, ncol=num.iter)
+  p1.gamma=matrix(NA, nrow=p.gamma, ncol=num.iter)
+  p=p.lamda+p.beta+p.gamma
+
+  pi.est=exp(x%*%gamma)/(1+exp(x%*%gamma))
+
+  for (k in 1:num.iter)
+  {
+    m=cbind(rep(1, n), z[,k])
+    p1.lamda[k]=delta%*%(1/lamda+log(y))-(y^lamda*log(y))%*%exp(m%*%beta)
+    #first derivative of the log complete likelihood wrt lamda,
+    #notice the difference between elementwise multiplication and vector operations
+
+    p1.beta[,k]=matrix(delta-y^lamda*exp(m%*%beta), ncol=n)%*%m
+    #first derivative wrt beta
+
+    p1.gamma[,k]=matrix(z[,k]-pi.est, ncol=n)%*%x
+    #first derivative wrt gamma
+
+  }
+
+  p1=rbind(p1.lamda, p1.beta, p1.gamma) ## nrow=p, ncol=num.iter
+  dim(p1)
+  tmp=cov(t(p1)); tmp
+
+  solve(vcov)
+  solve(vcov)-tmp
+  final.vcov=solve(solve(vcov)-tmp); final.vcov
+  final.se=sqrt(diag(final.vcov)); final.se
+
+  #estimate confidence intervals using the normal approximation
+  #(although lamda may not follow a normal approximation).
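+  #(by the delta method, se(log(lamda)) is approximately final.se['lamda']/lamda,
+  # so exp(log(lamda) +/- 1.96*final.se['lamda']/lamda) is a log-scale interval)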
+  #Alternatively, can estimate log.lamda and then convert back to lamda.
+  #Then do hypothesis testing to test whether lamda=1,
+  #and all other parameters =0
+  ll=estimates-1.96*final.se
+  ul=estimates+1.96*final.se
+  ll #lower limit
+  ul
+  test=(estimates-c(1,0,0,0,0))/final.se
+  #standardize and do the tests individually
+  test
+  #P values using the pnorm function, which uses the lower tail by default
+  pvalue=2*pmin(1-pnorm(test), pnorm(test)) #two-sided p values, computed elementwise
+  pvalue
+
+  par.names=c('lamda', 'b0', 'b1', 'g0', 'g1')
+
+  names(estimates)=par.names
+  names(ll)=par.names
+  names(ul)=par.names
+  names(pvalue)=par.names
+  names(final.se)=par.names
+  rownames(final.vcov)=colnames(final.vcov)=par.names
+
+  out=list(par.est=estimates, final.vcov=final.vcov, final.se=final.se,
+           par.est.ll=ll, par.est.ul=ul, pvalue=pvalue, exp.p=exp.p)
+  return(out)
+}
+
+
+greyzone.func=function(cov.results, y, delta, x, plot.logistic=T)
+{
+  ####################
+  # find grey zone
+  ####################
+
+  cat('\n....finding grey zone in logistic curves\n')
+
+  estimates=cov.results$par.est
+  gamma=estimates[c('g0', 'g1')]
+  final.se=cov.results$final.se
+  final.vcov=cov.results$final.vcov
+  exp.p=cov.results$exp.p
+
+  #first get the estimated logistic regression curve and draw it as well as its CI
+  pi.est=exp(x%*%gamma)/(1+exp(x%*%gamma)); summary(pi.est)
+  score=x[,2]
+  min.score=round(min(score),2)
+  max.score=round(max(score),2)
+  score.sim=seq(min.score, max.score, by=.01); length(score.sim)
+  x.sim=cbind(rep(1,length(score.sim)), score.sim)
+  #logistic curve based on score.sim (we want the curve at a more refined level than that based on score)
+  pi.est.sim=exp(x.sim%*%gamma)/(1+exp(x.sim%*%gamma))
+
+  #to find lower and upper bounds for the pi.est.sim curve, need the estimated variance of g0+x*g1, or x.sim%*%gamma
+  est.var=final.vcov[4,4]+score.sim^2*final.vcov[5,5]+2*score.sim*final.vcov[4,5]; est.var[1:10]
+  est.sd=sqrt(est.var); est.sd[1:10]
+
+  #95% CI of the pi.est curve, used to define a grey zone (third group)
+  pi.est.ul.sim=exp(x.sim%*%gamma+1.96*est.sd)/(1+exp(x.sim%*%gamma+1.96*est.sd))
+  pi.est.ll.sim=exp(x.sim%*%gamma-1.96*est.sd)/(1+exp(x.sim%*%gamma-1.96*est.sd))
+  max.score.sim.lowrisk=score.sim[max((1:length(score.sim))[pi.est.ul.sim<0.5])]
+  min.score.sim.highrisk=score.sim[min((1:length(score.sim))[pi.est.ll.sim>=0.5])]
+  max.score.sim.lowrisk #
+  min.score.sim.highrisk #
+  p.low=exp(c(1,max.score.sim.lowrisk)%*%gamma)/(1+exp(c(1,max.score.sim.lowrisk)%*%gamma))
+  p.high=exp(c(1,min.score.sim.highrisk)%*%gamma)/(1+exp(c(1,min.score.sim.highrisk)%*%gamma))
+  as.numeric(p.low) #
+  as.numeric(p.high) #
+  #so the grey zone is >=max.score.sim.lowrisk and <min.score.sim.highrisk
+  #however, the grey zone doesn't have to exist, that is, max.score.sim.lowrisk and min.score.sim.highrisk can be NA
+
+  if (plot.logistic)
+  {
+    plot(x=score, y=exp.p, cex=.6, xlim=c(-2, 3))
+    points(x=score.sim, y=pi.est.sim, lty=1, col='blue', type='l')
+    points(x=score.sim, y=pi.est.ul.sim, lty=2, col='blue', type='l')
+    points(x=score.sim, y=pi.est.ll.sim, lty=2, col='blue', type='l')
+    abline(h=0.5)
+    abline(v=c(max.score.sim.lowrisk, min.score.sim.highrisk), col='green', lty=2)
+
+  }
+
+  out=list(greyzone.ll=max.score.sim.lowrisk, greyzone.ul=min.score.sim.highrisk,
+           score=score,
+           score.sim=score.sim, pi.est.sim=pi.est.sim, exp.p=exp.p,
+           pi.est.ul.sim=pi.est.ul.sim, pi.est.ll.sim=pi.est.ll.sim)
+  return(out)
+
+}
+
+
+cox.summary=function(stime, sind, var)
+{
+
+  p=length(unique(var))-1
+
+  #calculate r2
+  data=data.frame(stime, sind, var)
+  n=length(stime)
+  model0 <- coxph(Surv(stime, sind)~1)
+  model1 <- coxph(Surv(stime, sind)~as.factor(var))
+  f0 <- rep(0,n)
+  f1 <- predict(model1, newdata=data)
+  Surv.res <- Surv(stime, sind)
+  r2.oxs=survAUC::OXS(Surv.res, f1, f0)
+  r2.nagelk=survAUC::Nagelk(Surv.res, f1, f0)
+  r2.xo=survAUC::XO(Surv.res, f1, f0)
+
+  res=summary(model1); res
+  pvalue=as.numeric(res$logtest[3])
+  aic=-2*res$loglik[2]+2*p
+
+  #c-index considering ties in x
+  c1=Hmisc::rcorrcens(Surv(stime, sind) ~ var); c1
+  c1=1-c1[1,1]; c1
+  #c-index ignoring ties in x
+  c3=Hmisc::rcorr.cens(var, Surv(stime, sind), outx=T)
+  c3=1-c3['C Index']; c3
+
+  return(out=c(p=pvalue, AIC=aic, r2.oxs=r2.oxs, r2.xo=r2.xo, r2.nagelk=r2.nagelk,
+               c.ties=c1))
+
+}
+
+
+bestcut2=function(data=data, stime="surtim", sind="surind", var="score", leave=20)
+{
+  #this function finds the best cutoff value for a continuous score given survival data to divide patients into 2 groups
+  #leave is the minimum number of patients in each patient group
+
+  cat('\nLooking for best cutoff to divide subjects into 2 groups...')
+  n=nrow(data)
+  ord=order(data[,var]); ord
+  score=data[ord,var]; score
+
+  pvalue=chisq.stat=rep(NA, n)
+  for (i in (leave+1):(n-leave+1))
+  {
+    ## i is the last position of the 1st cluster
+
+    cluster=rep(0,n)
+    cluster[data[,var]>=score[i]]=1
+    res=survdiff(Surv(data[,stime], data[,sind])~cluster)
+    pvalue[i]=1-pchisq(res$chisq, df=1)
+    chisq.stat[i]=res$chisq
+
+  }
+  summary(-log10(pvalue))
+
+  ##### which cutoff gives the smallest p-value: best.i gives the answer
+  best.i=which.max(chisq.stat)
+  best.i
+  cutoff=score[best.i]
+  cat('\nbest cutoff =', cutoff)
+
+  cluster = rep(0,n)
+  cluster[data[,var]>=cutoff]=1
+
+  survdata=data.frame(data, pvalue, chisq.stat, bestcut2=cluster, cutoff=rep(cutoff,n))
+
+  ### are there p-values significant after Bonferroni?
+
+  totsig=sum(!is.na(pvalue))
+  siglevel=-log10(.05/totsig) #the significance level after Bonferroni correction
+  siglevel
+  maxp=max(as.vector(-log10(pvalue)), na.rm=T)
+  maxp
+  cat('\nAny p-values significant after Bonferroni correction:', maxp>siglevel)
+  cat("\nP-value at best cutoff: ", pvalue[best.i], "\n\n")
+
+  return(survdata)
+
+}
+
diff --git a/build/partial.rdb b/build/partial.rdb
new file mode 100644
index 0000000..828cabb
Binary files /dev/null and b/build/partial.rdb differ
diff --git a/data/mydata.rda b/data/mydata.rda
new file mode 100644
index 0000000..abdc28d
Binary files /dev/null and b/data/mydata.rda differ
diff --git a/man/all.greyzone.funcs.Rd b/man/all.greyzone.funcs.Rd
new file mode 100644
index 0000000..8de716e
--- /dev/null
+++ b/man/all.greyzone.funcs.Rd
@@ -0,0 +1,135 @@
+\name{greyzone.funcs}
+%\alias{greyzone}
+\alias{em.func}
+\alias{cov.func}
+\alias{greyzone.func}
+
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{
+\packageTitle{greyzoneSurv}
+%Find a Grey-Zone Solution with Survival Data
+}
+\description{
+Find two cutoffs for a marker by fitting a grey-zone model that will define high, intermediate, and low risk groups when the outcome is survival.
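+The model assumes a latent high/low risk class for each patient: the marker is linked to class membership through a logistic regression, and class membership is linked to survival through a Weibull model.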
+}
+\usage{
+em.func(error = 0.001, max.iter = 300, initial.values=NULL, y, delta, x)
+
+cov.func(em.results, y, delta, x)
+
+greyzone.func(cov.results, y, delta, x, plot.logistic=T)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+\item{error}{
+  Convergence criterion: the largest allowed difference in parameter estimates between two consecutive iterations
+}
+  \item{max.iter}{
+  Maximum number of iterations
+}
+  \item{initial.values}{
+  A list of initial parameters with components z, lamda, beta, and gamma; the default is NULL.
+
+  z is an initial guess of dimension nx1 regarding whether each subject or patient is at high risk (=1) or low risk (=0); lamda is an initial guess about the shape parameter of the Weibull distribution (lamda=1 for an exponential distribution); beta is an initial guess of the regression coefficients (an intercept and a slope) linking the latent class variable and the survival outcome; and gamma is an initial guess of the regression coefficients (an intercept and a slope) linking the marker values and the latent class variable.
+}
+  \item{y}{An nx1 vector of survival time in years
+}
+  \item{delta}{An nx1 vector of censoring indicator
+}
+  \item{x}{An nx2 matrix, with the 1st column being all 1s and the 2nd column being the marker values
+}
+
+\item{em.results}{
+  Results from calling em.func
+}
+  \item{cov.results}{Results from calling cov.func
+}
+  \item{plot.logistic}{A logical variable. If TRUE, the fitted logistic function between the marker values and the fitted probability of high risk for each patient is plotted
+}
+}
+\details{
+The package assumes higher marker values correspond to shorter survival times, so it is important to make sure this is the case with your data. If initial.values is NULL, the function will generate initial values automatically based on this assumption.
+}
+\value{
+The function \code{\link{em.func}} returns a list with components such as the parameter estimates and the number of iterations from the EM algorithm, as well as whether the EM fitting generated an error.
+
+The function \code{\link{cov.func}} returns a list with components such as the variance-covariance matrix, standard errors, and 95\% confidence limits for the parameter estimates obtained from calling \code{\link{em.func}}. An additional component is the estimated probability of being at high risk for each patient given the patient's survival and marker data.
+
+The function \code{\link{greyzone.func}} returns a list with components such as the grey-zone cutoffs greyzone.ll and greyzone.ul, so that a patient is classified as low risk if the marker value is < greyzone.ll, high risk if the marker value is >= greyzone.ul, and intermediate risk (or in the grey zone) if the marker value is >= greyzone.ll and < greyzone.ul.
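+
+Note that a grey zone does not always exist: greyzone.ll and greyzone.ul can come back as NA even when the EM fitting itself succeeds.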
+}
+%\references{
+%% ~put references to the literature/web site here ~
+%}
+%\author{
+%% ~~who you are~~
+%}
+%\note{
+%% ~~further notes~~
+%}
+
+%% ~Make other sections like Warning with \section{Warning }{....} ~
+%
+\seealso{
+%\code{\link{em.func}}, \code{\link{cov.func}}
+\code{\link{genSurvData}}, \code{\link{cox.summary}}
+}
+\examples{
+#use the package data "mydata" to fit the grey-zone model in 3 steps
+data(mydata)
+dim(mydata)
+head(mydata)
+
+#~step 1: extract information needed for fitting the model and
+#make some initial guesses of some parameter values
+n=nrow(mydata)
+y=mydata$time/365.25
+delta=mydata$event
+score=mydata$x
+x=cbind(rep(1, n), score)
+
+#~step 2: get the EM estimates and the variance-covariance matrix
+results.em=em.func(initial.values=NULL, y=y, delta=delta, x=x)
+is.na(results.em$em.error)
+#only if the above is TRUE should you proceed; otherwise supply a different set
+#of initial values and call em.func again
+names(results.em)
+#after you successfully get an EM solution, proceed to get the variance-covariance matrix
+results.cov=cov.func(results.em, y=y, delta=delta, x=x)
+names(results.cov)
+
+#~step 3: when there are no errors above, proceed to calculate the grey zone
+results=greyzone.func(results.cov, y, delta, x, plot.logistic=FALSE)
+names(results)
+!is.na(results$greyzone.ll) & !is.na(results$greyzone.ul)
+#only when the above is TRUE do you have a grey-zone solution and can proceed;
+#sometimes there is no grey-zone solution even if the EM fitting is successful.
+#In that case, the grey-zone model is not a good fit for the data.
+score3=rep(0, n)
+score3[score>=results$greyzone.ll & score<results$greyzone.ul]=1
+score3[score>=results$greyzone.ul]=2
+#if you get through steps 1-3, you are done fitting the grey-zone model!
+
+#now you may want to compare with a 2-group model, so fit a 2-group model
+res=bestcut2(data=mydata, stime='time', sind='event', var='x')
+score2=res[,'bestcut2']
+
+#then compare the 2-group and grey-zone models; note here we compare on the training data,
+#but ideally we want to compare them on a test data set
+cox.2group=cox.summary(stime=mydata$time, sind=mydata$event, var=score2)
+cox.3group=cox.summary(stime=mydata$time, sind=mydata$event, var=score3)
+cox.2group
+cox.3group
+fit.2group = survfit(Surv(mydata$time, mydata$event)~score2)
+fit.3group = survfit(Surv(mydata$time, mydata$event)~score3)
+par(mfrow=c(1,2))
+plot(fit.2group, lty=1:2, main='2-group model', las=1,
+     ylab='Probability of Survival', xlab='Days from Time 0')
+plot(fit.3group, lty=1:3, main='grey-zone model', las=1,
+     ylab='Probability of Survival', xlab='Days from Time 0')
+
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+%\keyword{EM algorithm }
+%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
diff --git a/man/bestcut2.Rd b/man/bestcut2.Rd
new file mode 100644
index 0000000..497598c
--- /dev/null
+++ b/man/bestcut2.Rd
@@ -0,0 +1,42 @@
+\name{bestcut2}
+\alias{bestcut2}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{
+Find an Optimal Cutoff for the 2-group Model
+}
+\description{
+This function uses a brute-force search to find the best cutoff value for a marker, based on the log-rank test, to divide patients into high and low risk groups given survival data.
+}
+\usage{
+bestcut2(data, stime, sind, var, leave = 20)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{data}{A data frame or numerical matrix
+}
+  \item{stime}{A character string giving the column name for survival time in the data
+}
+  \item{sind}{A character string giving the column name for the censoring indicator in the data
+}
+  \item{var}{A character string giving the column name for the marker values in the data
+}
+  \item{leave}{Minimum number of patients in the resulting high and low risk groups
+}
+}
+\value{
+It returns a data frame with the input data as well as the final optimal high and low risk groupings saved in the column bestcut2 (1=high risk and 0=low risk). Additionally, it has columns for the chosen marker cutoff value, and for the chi-square statistic and log-rank p-value testing equality of survival between the high and low risk groups formed by using each candidate marker value as the cutoff.
+}
+\examples{
+## Use the package data "mydata" to fit the 2-group model
+data(mydata)
+res=bestcut2(data=mydata, stime='time', sind='event', var='x')
+table(res[,'bestcut2'])
+
+#compare the true groupings and those from the 2-group model
+table(res[,c('xhigh', 'bestcut2')])
+
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+%\keyword{ ~kwd1 }
+%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
diff --git a/man/cox.summary.Rd b/man/cox.summary.Rd
new file mode 100644
index 0000000..bbfc35f
--- /dev/null
+++ b/man/cox.summary.Rd
@@ -0,0 +1,55 @@
+\name{cox.summary}
+\alias{cox.summary}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{Summarize after Fitting Cox Regression
+
+}
+\description{
+A wrapper function to summarize 2-group and grey-zone (3-group) models with \eqn{R^2} and the c-index after fitting a Cox regression.
+}
+\usage{
+cox.summary(stime, sind, var)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{stime}{An nx1 vector of survival time
+}
+  \item{sind}{An nx1 vector of censoring indicator
+}
+  \item{var}{An nx1 vector of risk groupings to correlate with survival. For the 2-group and 3-group models, it is a categorical vector
+}
+}
+%\details{
+%% ~~ If necessary, more details than the description above ~~
+%}
+\value{
+It returns a vector with components such as the p-value from fitting a Cox regression model, the AIC (Akaike information criterion), the c-index [1, 2], and \eqn{R^2} from the XO method [3], the OXS method [4], and the Nagelkerke method.
+}
+\references{
+Harrell F, Califf R, Pryor D, Lee K and Rosati R. Evaluating the yield of medical tests. \emph{JAMA: The Journal of the American Medical Association} 1982; 247(18):2543-2546.
+
+Harrell F, Lee K, Califf R, Pryor D and Rosati R. Regression modeling strategies for improved prognostic prediction. \emph{Statistics in Medicine} 1984; 3(2):143-152.
+
+Xu R and O'Quigley J. A \eqn{R^2} type measure of dependence for proportional hazards models. \emph{Journal of Nonparametric Statistics} 1999; 12:83-107.
+
+O'Quigley J, Xu R and Stare J. Explained randomness in proportional hazards models. \emph{Statistics in Medicine} 2005; 24:479-489.
+
+}
+%\author{
+%% ~~who you are~~
+%}
+%\note{
+%% ~~further notes~~
+%}
+
+%% ~Make other sections like Warning with \section{Warning }{....} ~
+
+\seealso{
+\code{\link{greyzone.func}}
+}
+\examples{
+#See the Examples for greyzone.func in this package.
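+
+#A minimal additional sketch on the package data, assuming an arbitrary median
+#split of the marker as the grouping (any categorical grouping works as var):
+data(mydata)
+grp=as.numeric(mydata$x>=median(mydata$x))
+cox.summary(stime=mydata$time, sind=mydata$event, var=grp)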
+}
+%\keyword{ }
+%\keyword{ }% __ONLY ONE__ keyword per line
diff --git a/man/genSurvData.Rd b/man/genSurvData.Rd
new file mode 100644
index 0000000..44b4106
--- /dev/null
+++ b/man/genSurvData.Rd
@@ -0,0 +1,98 @@
+\name{genSurvData}
+\alias{genSurvData}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{
+Simulate Survival Outcome Given Marker Values
+}
+\description{
+This function simulates survival outcomes with a fixed censoring rate based on a Weibull distribution, given input values such as the study recruitment period, patient marker values, their true risk groupings (1=high risk and 0=low risk), and true regression coefficients.
+}
+\usage{
+genSurvData(n, recruitment.yrs=2, baseline.hazard=365.25*5, shape=1, censoring.rate=0,
+            beta.continuous, beta.binary=0, x, xhigh, ran.seed)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{n}{
+  Sample size
+}
+  \item{recruitment.yrs}{
+  Patient recruitment period in years (default=2)
+}
+  \item{baseline.hazard}{
+  Baseline scale parameter, which equals the mean survival time (in days) when shape=1 and all covariates are 0 (default=365.25*5 days)
+}
+
+  \item{shape}{
+  The shape parameter for the Weibull distribution (the distribution is exponential when shape=1)
+}
+  \item{censoring.rate}{
+  Censoring rate
+}
+  \item{beta.continuous}{
+  The true regression coefficient linking the continuous marker values to the survival outcome
+}
+  \item{beta.binary}{
+  The true regression coefficient linking the high risk group to the survival outcome
+}
+  \item{x}{
+  An nx1 vector of the marker values
+}
+  \item{xhigh}{
+  An nx1 vector of 1s and 0s indicating the patients' true risk identities (1=high risk and 0=low risk)
+}
+  \item{ran.seed}{
+  Seed for random number generation
+}
+}
+\details{
+The function can be used to generate survival data to try the grey-zone model on if you do not have any data at hand.
+}
+\value{
+It returns a list with two components: the simulated survival data in days and the achieved censoring rate (which should be approximately the input censoring rate; tied survival times can make it slightly lower).
+}
+
+%\references{
+%% ~put references to the literature/web site here ~
+%}
+%\author{
+%% ~~who you are~~
+%}
+%\note{
+%% ~~further notes~~
+%}
+%% ~Make other sections like Warning with \section{Warning }{....} ~
+\seealso{
+%% ~~objects to See Also as \code{\link{help}}, ~~~
+\code{\link{em.func}}, \code{\link{cov.func}}, \code{\link{greyzone.func}}
+}
+\examples{
+## Generate the package data called "mydata"
+## Simulate high/low risk groupings, continuous marker values for each group, and survival data
+## so that higher marker values correspond to shorter survival.
+n=300
+censoring.rate=0.3
+rate.lrisk=0.7 #rate of low risk
+n.lrisk=n*rate.lrisk
+n.hrisk=n-n.lrisk
+mu=3
+beta.continuous=0.5
+beta.binary=0.5
+ran.seed=1000
+set.seed(ran.seed)
+x0=rnorm(n.lrisk, 0, 1) #low risk patients have marker values distributed as Normal(0,1)
+set.seed(ran.seed)
+x1=rnorm(n.hrisk, mu, 1) #high risk patients have marker values distributed as Normal(mu,1)
+score=c(x0, x1)
+score.high=c(rep(0, n.lrisk), rep(1, n.hrisk))
+mydata=genSurvData(n=n, censoring.rate=censoring.rate,
+                   beta.continuous=beta.continuous, beta.binary=beta.binary,
+                   x=score, xhigh=score.high, ran.seed=ran.seed)$data
+dim(mydata)
+head(mydata)
+
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+%\keyword{ ~kwd1 }
+%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
diff --git a/man/greyzoneSurv-package.Rd b/man/greyzoneSurv-package.Rd
new file mode 100644
index 0000000..6c12cea
--- /dev/null
+++ b/man/greyzoneSurv-package.Rd
@@ -0,0 +1,44 @@
+\name{greyzoneSurv-package}
+\alias{greyzoneSurv-package}
+\docType{package}
+\title{
+\packageTitle{greyzoneSurv}
+%Fit a Grey-Zone Model with Survival Data
+}
+\description{
+\packageDescription{greyzoneSurv}
+%Useful when one wants to classify patients into low, intermediate, and high risk groups for disease progression based on a continuous marker that is associated %with progression-free survival.
+}
+\details{
+%\tabular{ll}{
+%Package: \tab greyzoneSurv\cr
+%Type: \tab Package\cr
+%Version: \tab 1.0\cr
+%Date: \tab 2015-05-14\cr
+%License: \tab What license is it under?\cr
+%}
+To fit the grey-zone model, one needs to call the functions in the order of \code{\link{em.func}}, \code{\link{cov.func}}, and \code{\link{greyzone.func}}.
+
+The package also provides a function \code{\link{bestcut2}} to fit a 2-group model, that is, to find an optimal cutoff of the marker that divides patients into two risk groups (high and low). In addition, there is a function \code{\link{genSurvData}} to generate survival data with a fixed censoring rate.
+}
+\author{
+\packageAuthor{greyzoneSurv}
+
+Maintainer: \packageMaintainer{greyzoneSurv}
+%Pingping Qu and John Crowley
+%Maintainer: Pingping Qu
+}
+
+\references{
+Pingping Qu, Bart Barlogie and John Crowley (2015) "Using a Latent Class Model to Refine Risk Stratification in Multiple Myeloma" (under review)
+}
+%~~ Optionally other standard keywords, one per line, from ~~
+%~~ file KEYWORDS in the R documentation directory ~~
+%\keyword{ package }
+%\seealso{
+%~~ Optional links to other man pages ~~
+%}
+%\examples{
+%~~ simple examples of the most important functions ~~
+%}
diff --git a/man/mydata.Rd b/man/mydata.Rd
new file mode 100644
index 0000000..73049fb
--- /dev/null
+++ b/man/mydata.Rd
@@ -0,0 +1,33 @@
+\name{mydata}
+\alias{mydata}
+\docType{data}
+\title{
+Package Data
+}
+\description{
+The data set can be used to test the grey-zone and 2-group models.
+%% ~~ A concise (1-5 lines) description of the dataset. ~~
+}
+\usage{data(mydata)}
+\format{
+The data set is a data frame with 300 rows and 4 columns: patient survival time (time), event indicator (event, 1=event and 0=censored), marker values (x), and true risk groups (xhigh, 1=high risk and 0=low risk).
+% The format is:
+% chr "survAUC"
+}
+\details{
+%% ~~ If necessary, more details than the __description__ above ~~
+The data were generated based on a Weibull distribution for the survival times and normal distributions for the marker values in both the high and low risk patients.
+
+}
+\source{
+The data were generated with the code in the Examples of \code{\link{genSurvData}} in this package.
+%% ~~ reference to a publication or URL from which the data were obtained ~~
+}
+
+%\references{
+%% ~~ possibly secondary sources and usages ~~
+%}
+\examples{
+data(mydata)
+}
+%\keyword{Weibull}