### NOTE(review): removed GitHub web-page chrome ("Skip to content", branch/file
### navigation, contributor/sloc lines) that was captured when this file was
### copied from a web view; that text is not valid R and prevented parsing.
### Author of original python script of ARSER: Rendong Yang
### Email: cauyrd@gmail.com
### Associated literature: Rendong Yang and Zhen Su. Bioinformatics. 26(12):i168-74 (2010).
### Website: http://bioinformatics.cau.edu.cn/ARSER/
###======================================================================================================================================
detrend_linear <- function(y)
{
## Remove the least-squares linear trend from a numeric vector 'y'.
## The time axis is the implicit index 0..(length(y)-1), as in the python original.
## Returns the residuals y - (slope * t + intercept).
n <- length(y)
t_idx <- seq_len(n) - 1
## population (biased) covariance matrix, matching numpy's cov(x, y, bias=1)
## used by the original ARSER python code
cmat <- cov(cbind(t_idx, y)) * (n - 1) / n
slope <- cmat[1, 2] / cmat[1, 1]
intercept <- mean(y) - slope * mean(t_idx)
y - (slope * t_idx + intercept)
}
###-----------------------------------------------------------------------------
savitzky_golay <- function(data, kernel=11, order=4) ###applies a Savitzky-Golay filter
{
##input parameters, data => data as a vector type; kernel => a positiv integer > 2*order giving the kernel size; order => order of the polynomal; returns smoothed data as a vector type
##-----------------------
if ( (round(kernel)!=kernel) | (round(order)!=order) | (kernel<1) | (order<1) )
{ stop("The input values of kernel and order in 'savitzky_golay' should be positive integers."); }
if (kernel %% 2 != 1)
{ stop("The input value of kernel in 'savitzky_golay' should be an odd number"); }
if (kernel < order+2)
{ stop("The kernel value is to small for the polynomals, and it should be larger than (order+2)"); }
##-----------------------
half_window=(kernel-1)%/%2;
## Vandermonde design matrix: one row per window offset k in -hw..hw,
## one column per power i in 0..order (k^0 == 1 for every k, incl. k == 0).
## Built with outer() instead of growing a vector element-by-element in a loop.
b=outer((-half_window):half_window, 0:order, `^`);
##-----------------------
## Moore-Penrose pseudo-inverse of b. Because kernel >= order+2, b has full
## column rank, so pinv(b) == solve(t(b) %*% b) %*% t(b); this replaces the
## previous dependency on MPinv() from the 'gnm' package with base R only.
## Row 1 of the pseudo-inverse holds the smoothing (0th-derivative) weights.
m=(solve(crossprod(b)) %*% t(b))[1,];
window_size=length(m); #equals 'kernel'
offsets=(-half_window):half_window; #pre-compute the offset values for better performance
## extend the series with a mirror image about the first and last points, so
## the moving window is defined at the boundaries
firstval=data[1];
lastval=data[length(data)];
leftpad=2*firstval - rev(data[2:(1+half_window)]);
rightpad=2*lastval - rev(data[(length(data)-half_window):(length(data)-1)]);
padded=c(leftpad, data, rightpad);
##-----------------------
## weighted moving window; preallocated via vapply instead of the previous
## string-sentinel c() accumulation, which coerced every value to character
## and back (losing precision)
n=length(data);
smooth_data=vapply(seq_len(n),
                   function(i) sum(m * padded[i + half_window + offsets]),
                   numeric(1));
return (smooth_data);
}
###-----------------------------------------------------------------------------
###calculate model log-likelihood and two information criteria(AIC and BIC criterion values)
LL <- function(e, nobs, ncoef)
{
## Gaussian log-likelihood of an OLS fit from its residuals, plus the two
## per-observation information criteria.
## e     : residual vector
## nobs  : number of observations
## ncoef : number of estimated coefficients
## Returns a named numeric vector c(ll, aic, bic).
rss <- sum(e * e)                      # same scalar as e %*% e
loglik <- -(nobs / 2) * (1 + log(2 * pi)) - (nobs / 2) * log(rss / nobs)
out <- c(loglik,
         -2 * loglik / nobs + (2 * ncoef / nobs),
         -2 * loglik / nobs + (ncoef * log(nobs)) / nobs)
names(out) <- c("ll", "aic", "bic")
out
}
###-----------------------------------------------------------------------------
###multi-variate regression using OLS
OLS <- function(x, y, x_varnm='', y_varnm ='y')
{
## Multivariate ordinary-least-squares regression (python class translated to a function).
## x       : matrix of independent variables; an intercept column is prepended here
## y       : dependent variable, numeric vector
## x_varnm : labels for the columns of x
## y_varnm : label for y (kept for interface compatibility; not used further)
## Returns a list with coefficients ("b"), fit statistics (R2, R2adj, F, Fpv,
## ll, aic, bic) and per-coefficient t statistics ("t") and p-values ("tp").
##----------------------- build the design matrix with intercept
X <- cbind(rep(1, nrow(x)), x)
dimnames(X)[[2]] <- c("const", x_varnm)
nobs <- length(y)
ncoef <- ncol(X)
##----------------------- coefficients via the normal equations
inv_xx <- solve(t(X) %*% X)
beta <- as.vector(inv_xx %*% (t(X) %*% y))  # 1-column matrix flattened to a vector
df_e <- nobs - ncoef                        # residual degrees of freedom
df_r <- ncoef - 1                           # regression degrees of freedom
resid <- as.vector(y - X %*% beta)
##----------------------- standard errors, t statistics, two-sided p-values
sse <- as.vector((resid %*% resid) / df_e)
se <- sqrt(diag(sse * inv_xx))
tstat <- beta / se
tp <- (1 - pt(abs(tstat), df_e)) * 2
##----------------------- goodness of fit; the (n-1)/n factors convert R's
## sample variance to the population variance used by the python original
R2 <- 1 - (var(resid) * (length(resid) - 1) / length(resid)) /
          (var(y) * (length(y) - 1) / length(y))
R2adj <- 1 - (1 - R2) * ((nobs - 1) / (nobs - ncoef))
Fstat <- (R2 / df_r) / ((1 - R2) / df_e)
Fpv <- 1 - pf(Fstat, df_r, df_e)
fit_ll <- LL(e = resid, nobs = nobs, ncoef = ncoef)
list("b" = beta, "R2" = R2, "R2adj" = R2adj, "F" = Fstat, "Fpv" = Fpv,
     "ll" = fit_ll["ll"], "aic" = fit_ll["aic"], "bic" = fit_ll["bic"],
     "t" = tstat, "tp" = tp)
}
###======================================================================================================================================
###estimate possible cycling period of time series by AR spectral estimation
###is_filter=True, stands for using smoothed time-series values by HR
get_period<-function(per.x,per.dt_y,per.delta,per.pID="",is_filter=TRUE,ar_method="mle")
{
## Estimate candidate cycling periods of a detrended time series by AR spectral
## estimation (stats::spec.ar), returning the periods at local spectrum peaks
## sorted by decreasing spectral power, or NA when no spectrum/peak is found.
## per.x     : time points; per.dt_y : detrended expression values
## per.delta : sampling interval (same time unit as the returned periods)
## per.pID   : profile identifier (accepted but not used in this body)
## is_filter : TRUE => run spec.ar on the Savitzky-Golay smoothed series
## ar_method : AR fitting method passed to spec.ar ("mle", "yule-walker", "burg")
self.x=per.x;
self.dt_y=per.dt_y;
self.delta=per.delta;
num_freq_mese=500; #number of frequencies at which the spectral density is evaluated
set_order=24/self.delta; #set the parameter of 'order' (the order of the AR model to be fitted) in spec.ar()
if (set_order == length(self.x)) #If the expression profiles only cover one day[24/interval==length(time points)], 'order' is set to length(time points)/2;
{ set_order=length(self.x)/2; }
#smooth with the default kernel/order; fall back to a smaller kernel for series
#too short for the defaults (savitzky_golay stops on invalid kernel/order)
filter_y<-try(savitzky_golay(self.dt_y),silent=TRUE);
if (inherits(filter_y,"try-error"))
{
filter_y=savitzky_golay(self.dt_y, kernel=5, order=2);
}
flag=1; #set to NA when spec.ar() fails, so the function returns NA below
if (is_filter)
{
mese=try(spec.ar(filter_y,n.freq=num_freq_mese, order=set_order,method=ar_method,plot=FALSE),silent=TRUE);
if ( inherits(mese,"try-error") )
{
mese=NA;
flag=NA;
}
} else {
#same spectral estimation, but on the raw (unsmoothed) detrended series
mese=try(spec.ar(self.dt_y,n.freq=num_freq_mese, order=set_order,method=ar_method,plot=FALSE),silent=TRUE);
if (inherits(mese,"try-error"))
{
mese=NA;
flag=NA;
}
}
if (is.na(flag))
{
periods=NA;
} else {
#locate local maxima of the spectrum: interior points higher than both neighbours
peaks_loc=c(NA,0) #sentinel row; dropped below once real peaks are appended
im1 <- mese$spec[1:(num_freq_mese-2)] #previous point of each interior frequency
iv <- mese$spec[2:(num_freq_mese-1)] #interior points under test
ip1 <- mese$spec[3:(num_freq_mese)] #next point of each interior frequency
idx <- (iv > ip1) & (iv > im1)
vec <- cbind(iv[idx],which(idx)+1) #columns: peak height, 1-based index into mese$freq
if(nrow(vec)>0) peaks_loc <- rbind(peaks_loc,vec)
if (is.vector(peaks_loc)) { #still the bare sentinel => no peak was found
periods=NA;
} else {
peaks_loc=peaks_loc[2:nrow(peaks_loc),]; #drop the sentinel row
if ( is.vector(peaks_loc) ) #a single remaining peak collapses to a vector; restore the matrix shape
{ peaks_loc=matrix(peaks_loc,nrow=1,ncol=2); }
#sort peak heights descending, carrying the frequency index along as names
peaks_loc.sort=peaks_loc[1:nrow(peaks_loc),1];
names(peaks_loc.sort)=peaks_loc[1:nrow(peaks_loc),2];
peaks_loc.sort=sort(peaks_loc.sort,decreasing=TRUE);
#convert each peak frequency to a period in time units (1/freq * delta)
periods=try(1/mese$freq[as.numeric(names(peaks_loc.sort))]*self.delta,silent=TRUE);
if ( inherits(periods,"try-error") )
{ periods=NA; }
}
}
return (periods);
}
###-----------------------------------------------------------------------------
harmonic_regression <- function(har.x, har.dt_y, period)
{
## Harmonic regression:
##   dt_y = mesor + sum over T of ( A*cos(2*pi/T*x) + B*sin(2*pi/T*x) ) + error
## har.x    : sampling time points
## har.dt_y : detrended expression values
## period   : one or more candidate periods T
## Returns the OLS() model list fitted to the cosine/sine design matrix.
design <- har.x                 # placeholder first column, dropped below
labels <- character(0)
for (Tper in period)
{
  w <- 2 * pi / Tper
  design <- cbind(design, cos(w * har.x), sin(w * har.x))
  labels <- c(labels,
              paste("cos", round(Tper, 2), sep = ""),
              paste("sin", round(Tper, 2), sep = ""))
}
design <- design[, 2:ncol(design)]   # drop the placeholder time column
OLS(x = design, y = har.dt_y, x_varnm = labels, y_varnm = 'y')
}
###-----------------------------------------------------------------------------
###evaluate the best model for each time series
evaluate<-function(eva.x,eva.y,eva.delta,eva.pID="",T_start=20, T_end=28, T_default=24,arsmethods=c("yule-walker","mle","burg"))
{
## Select the best harmonic-regression model for one time series: for every
## combination of (smoothed / raw) x (AR method), estimate candidate periods
## with get_period(), fit harmonic_regression(), and keep the fit with the
## lowest AIC. Then derive period, amplitude and phase from that best model.
## eva.x : time points; eva.y : raw expression values; eva.delta : sampling interval
## eva.pID : profile identifier (forwarded to get_period)
## T_start/T_end : accepted period range; T_default : fallback period
## arsmethods : AR methods to try
## Returns a list: period, amplitude, phase, R2, R2adj, coefvar, pvalue, filter, armethod.
eva.dt_y=detrend_linear(eva.y);
is_filter=c(TRUE, FALSE);
ar_method=arsmethods;
best_model=list("AIC"=1e6,"OLS"=NA,"period"=NA,"filter"=NA,"armethod"=NA); #AIC=1e6 is a "worse than anything" sentinel
for (p1 in is_filter)
{
for (p2 in ar_method)
{
period=get_period(per.x=eva.x,per.dt_y=eva.dt_y,per.delta=eva.delta,per.pID=eva.pID,is_filter=p1,ar_method=p2);
#keep only periods inside the accepted window [T_start, T_end]
period=period[period >= T_start & period <= T_end];
if (!length(period)) #no candidate survived => fall back to the default period
{
p2="default";
period=T_default;
}
if (length(period)==1) #get_period may return a single NA (failed estimation)
{
if (is.na(period))
{
p2="default";
period=T_default;
}
}
m=harmonic_regression(har.x=eva.x,har.dt_y=eva.dt_y,period=period);
aic=m$aic; #model selection by aic
if (aic <= best_model$AIC) #'<=' means later ties overwrite earlier ones
{
best_model$AIC=aic;
best_model$OLS=m;
best_model$period=period;
best_model$filter=p1;
best_model$armethod=p2;
}
}
}
##-----------------------
m=best_model$OLS; #record the best model parameters
self.estimate_period = best_model$period;
self.amplitude=NA; #NA sentinel heads; stripped after the loop below
self.phase=NA;
for (i in 1:length(self.estimate_period) )
{
#coefficients are laid out as (const, cosT1, sinT1, cosT2, sinT2, ...), so
#b[2*i] is the cosine (A) and b[2*i+1] the sine (B) coefficient of period i;
#phi = Arg(A - iB) is the phase angle of the fitted cosine wave, in (-pi, pi]
phi=Arg(complex(real = m$b[2*i],imaginary = -m$b[2*i+1]));
if (phi <= 1e-6) { #map the angle to a peak time within one period
self.phase=c(self.phase,abs(phi)/(2*pi)*self.estimate_period[i]);
} else {
self.phase=c(self.phase,(self.estimate_period[i] - phi/(2*pi)*self.estimate_period[i]) );
}
self.amplitude=c(self.amplitude,sqrt(m$b[2*i]**2 + m$b[2*i+1]**2) ); #amplitude = sqrt(A^2 + B^2)
}
self.amplitude=self.amplitude[2:length(self.amplitude)]; #drop the NA sentinels
self.phase=self.phase[2:length(self.phase)];
self.R2=m$R2;
self.R2adj=m$R2adj;
self.pvalue=m$Fpv; #overall model F-test p-value
py.std=sd(eva.y)*sqrt( (length(eva.y)-1)/length(eva.y) );
self.coef_var=py.std/mean(eva.y); #'py.std' is equal to 'np.std(eva.y)' in python.
##-----------------------
eva.out=list("period"=self.estimate_period,"amplitude"=self.amplitude,"phase"=self.phase,"R2"=self.R2,"R2adj"=self.R2adj,
"coefvar"=self.coef_var,"pvalue"=self.pvalue,"filter"=best_model$filter,"armethod"=best_model$armethod);
return (eva.out);
}
###======================================================================================================================================
runARS <- function(indata,ARStime,minper=20,maxper=28, arsper=24, arsmet="", releaseNote=TRUE, para = FALSE, ncores = 1)
{
## Driver: run the ARSER analysis on every row of 'indata'.
## indata  : data frame/matrix; column 1 = profile ID, remaining columns = values over time
## ARStime : numeric sampling time points; the interval is taken from the first two
##           (assumes even spacing — TODO confirm with callers)
## minper/maxper : accepted period range; arsper : default period on failure
## arsmet  : AR method(s) forwarded to evaluate()
## releaseNote : print start/finish progress messages
## para/ncores : if TRUE, process rows in parallel via parallel::mclapply with 'ncores' cores
## Returns a character matrix, one row per profile: CycID, filter/method used,
## period(s), amplitude(s), phase(s), mean, R2, R2adj, coef_var, pvalue, fdr_BH.
#-----------------------
if (releaseNote) {
cat("The ARS is in process from ",format(Sys.time(), "%X %m-%d-%Y"),"\n");
}
start=minper;
end=maxper;
pvalues <-NA
#-----------------------
self.data=indata;
self.delta=ARStime[2]-ARStime[1]; #sampling interval from the first two time points
time_points=ARStime;
idorder<-dimnames(self.data)[[1]]; #row names define the processing/output order
dataM=as.matrix(self.data[,2:ncol(self.data)]); #numeric expression matrix (IDs stripped)
outID=as.character(self.data[,1]);
names(outID)<-idorder;
dimnames(dataM)<-list("r"=idorder,"c"=paste("T",self.delta*(0:(ncol(dataM)-1)),sep="" ) );
#-----------------------
run_start=proc.time();
#NOTE(review): seeding from elapsed process time makes runs non-reproducible;
#presumably intentional, but verify against the package's reproducibility policy
set.seed(run_start["elapsed"]);
header <- c("filter_type","ar_method","period_number","period","amplitude","phase","mean","R_square","R2_adjust","coef_var","pvalue");
ori.op <- options(); #snapshot options; restored after the main loop
#-----------------------
##try 'apply()' in latter version, it may improve the Computational Efficiency
#analyse a single profile 'y'; returns one character row for the output matrix
mainars <- function(y){
if (sd(y)!=0) { #constant profiles cannot be analysed (zero variance)
#NOTE(review): 'line' is a free variable only defined in the sequential branch
#below; it is never forced because eva.pID goes unused downstream (lazy
#evaluation), but this is fragile — confirm before passing eva.pID anywhere
d=evaluate(eva.x=time_points,eva.y=y,eva.delta=self.delta,eva.pID=line,T_start=start, T_end=end, T_default=arsper,arsmethods=arsmet);
ars.filter=0; #encode the filter flag as 0/1 for the output table
if (d$filter)
{ ars.filter=1; }
ars.period.num=length(d$period);
#multiple periods/amplitudes/phases are collapsed into comma-separated strings
ars.period=paste(d$period,collapse=",");
ars.amplitude=paste(d$amplitude,collapse=",");
ars.phase=paste(d$phase,collapse=",");
ars.out=c(ars.filter,d$armethod,ars.period.num,ars.period,ars.amplitude,ars.phase,mean(y),d$R2,d$R2adj,d$coefvar,d$pvalue);
#NOTE(review): this modifies a local copy of 'pvalues' only (no '<<-'); the
#real p-values are re-read from the output matrix after the loop, so this
#line is effectively dead code
pvalues=c(pvalues,d$pvalue);
} else {
#constant profile: NA placeholders, period_number 0, p-value 1
ars.out=c(rep(NA,4),0,NA,mean(y),rep(NA,3),1);
pvalues=c(pvalues,1); #assign it as '1' insead of 'NA' for avoiding error report in 'pi0.est' step
}
return(ars.out)
}
if(para){ #parallel branch: one mclapply task per profile
tmpout <- parallel::mclapply(idorder,function(i) mainars(dataM[i,]), mc.cores = ncores)
ars.outM = do.call(rbind,tmpout)
}else{ #sequential branch: grow the output matrix row by row
ars.outM <- matrix(ncol=11,nrow=0)
for (line in idorder )
{
ars.out <- mainars(dataM[line,])
ars.outM=rbind(ars.outM,ars.out);
}
}
colnames(ars.outM) <- header
pvalues=ars.outM[,"pvalue"] #authoritative p-values, taken from the result matrix
options(ori.op);
dimnames(ars.outM)[[1]]=idorder;
names(pvalues)=idorder;
#-----------------------
#header=c("CirID",header,"qvalue","fdr_BH"); ## qvalue is not used in the new version of ARSER
# pi0=pi0.est(pvalues); #It will report error when analyzing expression profile with equal values among all time points.
# if (pi0$p0 == 0) #p0 is required for calculating qvalues by qvalue.cal(); p0 will define the largest q-value calculated by qvalue.cal
# { pi0$p0=0.95; } #If p0 is '0', the calculated q-value is '0' for all pvalues, thus change it to 0.95 is a stategy to reduce false positive?
header=c("CycID",header,"fdr_BH");
#multiple-testing correction: Benjamini-Hochberg FDR over all profiles
qvalues_BH=p.adjust(pvalues[idorder],"BH");
ARSoutM=cbind(outID[idorder],ars.outM[idorder,],
as.character(qvalues_BH)); # convert to char here keeps more precision
ARSoutM <- as.matrix(ARSoutM)
dimnames(ARSoutM)<-list("r"=idorder,"c"=header);
if (releaseNote) {
cat("The analysis by ARS is finished at ",format(Sys.time(), "%X %m-%d-%Y"),"\n");
}
return(ARSoutM);
}
###======================================================================================================================================