Skip to content

Commit

Permalink
Refactored Brain_Data regression as separate stats function. Refactor…
Browse files Browse the repository at this point in the history
…ed robust estimators.
  • Loading branch information
ejolly committed Dec 9, 2017
1 parent 32415d1 commit 050d61d
Show file tree
Hide file tree
Showing 2 changed files with 189 additions and 176 deletions.
143 changes: 17 additions & 126 deletions nltools/data/brain_data.py
Expand Up @@ -67,10 +67,8 @@
zscore,
make_cosine_basis,
transform_pairwise,
_robust_estimator_hc0,
_robust_estimator_hc3,
_robust_estimator_hac,
summarize_bootstrap)
summarize_bootstrap,
regress)
from nltools.pbs_job import PBS_Job
from .adjacency import Adjacency
from nltools.prefs import MNI_Template, resolve_mni_path
Expand Down Expand Up @@ -364,17 +362,20 @@ def plot(self, limit=5, anatomical=None, **kwargs):
draw_cross=False,
**kwargs)

def regress(self,robust=False,nlags=1):
""" Run vectorized OLS regression across voxels with optional robust estimation of standard errors (i.e. sandwich estimators). Robust estimators do not change values of beta coefficients.
def regress(self,mode='ols',**kwargs):
""" Run a mass-univariate regression across voxels. Three types of regressions can be run:
1) Standard OLS (default)
2) Robust OLS (heteroscedasticty and/or auto-correlation robust errors), i.e. OLS with "sandwich estimators"
3) ARMA (auto-regressive and moving-average lags = 1 by default)
3 robust estimators are implemented:
'hc0': original Huber (1980) sandwich estimator
'hc3': more robust MacKinnon & White (1985) estimator, better for smaller samples
'hac': hc0 + robustness to serial auto-correlation Newey & West (1987)
For more information see the help for nltools.stats.regress
ARMA notes: This mode is similar to AFNI's 3dREMLFit but without spatial smoothing of voxel auto-correlation estimates. It can be **very computationally intensive** so parallelization is used by default to try to speed things up. Speed is limited because a unique ARMA model is fit to *each voxel* (like AFNI/FSL), but unlike SPM, which assumes the same AR parameters (~0.2) at each voxel. While coefficient results are typically very similar to OLS, std-errors and so t-stats, dfs and and p-vals can differ greatly depending on how much auto-correlation is explaining the response in a voxel
relative to other regressors in the design matrix.
Args:
robust (str): Estimate heteroscedasticity-consistent standard errors; 'HC0', 'HC3','HAC'; default is False
nlags (int): lag to use for HAC estimator, ignored otherwise; default is 1
mode (str): kind of model to fit; must be one of 'ols' (default), 'robust', or 'arma'
kwargs (dict): keyword arguments to nltools.stats.regress
Returns:
out: dictionary of regression statistics in Brain_Data instances
Expand All @@ -392,39 +393,15 @@ def regress(self,robust=False,nlags=1):
raise ValueError("self.X does not match the correct size of "
"self.data")

b = np.dot(np.linalg.pinv(self.X), self.data)
res = self.data - np.dot(self.X, b)

if robust:
# Robust estimators are in essence just variations on the theme of sandwich estimators, scaling or modifying the residuals calculation by some iterative factor
bread = np.linalg.pinv(np.dot(self.X.T,self.X))
if robust == 'hc0':
axis_func = [_robust_estimator_hc0,0,res,self.X,bread]
elif robust == 'hc3':
axis_func = [_robust_estimator_hc3,0,res,self.X,bread]
elif robust == 'hac':
axis_func = [_robust_estimator_hac,0,res,self.X,bread,nlags]
else:
raise ValueError("Robust standard error must be one of hc0, hc3, or hac!")

stderr = np.apply_along_axis(*axis_func)

else:
sigma = np.std(res, axis=0, ddof=self.X.shape[1])
stderr = np.sqrt(np.diag(np.linalg.pinv(np.dot(self.X.T,self.X))))[:,np.newaxis] * sigma[np.newaxis,:]

# stderr = np.dot(np.matrix(np.diagonal(np.linalg.inv(np.dot(self.X.T,
# self.X)))**.5).T, np.matrix(sigma))
b,t,p,df,res = regress(self.X,self.data,mode=mode,**kwargs)
sigma = np.std(res,axis=0,ddof=self.X.shape[1])

b_out = deepcopy(self)
b_out.data = b
t_out = deepcopy(self)
t_out.data = b / stderr
df = np.array([self.X.shape[0]-self.X.shape[1]] * t_out.data.shape[1])
t_out.data = t
p_out = deepcopy(self)
p_out.data = 2*(1-t.cdf(np.abs(t_out.data), df))

# Might want to not output this info
p_out.data = p
df_out = deepcopy(self)
df_out.data = df
sigma_out = deepcopy(self)
Expand All @@ -435,92 +412,6 @@ def regress(self,robust=False,nlags=1):
return {'beta': b_out, 't': t_out, 'p': p_out, 'df': df_out,
'sigma': sigma_out, 'residual': res_out}


def regress_arma(self,method='css-mle',n_jobs=-1,backend='threading',max_nbytes=1e8,verbose=True):

"""
Fit an ARMA(1,1) model instead of standard OLS to all voxels. This method is very similar to
AFNI's 3dREMLfit, but does not employ spatial smoothing of voxel auto-correlation estimates.
Coefficient results are typically identitical though vary a little because MLE vs closed-form
OLS can differ based on rounding and optimizer search settings. T-stats, std-errors, and p-vals
can differ greatly depending on how much auto-correlation is explaining the response in a voxel
relative to other regressors specified in design matrix.
Current implementation is VERY COMPUTATIONALLY INTENSIVE as it relies on statsmodels
implementation which is slow. That's because this implementation is an optimization
problem AT EVERY VOXEL, similar to AFNI/FSL, but unlike SPM, which assumes same
AR param (~0.2) at each voxel.
Parallelization is employed by default to speed up computation time, and input arguments
control how parallelization is handled by default.
Args:
method (str): which loglikelihood to maximize: 'css' conditional sum of squares (fastest
but potentially least accurate), 'mle' exact maximum likelihood, (potentially
most accurate but probably overkill); 'css-mle' (default), like mle but with
starting values given by css solution first
n_jobs (int): how many cores or threads to use; default is all cores or 10 threads
backend (str): 'threading' (default) or 'multiprocessing'. Threading shares memory but utilizes a
single-core whereas multiprocessing can greatly increase memory usage but utilize
multiple cores. Multiprocessing is automatically configured to memory-map
(i.e. write to disk) in memory arrays greaters than 100mb to prevent hanging
and crashing.
max_nbytes (int): in-memory variable size limit before using memory-mapping, i.e. writing to disk
ignored if backend is 'threading'
verbose (bool): print voxel-wise progress statement
Returns:
out: same output as Brain_Data.regress()
"""

from statsmodels.tsa.arima_model import ARMA
from joblib import Parallel, delayed

def _arma_func(self,voxid,method=method):

"""
Fit an ARMA(1,1) model at a specific voxel.
"""

model = ARMA(endog=self.data[:,voxid],exog=self.X.values,order=(1,1))
res = model.fit(trend='nc',method=method,transparams=False,
maxiter=50,disp=-1,start_ar_lags=2)

return (res.params[:-2], res.tvalues[:-2],res.pvalues[:-2],res.df_resid, res.resid)

# Parallelize
if backend == 'threading' and n_jobs == -1:
n_jobs = 10
par_for = Parallel(n_jobs=n_jobs,verbose=5,backend=backend,max_nbytes=max_nbytes)
out_arma = par_for(delayed(_arma_func)(self=self,voxid=i) for i in range(self.shape()[1]))

# Return results in Brain_Data format consistent with .regress()
b_dat = np.column_stack([elem[0] for elem in out_arma])
t_dat = np.column_stack([elem[1] for elem in out_arma])
p_dat = np.column_stack([elem[2] for elem in out_arma])
df_dat = np.array([elem[3] for elem in out_arma])
res_dat = np.column_stack([elem[4] for elem in out_arma])

sigma = np.std(res_dat,axis=0,ddof=self.X.shape[1])
sigma_out = deepcopy(self)
sigma_out.data = sigma
b_out = deepcopy(self)
b_out.data = b_dat
t_out = deepcopy(self)
t_out.data = t_dat
p_out = deepcopy(self)
p_out.data = p_dat
df_out = deepcopy(self)
df_out.data = df_dat
res_out = deepcopy(self)
res_out.data = res_dat

return {'beta': b_out, 't': t_out, 'p': p_out, 'df': df_out,
'sigma': sigma_out, 'residual': res_out}

def ttest(self, threshold_dict=None):
""" Calculate one sample t-test across each voxel (two-sided)
Expand Down

0 comments on commit 050d61d

Please sign in to comment.