From 050d61d85d413b6da8b4a1087dbb8d92d32e67f7 Mon Sep 17 00:00:00 2001
From: ejolly
Date: Sat, 9 Dec 2017 02:17:39 -0500
Subject: [PATCH] Refactored Brain_Data regression as separate stats function.
 Refactored robust estimators.

---
 nltools/data/brain_data.py | 143 +++---------------------
 nltools/stats.py           | 223 ++++++++++++++++++++++++++++---------
 2 files changed, 190 insertions(+), 176 deletions(-)

diff --git a/nltools/data/brain_data.py b/nltools/data/brain_data.py
index 69d5f2a7..29082be6 100644
--- a/nltools/data/brain_data.py
+++ b/nltools/data/brain_data.py
@@ -67,10 +67,8 @@
                     zscore,
                     make_cosine_basis,
                     transform_pairwise,
-                    _robust_estimator_hc0,
-                    _robust_estimator_hc3,
-                    _robust_estimator_hac,
-                    summarize_bootstrap)
+                    summarize_bootstrap,
+                    regress)
 from nltools.pbs_job import PBS_Job
 from .adjacency import Adjacency
 from nltools.prefs import MNI_Template, resolve_mni_path
@@ -364,17 +362,20 @@ def plot(self, limit=5, anatomical=None, **kwargs):
                     draw_cross=False,
                     **kwargs)

-    def regress(self,robust=False,nlags=1):
-        """ Run vectorized OLS regression across voxels with optional robust estimation of standard errors (i.e. sandwich estimators). Robust estimators do not change values of beta coefficients.
+    def regress(self,mode='ols',**kwargs):
+        """ Run a mass-univariate regression across voxels. Three types of regression can be run:
+        1) Standard OLS (default)
+        2) Robust OLS (heteroscedasticity and/or auto-correlation robust errors), i.e. OLS with "sandwich estimators"
+        3) ARMA (auto-regressive and moving-average lags = 1 by default)

-        3 robust estimators are implemented:
-        'hc0': original Huber (1980) sandwich estimator
-        'hc3': more robust MacKinnon & White (1985) estimator, better for smaller samples
-        'hac': hc0 + robustness to serial auto-correlation Newey & West (1987)
+        For more information see the help for nltools.stats.regress
+
+        ARMA notes: This mode is similar to AFNI's 3dREMLfit but without spatial smoothing of voxel auto-correlation estimates. It can be **very computationally intensive**, so parallelization is used by default to speed things up. Speed is limited because a unique ARMA model is fit at *each voxel* (as in AFNI/FSL), unlike SPM, which assumes the same AR parameters (~0.2) at every voxel. While coefficient results are typically very similar to OLS, std-errors, and therefore t-stats, dfs, and p-vals, can differ greatly depending on how much auto-correlation explains the response in a voxel
+        relative to the other regressors in the design matrix.
Args: - robust (str): Estimate heteroscedasticity-consistent standard errors; 'HC0', 'HC3','HAC'; default is False - nlags (int): lag to use for HAC estimator, ignored otherwise; default is 1 + mode (str): kind of model to fit; must be one of 'ols' (default), 'robust', or 'arma' + kwargs (dict): keyword arguments to nltools.stats.regress Returns: out: dictionary of regression statistics in Brain_Data instances @@ -392,39 +393,15 @@ def regress(self,robust=False,nlags=1): raise ValueError("self.X does not match the correct size of " "self.data") - b = np.dot(np.linalg.pinv(self.X), self.data) - res = self.data - np.dot(self.X, b) - - if robust: - # Robust estimators are in essence just variations on the theme of sandwich estimators, scaling or modifying the residuals calculation by some iterative factor - bread = np.linalg.pinv(np.dot(self.X.T,self.X)) - if robust == 'hc0': - axis_func = [_robust_estimator_hc0,0,res,self.X,bread] - elif robust == 'hc3': - axis_func = [_robust_estimator_hc3,0,res,self.X,bread] - elif robust == 'hac': - axis_func = [_robust_estimator_hac,0,res,self.X,bread,nlags] - else: - raise ValueError("Robust standard error must be one of hc0, hc3, or hac!") - - stderr = np.apply_along_axis(*axis_func) - - else: - sigma = np.std(res, axis=0, ddof=self.X.shape[1]) - stderr = np.sqrt(np.diag(np.linalg.pinv(np.dot(self.X.T,self.X))))[:,np.newaxis] * sigma[np.newaxis,:] - - # stderr = np.dot(np.matrix(np.diagonal(np.linalg.inv(np.dot(self.X.T, - # self.X)))**.5).T, np.matrix(sigma)) + b,t,p,df,res = regress(self.X,self.data,mode=mode,**kwargs) + sigma = np.std(res,axis=0,ddof=self.X.shape[1]) b_out = deepcopy(self) b_out.data = b t_out = deepcopy(self) - t_out.data = b / stderr - df = np.array([self.X.shape[0]-self.X.shape[1]] * t_out.data.shape[1]) + t_out.data = t p_out = deepcopy(self) - p_out.data = 2*(1-t.cdf(np.abs(t_out.data), df)) - - # Might want to not output this info + p_out.data = p df_out = deepcopy(self) df_out.data = df sigma_out = deepcopy(self) @@ -435,92 +412,6 @@ def regress(self,robust=False,nlags=1): return {'beta': b_out, 't': t_out, 'p': p_out, 'df': df_out, 'sigma': sigma_out, 'residual': res_out} - - def regress_arma(self,method='css-mle',n_jobs=-1,backend='threading',max_nbytes=1e8,verbose=True): - - """ - Fit an ARMA(1,1) model instead of standard OLS to all voxels. This method is very similar to - AFNI's 3dREMLfit, but does not employ spatial smoothing of voxel auto-correlation estimates. - - Coefficient results are typically identitical though vary a little because MLE vs closed-form - OLS can differ based on rounding and optimizer search settings. T-stats, std-errors, and p-vals - can differ greatly depending on how much auto-correlation is explaining the response in a voxel - relative to other regressors specified in design matrix. - - Current implementation is VERY COMPUTATIONALLY INTENSIVE as it relies on statsmodels - implementation which is slow. That's because this implementation is an optimization - problem AT EVERY VOXEL, similar to AFNI/FSL, but unlike SPM, which assumes same - AR param (~0.2) at each voxel. - - Parallelization is employed by default to speed up computation time, and input arguments - control how parallelization is handled by default. 
- - Args: - method (str): which loglikelihood to maximize: 'css' conditional sum of squares (fastest - but potentially least accurate), 'mle' exact maximum likelihood, (potentially - most accurate but probably overkill); 'css-mle' (default), like mle but with - starting values given by css solution first - n_jobs (int): how many cores or threads to use; default is all cores or 10 threads - backend (str): 'threading' (default) or 'multiprocessing'. Threading shares memory but utilizes a - single-core whereas multiprocessing can greatly increase memory usage but utilize - multiple cores. Multiprocessing is automatically configured to memory-map - (i.e. write to disk) in memory arrays greaters than 100mb to prevent hanging - and crashing. - max_nbytes (int): in-memory variable size limit before using memory-mapping, i.e. writing to disk - ignored if backend is 'threading' - verbose (bool): print voxel-wise progress statement - - Returns: - out: same output as Brain_Data.regress() - - - """ - - from statsmodels.tsa.arima_model import ARMA - from joblib import Parallel, delayed - - def _arma_func(self,voxid,method=method): - - """ - Fit an ARMA(1,1) model at a specific voxel. - """ - - model = ARMA(endog=self.data[:,voxid],exog=self.X.values,order=(1,1)) - res = model.fit(trend='nc',method=method,transparams=False, - maxiter=50,disp=-1,start_ar_lags=2) - - return (res.params[:-2], res.tvalues[:-2],res.pvalues[:-2],res.df_resid, res.resid) - - # Parallelize - if backend == 'threading' and n_jobs == -1: - n_jobs = 10 - par_for = Parallel(n_jobs=n_jobs,verbose=5,backend=backend,max_nbytes=max_nbytes) - out_arma = par_for(delayed(_arma_func)(self=self,voxid=i) for i in range(self.shape()[1])) - - # Return results in Brain_Data format consistent with .regress() - b_dat = np.column_stack([elem[0] for elem in out_arma]) - t_dat = np.column_stack([elem[1] for elem in out_arma]) - p_dat = np.column_stack([elem[2] for elem in out_arma]) - df_dat = np.array([elem[3] for elem in out_arma]) - res_dat = np.column_stack([elem[4] for elem in out_arma]) - - sigma = np.std(res_dat,axis=0,ddof=self.X.shape[1]) - sigma_out = deepcopy(self) - sigma_out.data = sigma - b_out = deepcopy(self) - b_out.data = b_dat - t_out = deepcopy(self) - t_out.data = t_dat - p_out = deepcopy(self) - p_out.data = p_dat - df_out = deepcopy(self) - df_out.data = df_dat - res_out = deepcopy(self) - res_out.data = res_dat - - return {'beta': b_out, 't': t_out, 'p': p_out, 'df': df_out, - 'sigma': sigma_out, 'residual': res_out} - def ttest(self, threshold_dict=None): """ Calculate one sample t-test across each voxel (two-sided) diff --git a/nltools/stats.py b/nltools/stats.py index f4bfe408..252d548a 100644 --- a/nltools/stats.py +++ b/nltools/stats.py @@ -24,10 +24,9 @@ 'two_sample_permutation', 'correlation_permutation', 'make_cosine_basis', - '_robust_estimator_hc0', - '_robust_estimator_hc3', - '_robust_estimator_hac', - 'summarize_bootstrap'] + '_robust_estimator', + 'summarize_bootstrap', + 'regress'] import numpy as np import pandas as pd @@ -38,7 +37,12 @@ import warnings import itertools from joblib import Parallel, delayed +import six +from nltools.utils import attempt_to_import +# Optional dependencies +ARMA = attempt_to_import('statsmodels.tsa.arima_model',name='ARMA', + fromlist=['ARMA']) def pearson(x, y): """ Correlates row vector x with each row vector in 2D array y. 
From neurosynth.stats.py - author: Tal Yarkoni
@@ -581,76 +585,66 @@ def transform_pairwise(X, y):
     X_new[-1] = - X_new[-1]
     return np.asarray(X_new), np.asarray(y_new).ravel()

-def _robust_estimator_hc0(vals,X,bread):
+def _robust_estimator(vals,X,robust_estimator='hc0',nlags=1):
     """
-    Huber (1980) sandwich estimator to return robust standard error estimates.
+    Computes robust sandwich estimators for standard errors used in OLS computation. Types include:
+    'hc0': Huber (1980) sandwich estimator, robust to heteroscedasticity.
+    'hc3': MacKinnon and White (1985) HC3 sandwich estimator. Provides more robustness in smaller samples than HC0 (Long & Ervin, 2000).
+    'hac': Newey-West (1987) estimator, robust to heteroscedasticity as well as serial auto-correlation at given lags.
+
     Refs: https://www.wikiwand.com/en/Heteroscedasticity-consistent_standard_errors
     https://github.com/statsmodels/statsmodels/blob/master/statsmodels/regression/linear_model.py
     https://cran.r-project.org/web/packages/sandwich/vignettes/sandwich.pdf
+    https://www.stata.com/manuals13/tsnewey.pdf

     Args:
         vals (np.ndarray): 1d array of residuals
         X (np.ndarray): design matrix used in OLS, e.g. Brain_Data().X
-        bread (np.ndarray): result of (X.T * X)^-1
-    Returns:
-        stderr (np.ndarray): 1d array of standard errors with length == X.shape[1]
-    """
-
-    V = np.diag(vals**2)
-    meat = np.dot(np.dot(X.T,V),X)
-    vcv = np.dot(np.dot(bread,meat),bread)
-    return np.sqrt(np.diag(vcv))
-
-def _robust_estimator_hc3(vals,X,bread):
-    """
-    MacKinnon and White (1985) HC3 sandwich estimator. Provides more robustness in smaller samples than HC0 Long & Ervin (2000)
-    Refs: https://cran.r-project.org/web/packages/sandwich/vignettes/sandwich.pdf
+        robust_estimator (str): estimator type, 'hc0' (default), 'hc3', or 'hac'
+        nlags (int): number of lags, only used with 'hac' estimator, default is 1

-    Args:
-        vals (np.ndarray): 1d array of residuals
-        X (np.ndarray): design matrix used in OLS, e.g. Brain_Data().X
-        bread (np.ndarray): result of (X.T * X)^-1
     Returns:
         stderr (np.ndarray): 1d array of standard errors with length == X.shape[1]
     """
-    V = np.diag(vals**2)/(1-np.diag(np.dot(X,np.dot(bread,X.T))))**2
-    meat = np.dot(np.dot(X.T,V),X)
-    vcv = np.dot(np.dot(bread,meat),bread)
-    return np.sqrt(np.diag(vcv))
+    assert robust_estimator in ['hc0','hc3','hac'], "robust_estimator must be one of hc0, hc3 or hac"

-def _robust_estimator_hac(vals,X,bread,nlags=1):
-    """
-    Newey-West (1987) estimator for robustness to heteroscedasticity as well as serial auto-correlation at given lags.
-    Refs: https://www.stata.com/manuals13/tsnewey.pdf
+    # Make a sandwich!
+    # First we need bread
+    bread = np.linalg.pinv(np.dot(X.T,X))

-    Args:
-        vals (np.ndarray): 1d array of residuals
-        X (np.ndarray): design matrix used in OLS, e.g. Brain_Data().X
-        bread (np.ndarray): result of (X.T * X)^-1
-        lag (int): what lag to estimate; default is 1
-    Returns:
-        stderr (np.ndarray): 1d array of standard errors with length == X.shape[1]
-    """
+    # Then we need meat
+    if robust_estimator == 'hc0':
+        V = np.diag(vals**2)
+        meat = np.dot(np.dot(X.T,V),X)

-    weights = 1 - np.arange(nlags+1.)/(nlags+1.)
+    elif robust_estimator == 'hc3':
+        V = np.diag(vals**2)/(1-np.diag(np.dot(X,np.dot(bread,X.T))))**2
+        meat = np.dot(np.dot(X.T,V),X)

-    #First compute lag 0
-    V = np.diag(vals**2)
-    meat = weights[0] * np.dot(np.dot(X.T,V),X)
+    elif robust_estimator == 'hac':
+        weights = 1 - np.arange(nlags+1.)/(nlags+1.)
-    #Now loop over additional lags
-    for l in range(1, nlags+1):
+        #First compute lag 0
+        V = np.diag(vals**2)
+        meat = weights[0] * np.dot(np.dot(X.T,V),X)

-        V = np.diag(vals[l:] * vals[:-l])
-        meat_1 = np.dot(np.dot(X[l:].T,V),X[:-l])
-        meat_2 = np.dot(np.dot(X[:-l].T,V),X[l:])
+        #Now loop over additional lags
+        for l in range(1, nlags+1):

-        meat += weights[l] * (meat_1 + meat_2)
+            V = np.diag(vals[l:] * vals[:-l])
+            meat_1 = np.dot(np.dot(X[l:].T,V),X[:-l])
+            meat_2 = np.dot(np.dot(X[:-l].T,V),X[l:])
+            meat += weights[l] * (meat_1 + meat_2)
+
+    # Then we make a sandwich
     vcv = np.dot(np.dot(bread,meat),bread)
+
     return np.sqrt(np.diag(vcv))
+
+
 def summarize_bootstrap(data, save_weights=False):
     """ Calculate summary of bootstrap samples
@@ -675,3 +669,132 @@ def summarize_bootstrap(data, save_weights=False):
     if save_weights:
         output['samples'] = data
     return output
+
+def regress(X,Y,mode='ols',**kwargs):
+    """ This is a flexible function to run several types of regression models given X and Y numpy arrays. Y can be a 1d or 2d numpy array. In the latter case, a separate regression model is fit to each column of Y, so outputs gain a final dimension of length Y.shape[1].
+
+    Does NOT add an intercept automatically to the X matrix before fitting, unlike some other software packages. This is left up to the user.
+
+    This function can compute regression in 3 ways:
+    1) Standard OLS
+    2) OLS with robust sandwich estimators for standard errors. 3 types of robust estimators exist:
+        1) 'hc0' - classic Huber-White estimator robust to heteroscedasticity (default)
+        2) 'hc3' - a variant on the Huber-White estimator that is slightly more conservative when sample sizes are small
+        3) 'hac' - an estimator robust to both heteroscedasticity and auto-correlation; the auto-correlation lag can be controlled with the 'nlags' keyword argument; default is 1
+    3) ARMA (auto-regressive moving-average) model. This model is fit through statsmodels.tsa.arima_model.ARMA, so more information about options can be found there. Any settings can be passed in as kwargs. By default fits a (1,1) model with starting lags of 2. This mode is **computationally intensive** and can take quite a while if Y has many columns. If Y is a 2d array, joblib.Parallel is used for faster fitting by parallelizing fits across columns of Y. Parallelization can be controlled by passing in kwargs. Defaults to multi-threading using 10 separate threads, as threads don't require large arrays to be duplicated in memory. Defaults are also set to enable memory-mapping for very large arrays if backend='multiprocessing' to prevent crashes and hangs. Various levels of progress can be monitored using the 'disp' (statsmodels) and 'verbose' (joblib) keyword arguments with integer values > 0.
+
+    Args:
+        X (ndarray): design matrix; assumes intercept is included
+        Y (ndarray): dependent variable array; if 2d, a model is fit to each column of Y separately
+        mode (str): kind of model to fit; must be one of 'ols' (default), 'robust', or 'arma'
+        robust_estimator (str,optional): kind of robust estimator to use if mode = 'robust'; default 'hc0'
+        nlags (int,optional): auto-correlation lag correction if mode = 'robust' and robust_estimator = 'hac'; default 1
+        order (tuple,optional): auto-regressive and moving-average orders for mode = 'arma'; default (1,1)
+        kwargs (dict): additional keyword arguments to statsmodels.tsa.arima_model.ARMA and joblib.Parallel
+
+    Returns:
+        b: coefficients
+        t: t-statistics (coef/stderr)
+        p: p-values
+        df: degrees of freedom
+        res: residuals
+
+    Examples:
+        Standard OLS
+
+        >>> results = regress(X,Y,mode='ols')
+
+        Robust OLS with heteroscedasticity (hc0) robust standard errors
+
+        >>> results = regress(X,Y,mode='robust')
+
+        Robust OLS with heteroscedasticity and auto-correlation (with lag 2) robust standard errors
+
+        >>> results = regress(X,Y,mode='robust',robust_estimator='hac',nlags=2)
+
+        ARMA model with auto-regressive and moving-average lags = 1
+
+        >>> results = regress(X,Y,mode='arma',order=(1,1))
+
+        ARMA model with auto-regressive lag = 2, moving-average lag = 3, and multi-processing instead of multi-threading using 8 cores (this can use a lot of memory if input arrays are very large!)
+
+        >>> results = regress(X,Y,mode='arma',order=(2,3),backend='multiprocessing',n_jobs=8)
+
+    """
+
+    if not isinstance(mode,six.string_types):
+        raise ValueError('mode must be a string')
+
+    assert mode in ['ols','robust','arma'], "Mode must be one of 'ols','robust' or 'arma'"
+
+    # Compute standard errors based on regression mode
+    if mode == 'ols' or mode == 'robust':
+
+        b = np.dot(np.linalg.pinv(X),Y)
+        res = Y - np.dot(X,b)
+
+        # Vanilla OLS
+        if mode == 'ols':
+            sigma = np.std(res,axis=0,ddof=X.shape[1])
+            stderr = np.sqrt(np.diag(np.linalg.pinv(np.dot(X.T,X))))[:,np.newaxis] * sigma[np.newaxis,:]
+
+        # OLS with robust sandwich estimator based standard-errors
+        elif mode == 'robust':
+            robust_estimator = kwargs.pop('robust_estimator','hc0')
+            nlags = kwargs.pop('nlags',1)
+            axis_func = [_robust_estimator,0,res,X,robust_estimator,nlags]
+            stderr = np.apply_along_axis(*axis_func)
+
+        from scipy.stats import t as t_dist  # local alias because `t` below holds the t-stat array
+        t = b / stderr
+        df = np.array([X.shape[0]-X.shape[1]] * t.shape[1])
+        p = 2*(1-t_dist.cdf(np.abs(t),df))
+
+    # ARMA regression
+    elif mode == 'arma':
+        n_jobs = kwargs.pop('n_jobs',-1)
+        backend = kwargs.pop('backend','threading')
+        max_nbytes = kwargs.pop('max_nbytes',1e8)
+        verbose = kwargs.pop('verbose',0)
+
+        # Use the enclosing function's scope to access X and Y
+        def _arma_func(idx=None,**kwargs):
+
+            """
+            Fit an ARMA(p,q) model. If Y is a matrix and not a vector, expects an idx argument that refers to columns of Y.
+            """
+            method = kwargs.pop('method','css-mle')
+            order = kwargs.pop('order',(1,1))
+
+            maxiter = kwargs.pop('maxiter',50)
+            disp = kwargs.pop('disp',-1)
+            start_ar_lags = kwargs.pop('start_ar_lags',order[0]+1)
+            transparams = kwargs.pop('transparams',False)
+            trend = kwargs.pop('trend','nc')
+
+            if len(Y.shape) == 2:
+                model = ARMA(endog=Y[:,idx],exog=X,order=order)
+            else:
+                model = ARMA(endog=Y,exog=X,order=order)
+            res = model.fit(trend=trend,method=method,transparams=transparams,
+                            maxiter=maxiter,disp=disp,start_ar_lags=start_ar_lags)
+
+            return (res.params[:-2], res.tvalues[:-2], res.pvalues[:-2], res.df_resid, res.resid)
+
+        # Parallelize if Y contains more than one column
+        if len(Y.shape) == 2:
+            if backend == 'threading' and n_jobs == -1:
+                n_jobs = 10
+            par_for = Parallel(n_jobs=n_jobs,verbose=verbose,backend=backend,max_nbytes=max_nbytes)
+            out_arma = par_for(delayed(_arma_func)(idx=i,**kwargs) for i in range(Y.shape[-1]))
+
+            b = np.column_stack([elem[0] for elem in out_arma])
+            t = np.column_stack([elem[1] for elem in out_arma])
+            p = np.column_stack([elem[2] for elem in out_arma])
+            df = np.array([elem[3] for elem in out_arma])
+            res = np.column_stack([elem[4] for elem in out_arma])
+
+        else:
+            b,t,p,df,res = _arma_func(**kwargs)
+
+    return b, t, p, df, res
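
Usage sketch for the new nltools.stats.regress interface. The X and Y arrays below are synthetic and made up purely for illustration; note that regress() does not add an intercept, so one is appended to the design matrix manually.

import numpy as np
from nltools.stats import regress

np.random.seed(0)
n_obs, n_pred, n_vox = 100, 3, 10

# regress() does NOT add an intercept, so append one to the design matrix
X = np.hstack([np.random.randn(n_obs, n_pred), np.ones((n_obs, 1))])
Y = np.random.randn(n_obs, n_vox)

# Standard OLS fit to all 10 columns of Y at once
b, t, p, df, res = regress(X, Y, mode='ols')

# Same fit with HC3 sandwich standard errors (coefficients are unchanged)
b, t, p, df, res = regress(X, Y, mode='robust', robust_estimator='hc3')

# HAC standard errors with a lag-2 auto-correlation correction
b, t, p, df, res = regress(X, Y, mode='robust', robust_estimator='hac', nlags=2)

# ARMA(1,1) fit per column of Y; slow, and requires statsmodels
# b, t, p, df, res = regress(X, Y, mode='arma', order=(1, 1))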
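The sandwich algebra in _robust_estimator is easy to sanity-check in isolation. Below is a self-contained HC0 sketch on synthetic heteroscedastic data, mirroring the bread/meat steps above; the numbers are made up for illustration.

import numpy as np

np.random.seed(0)
n, k = 100, 3
X = np.hstack([np.random.randn(n, k - 1), np.ones((n, 1))])
beta = np.array([0.5, -0.25, 1.0])
# Noise variance grows with the first regressor -> heteroscedasticity
y = np.dot(X, beta) + np.random.randn(n) * (1 + np.abs(X[:, 0]))

b = np.dot(np.linalg.pinv(X), y)
resid = y - np.dot(X, b)

bread = np.linalg.pinv(np.dot(X.T, X))      # (X'X)^-1
V = np.diag(resid**2)                       # squared residuals on the diagonal
meat = np.dot(np.dot(X.T, V), X)            # X' diag(e^2) X
vcv = np.dot(np.dot(bread, meat), bread)    # the sandwich: bread * meat * bread
hc0_se = np.sqrt(np.diag(vcv))              # one robust standard error per column of X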
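On the Brain_Data side the call is unchanged apart from the new mode argument. A sketch, where 'run1.nii.gz' and design_matrix are hypothetical stand-ins for a real 4D nifti file and a pandas DataFrame of regressors (including an intercept column):

from nltools.data import Brain_Data

dat = Brain_Data('run1.nii.gz')   # hypothetical 4D nifti file
dat.X = design_matrix             # hypothetical DataFrame of regressors incl. intercept

out = dat.regress(mode='robust', robust_estimator='hac', nlags=2)
out['t'].plot()                   # each value in `out` is itself a Brain_Data instance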