Refactored Brain_Data regression as separate stats function. Refactor…

…ed robust estimators.
cosanlab · Dec 9, 2017 · 050d61d · 050d61d
1 parent 32415d1
commit 050d61d
Show file tree

Hide file tree

Showing 2 changed files with 189 additions and 176 deletions.
diff --git a/nltools/data/brain_data.py b/nltools/data/brain_data.py
@@ -67,10 +67,8 @@
                            zscore,
                            make_cosine_basis,
                            transform_pairwise,
-                           _robust_estimator_hc0,
-                           _robust_estimator_hc3,
-                           _robust_estimator_hac,
-                           summarize_bootstrap)
+                           summarize_bootstrap,
+                           regress)
 from nltools.pbs_job import PBS_Job
 from .adjacency import Adjacency
 from nltools.prefs import MNI_Template, resolve_mni_path
@@ -364,17 +362,20 @@ def plot(self, limit=5, anatomical=None, **kwargs):
                                    draw_cross=False,
                                    **kwargs)
 
-    def regress(self,robust=False,nlags=1):
-        """ Run vectorized OLS regression across voxels with optional robust estimation of standard errors (i.e. sandwich estimators). Robust estimators do not change values of beta coefficients.
+    def regress(self,mode='ols',**kwargs):
+        """ Run a mass-univariate regression across voxels. Three types of regressions can be run:
+        1) Standard OLS (default)
+        2) Robust OLS (heteroscedasticty and/or auto-correlation robust errors), i.e. OLS with "sandwich estimators"
+        3) ARMA (auto-regressive and moving-average lags = 1 by default)
 
-        3 robust estimators are implemented:
-        'hc0': original Huber (1980) sandwich estimator
-        'hc3': more robust MacKinnon & White (1985) estimator, better for smaller samples
-        'hac': hc0 + robustness to serial auto-correlation Newey & West (1987)
+        For more information see the help for nltools.stats.regress
+
+        ARMA notes: This mode is similar to AFNI's 3dREMLFit but without spatial smoothing of voxel auto-correlation estimates. It can be **very computationally intensive** so parallelization is used by default to try to speed things up. Speed is limited because a unique ARMA model is fit to *each voxel* (like AFNI/FSL), but unlike SPM, which assumes the same AR parameters (~0.2) at each voxel. While coefficient results are typically very similar to OLS, std-errors and so t-stats, dfs and and p-vals can differ greatly depending on how much auto-correlation is explaining the response in a voxel
+        relative to other regressors in the design matrix.
 
         Args:
-            robust (str): Estimate heteroscedasticity-consistent standard errors; 'HC0', 'HC3','HAC'; default is False
-            nlags (int): lag to use for HAC estimator, ignored otherwise; default is 1
+            mode (str): kind of model to fit; must be one of 'ols' (default), 'robust', or 'arma'
+            kwargs (dict): keyword arguments to nltools.stats.regress
 
         Returns:
             out: dictionary of regression statistics in Brain_Data instances
@@ -392,39 +393,15 @@ def regress(self,robust=False,nlags=1):
             raise ValueError("self.X does not match the correct size of "
                              "self.data")
 
-        b = np.dot(np.linalg.pinv(self.X), self.data)
-        res = self.data - np.dot(self.X, b)
-
-        if robust:
-            # Robust estimators are in essence just variations on the theme of sandwich estimators, scaling or modifying the residuals calculation by some iterative factor
-            bread = np.linalg.pinv(np.dot(self.X.T,self.X))
-            if robust == 'hc0':
-                axis_func = [_robust_estimator_hc0,0,res,self.X,bread]
-            elif robust == 'hc3':
-                axis_func = [_robust_estimator_hc3,0,res,self.X,bread]
-            elif robust == 'hac':
-                axis_func = [_robust_estimator_hac,0,res,self.X,bread,nlags]
-            else:
-                raise ValueError("Robust standard error must be one of hc0, hc3, or hac!")
-
-            stderr = np.apply_along_axis(*axis_func)
-
-        else:
-            sigma = np.std(res, axis=0, ddof=self.X.shape[1])
-            stderr = np.sqrt(np.diag(np.linalg.pinv(np.dot(self.X.T,self.X))))[:,np.newaxis] * sigma[np.newaxis,:]
-
-            # stderr = np.dot(np.matrix(np.diagonal(np.linalg.inv(np.dot(self.X.T,
-            #                 self.X)))**.5).T, np.matrix(sigma))
+        b,t,p,df,res = regress(self.X,self.data,mode=mode,**kwargs)
+        sigma = np.std(res,axis=0,ddof=self.X.shape[1])
 
         b_out = deepcopy(self)
         b_out.data = b
         t_out = deepcopy(self)
-        t_out.data = b / stderr
-        df = np.array([self.X.shape[0]-self.X.shape[1]] * t_out.data.shape[1])
+        t_out.data = t
         p_out = deepcopy(self)
-        p_out.data = 2*(1-t.cdf(np.abs(t_out.data), df))
-
-        # Might want to not output this info
+        p_out.data = p
         df_out = deepcopy(self)
         df_out.data = df
         sigma_out = deepcopy(self)
@@ -435,92 +412,6 @@ def regress(self,robust=False,nlags=1):
         return {'beta': b_out, 't': t_out, 'p': p_out, 'df': df_out,
                 'sigma': sigma_out, 'residual': res_out}
 
-
-        def regress_arma(self,method='css-mle',n_jobs=-1,backend='threading',max_nbytes=1e8,verbose=True):
-
-            """
-            Fit an ARMA(1,1) model instead of standard OLS to all voxels. This method is very similar to
-            AFNI's 3dREMLfit, but does not employ spatial smoothing of voxel auto-correlation estimates.
-
-            Coefficient results are typically identitical though vary a little because MLE vs closed-form
-            OLS can differ based on rounding and optimizer search settings. T-stats, std-errors, and p-vals
-            can differ greatly depending on how much auto-correlation is explaining the response in a voxel
-            relative to other regressors specified in design matrix.
-
-            Current implementation is VERY COMPUTATIONALLY INTENSIVE as it relies on statsmodels
-            implementation which is slow. That's because this implementation is an optimization
-            problem AT EVERY VOXEL, similar to AFNI/FSL, but unlike SPM, which assumes same
-            AR param (~0.2) at each voxel.
-
-            Parallelization is employed by default to speed up computation time, and input arguments
-            control how parallelization is handled by default.
-
-            Args:
-                method (str): which loglikelihood to maximize: 'css' conditional sum of squares (fastest
-                              but potentially least accurate), 'mle' exact maximum likelihood, (potentially
-                              most accurate but probably overkill); 'css-mle' (default), like mle but with
-                              starting values given by css solution first
-                n_jobs (int): how many cores or threads to use; default is all cores or 10 threads
-                backend (str): 'threading' (default) or 'multiprocessing'. Threading shares memory but utilizes a
-                                single-core whereas multiprocessing can greatly increase memory usage but utilize
-                                multiple cores. Multiprocessing is automatically configured to memory-map
-                                (i.e. write to disk) in memory arrays greaters than 100mb to prevent hanging
-                                and crashing.
-                max_nbytes (int): in-memory variable size limit before using memory-mapping, i.e. writing to disk
-                                  ignored if backend is 'threading'
-                verbose (bool): print voxel-wise progress statement
-
-            Returns:
-                out: same output as Brain_Data.regress()
-
-
-            """
-
-            from statsmodels.tsa.arima_model import ARMA
-            from joblib import Parallel, delayed
-
-            def _arma_func(self,voxid,method=method):
-
-                """
-                Fit an ARMA(1,1) model at a specific voxel.
-                """
-
-                model = ARMA(endog=self.data[:,voxid],exog=self.X.values,order=(1,1))
-                res = model.fit(trend='nc',method=method,transparams=False,
-                                maxiter=50,disp=-1,start_ar_lags=2)
-
-                return (res.params[:-2], res.tvalues[:-2],res.pvalues[:-2],res.df_resid, res.resid)
-
-            # Parallelize
-            if backend == 'threading' and n_jobs == -1:
-                n_jobs = 10
-            par_for = Parallel(n_jobs=n_jobs,verbose=5,backend=backend,max_nbytes=max_nbytes)
-            out_arma = par_for(delayed(_arma_func)(self=self,voxid=i) for i in range(self.shape()[1]))
-
-            # Return results in Brain_Data format consistent with .regress()
-            b_dat = np.column_stack([elem[0] for elem in out_arma])
-            t_dat = np.column_stack([elem[1] for elem in out_arma])
-            p_dat = np.column_stack([elem[2] for elem in out_arma])
-            df_dat = np.array([elem[3] for elem in out_arma])
-            res_dat = np.column_stack([elem[4] for elem in out_arma])
-
-            sigma = np.std(res_dat,axis=0,ddof=self.X.shape[1])
-            sigma_out = deepcopy(self)
-            sigma_out.data = sigma
-            b_out = deepcopy(self)
-            b_out.data = b_dat
-            t_out = deepcopy(self)
-            t_out.data = t_dat
-            p_out = deepcopy(self)
-            p_out.data = p_dat
-            df_out = deepcopy(self)
-            df_out.data = df_dat
-            res_out = deepcopy(self)
-            res_out.data = res_dat
-
-            return {'beta': b_out, 't': t_out, 'p': p_out, 'df': df_out,
-                        'sigma': sigma_out, 'residual': res_out}
-
     def ttest(self, threshold_dict=None):
         """ Calculate one sample t-test across each voxel (two-sided)