From 050d61d85d413b6da8b4a1087dbb8d92d32e67f7 Mon Sep 17 00:00:00 2001
From: ejolly
Date: Sat, 9 Dec 2017 02:17:39 -0500
Subject: [PATCH] Refactored Brain_Data regression as separate stats function.
 Refactored robust estimators.

---
 nltools/data/brain_data.py | 143 +++---------------------
 nltools/stats.py           | 223 ++++++++++++++++++++++++++++---------
 2 files changed, 190 insertions(+), 176 deletions(-)

diff --git a/nltools/data/brain_data.py b/nltools/data/brain_data.py
index 69d5f2a7..29082be6 100644
--- a/nltools/data/brain_data.py
+++ b/nltools/data/brain_data.py
@@ -67,10 +67,8 @@
                     zscore,
                     make_cosine_basis,
                     transform_pairwise,
-                    _robust_estimator_hc0,
-                    _robust_estimator_hc3,
-                    _robust_estimator_hac,
-                    summarize_bootstrap)
+                    summarize_bootstrap,
+                    regress)
 from nltools.pbs_job import PBS_Job
 from .adjacency import Adjacency
 from nltools.prefs import MNI_Template, resolve_mni_path
@@ -364,17 +362,20 @@ def plot(self, limit=5, anatomical=None, **kwargs):
                     draw_cross=False,
                     **kwargs)

-    def regress(self,robust=False,nlags=1):
-        """ Run vectorized OLS regression across voxels with optional robust estimation of standard errors (i.e. sandwich estimators). Robust estimators do not change values of beta coefficients.
+    def regress(self,mode='ols',**kwargs):
+        """ Run a mass-univariate regression across voxels. Three types of regression can be run:
+        1) Standard OLS (default)
+        2) Robust OLS (heteroscedasticity and/or auto-correlation robust errors), i.e. OLS with "sandwich estimators"
+        3) ARMA (auto-regressive and moving-average lags = 1 by default)

-        3 robust estimators are implemented:
-        'hc0': original Huber (1980) sandwich estimator
-        'hc3': more robust MacKinnon & White (1985) estimator, better for smaller samples
-        'hac': hc0 + robustness to serial auto-correlation Newey & West (1987)
+        For more information see the help for nltools.stats.regress
+
+        ARMA notes: This mode is similar to AFNI's 3dREMLfit but without spatial smoothing of voxel auto-correlation estimates. It can be **very computationally intensive**, so parallelization is used by default to speed things up. Speed is limited because a unique ARMA model is fit at *each voxel* (as in AFNI/FSL), unlike SPM, which assumes the same AR parameters (~0.2) at every voxel. While coefficient results are typically very similar to OLS, std-errors, and therefore t-stats, dfs, and p-vals, can differ greatly depending on how much auto-correlation explains the response in a voxel
+        relative to the other regressors in the design matrix.
Args: - robust (str): Estimate heteroscedasticity-consistent standard errors; 'HC0', 'HC3','HAC'; default is False - nlags (int): lag to use for HAC estimator, ignored otherwise; default is 1 + mode (str): kind of model to fit; must be one of 'ols' (default), 'robust', or 'arma' + kwargs (dict): keyword arguments to nltools.stats.regress Returns: out: dictionary of regression statistics in Brain_Data instances @@ -392,39 +393,15 @@ def regress(self,robust=False,nlags=1): raise ValueError("self.X does not match the correct size of " "self.data") - b = np.dot(np.linalg.pinv(self.X), self.data) - res = self.data - np.dot(self.X, b) - - if robust: - # Robust estimators are in essence just variations on the theme of sandwich estimators, scaling or modifying the residuals calculation by some iterative factor - bread = np.linalg.pinv(np.dot(self.X.T,self.X)) - if robust == 'hc0': - axis_func = [_robust_estimator_hc0,0,res,self.X,bread] - elif robust == 'hc3': - axis_func = [_robust_estimator_hc3,0,res,self.X,bread] - elif robust == 'hac': - axis_func = [_robust_estimator_hac,0,res,self.X,bread,nlags] - else: - raise ValueError("Robust standard error must be one of hc0, hc3, or hac!") - - stderr = np.apply_along_axis(*axis_func) - - else: - sigma = np.std(res, axis=0, ddof=self.X.shape[1]) - stderr = np.sqrt(np.diag(np.linalg.pinv(np.dot(self.X.T,self.X))))[:,np.newaxis] * sigma[np.newaxis,:] - - # stderr = np.dot(np.matrix(np.diagonal(np.linalg.inv(np.dot(self.X.T, - # self.X)))**.5).T, np.matrix(sigma)) + b,t,p,df,res = regress(self.X,self.data,mode=mode,**kwargs) + sigma = np.std(res,axis=0,ddof=self.X.shape[1]) b_out = deepcopy(self) b_out.data = b t_out = deepcopy(self) - t_out.data = b / stderr - df = np.array([self.X.shape[0]-self.X.shape[1]] * t_out.data.shape[1]) + t_out.data = t p_out = deepcopy(self) - p_out.data = 2*(1-t.cdf(np.abs(t_out.data), df)) - - # Might want to not output this info + p_out.data = p df_out = deepcopy(self) df_out.data = df sigma_out = deepcopy(self) @@ -435,92 +412,6 @@ def regress(self,robust=False,nlags=1): return {'beta': b_out, 't': t_out, 'p': p_out, 'df': df_out, 'sigma': sigma_out, 'residual': res_out} - - def regress_arma(self,method='css-mle',n_jobs=-1,backend='threading',max_nbytes=1e8,verbose=True): - - """ - Fit an ARMA(1,1) model instead of standard OLS to all voxels. This method is very similar to - AFNI's 3dREMLfit, but does not employ spatial smoothing of voxel auto-correlation estimates. - - Coefficient results are typically identitical though vary a little because MLE vs closed-form - OLS can differ based on rounding and optimizer search settings. T-stats, std-errors, and p-vals - can differ greatly depending on how much auto-correlation is explaining the response in a voxel - relative to other regressors specified in design matrix. - - Current implementation is VERY COMPUTATIONALLY INTENSIVE as it relies on statsmodels - implementation which is slow. That's because this implementation is an optimization - problem AT EVERY VOXEL, similar to AFNI/FSL, but unlike SPM, which assumes same - AR param (~0.2) at each voxel. - - Parallelization is employed by default to speed up computation time, and input arguments - control how parallelization is handled by default. 
- - Args: - method (str): which loglikelihood to maximize: 'css' conditional sum of squares (fastest - but potentially least accurate), 'mle' exact maximum likelihood, (potentially - most accurate but probably overkill); 'css-mle' (default), like mle but with - starting values given by css solution first - n_jobs (int): how many cores or threads to use; default is all cores or 10 threads - backend (str): 'threading' (default) or 'multiprocessing'. Threading shares memory but utilizes a - single-core whereas multiprocessing can greatly increase memory usage but utilize - multiple cores. Multiprocessing is automatically configured to memory-map - (i.e. write to disk) in memory arrays greaters than 100mb to prevent hanging - and crashing. - max_nbytes (int): in-memory variable size limit before using memory-mapping, i.e. writing to disk - ignored if backend is 'threading' - verbose (bool): print voxel-wise progress statement - - Returns: - out: same output as Brain_Data.regress() - - - """ - - from statsmodels.tsa.arima_model import ARMA - from joblib import Parallel, delayed - - def _arma_func(self,voxid,method=method): - - """ - Fit an ARMA(1,1) model at a specific voxel. - """ - - model = ARMA(endog=self.data[:,voxid],exog=self.X.values,order=(1,1)) - res = model.fit(trend='nc',method=method,transparams=False, - maxiter=50,disp=-1,start_ar_lags=2) - - return (res.params[:-2], res.tvalues[:-2],res.pvalues[:-2],res.df_resid, res.resid) - - # Parallelize - if backend == 'threading' and n_jobs == -1: - n_jobs = 10 - par_for = Parallel(n_jobs=n_jobs,verbose=5,backend=backend,max_nbytes=max_nbytes) - out_arma = par_for(delayed(_arma_func)(self=self,voxid=i) for i in range(self.shape()[1])) - - # Return results in Brain_Data format consistent with .regress() - b_dat = np.column_stack([elem[0] for elem in out_arma]) - t_dat = np.column_stack([elem[1] for elem in out_arma]) - p_dat = np.column_stack([elem[2] for elem in out_arma]) - df_dat = np.array([elem[3] for elem in out_arma]) - res_dat = np.column_stack([elem[4] for elem in out_arma]) - - sigma = np.std(res_dat,axis=0,ddof=self.X.shape[1]) - sigma_out = deepcopy(self) - sigma_out.data = sigma - b_out = deepcopy(self) - b_out.data = b_dat - t_out = deepcopy(self) - t_out.data = t_dat - p_out = deepcopy(self) - p_out.data = p_dat - df_out = deepcopy(self) - df_out.data = df_dat - res_out = deepcopy(self) - res_out.data = res_dat - - return {'beta': b_out, 't': t_out, 'p': p_out, 'df': df_out, - 'sigma': sigma_out, 'residual': res_out} - def ttest(self, threshold_dict=None): """ Calculate one sample t-test across each voxel (two-sided) diff --git a/nltools/stats.py b/nltools/stats.py index f4bfe408..252d548a 100644 --- a/nltools/stats.py +++ b/nltools/stats.py @@ -24,10 +24,9 @@ 'two_sample_permutation', 'correlation_permutation', 'make_cosine_basis', - '_robust_estimator_hc0', - '_robust_estimator_hc3', - '_robust_estimator_hac', - 'summarize_bootstrap'] + '_robust_estimator', + 'summarize_bootstrap', + 'regress'] import numpy as np import pandas as pd @@ -38,7 +37,12 @@ import warnings import itertools from joblib import Parallel, delayed +import six +from nltools.utils import attempt_to_import +# Optional dependencies +ARMA = attempt_to_import('statsmodels.tsa.arima_model',name='ARMA', + fromlist=['ARMA']) def pearson(x, y): """ Correlates row vector x with each row vector in 2D array y. 
From neurosynth.stats.py - author: Tal Yarkoni
@@ -581,76 +585,66 @@ def transform_pairwise(X, y):
     X_new[-1] = - X_new[-1]
     return np.asarray(X_new), np.asarray(y_new).ravel()

-def _robust_estimator_hc0(vals,X,bread):
+def _robust_estimator(vals,X,robust_estimator='hc0',nlags=1):
     """
-    Huber (1980) sandwich estimator to return robust standard error estimates.
+    Computes robust sandwich estimators for standard errors used in OLS computation. Types include:
+    'hc0': Huber (1980) sandwich estimator, robust to heteroscedasticity.
+    'hc3': MacKinnon and White (1985) HC3 sandwich estimator. Provides more robustness in smaller samples than HC0 (Long & Ervin, 2000).
+    'hac': Newey-West (1987) estimator, robust to heteroscedasticity as well as serial auto-correlation at given lags.
+
     Refs: https://www.wikiwand.com/en/Heteroscedasticity-consistent_standard_errors
     https://github.com/statsmodels/statsmodels/blob/master/statsmodels/regression/linear_model.py
     https://cran.r-project.org/web/packages/sandwich/vignettes/sandwich.pdf
+    https://www.stata.com/manuals13/tsnewey.pdf

     Args:
         vals (np.ndarray): 1d array of residuals
         X (np.ndarray): design matrix used in OLS, e.g. Brain_Data().X
-        bread (np.ndarray): result of (X.T * X)^-1
-    Returns:
-        stderr (np.ndarray): 1d array of standard errors with length == X.shape[1]
-    """
-
-    V = np.diag(vals**2)
-    meat = np.dot(np.dot(X.T,V),X)
-    vcv = np.dot(np.dot(bread,meat),bread)
-    return np.sqrt(np.diag(vcv))
-
-def _robust_estimator_hc3(vals,X,bread):
-    """
-    MacKinnon and White (1985) HC3 sandwich estimator. Provides more robustness in smaller samples than HC0 Long & Ervin (2000)
-    Refs: https://cran.r-project.org/web/packages/sandwich/vignettes/sandwich.pdf
+        robust_estimator (str): estimator type, 'hc0' (default), 'hc3', or 'hac'
+        nlags (int): number of lags, only used with 'hac' estimator, default is 1

-    Args:
-        vals (np.ndarray): 1d array of residuals
-        X (np.ndarray): design matrix used in OLS, e.g. Brain_Data().X
-        bread (np.ndarray): result of (X.T * X)^-1
     Returns:
         stderr (np.ndarray): 1d array of standard errors with length == X.shape[1]
     """
-    V = np.diag(vals**2)/(1-np.diag(np.dot(X,np.dot(bread,X.T))))**2
-    meat = np.dot(np.dot(X.T,V),X)
-    vcv = np.dot(np.dot(bread,meat),bread)
-    return np.sqrt(np.diag(vcv))
+    assert robust_estimator in ['hc0','hc3','hac'], "robust_estimator must be one of hc0, hc3 or hac"

-def _robust_estimator_hac(vals,X,bread,nlags=1):
-    """
-    Newey-West (1987) estimator for robustness to heteroscedasticity as well as serial auto-correlation at given lags.
-    Refs: https://www.stata.com/manuals13/tsnewey.pdf
+    # Make a sandwich!
+    # First we need bread
+    bread = np.linalg.pinv(np.dot(X.T,X))

-    Args:
-        vals (np.ndarray): 1d array of residuals
-        X (np.ndarray): design matrix used in OLS, e.g. Brain_Data().X
-        bread (np.ndarray): result of (X.T * X)^-1
-        lag (int): what lag to estimate; default is 1
-    Returns:
-        stderr (np.ndarray): 1d array of standard errors with length == X.shape[1]
-    """
+    # Then we need meat
+    if robust_estimator == 'hc0':
+        V = np.diag(vals**2)
+        meat = np.dot(np.dot(X.T,V),X)

-    weights = 1 - np.arange(nlags+1.)/(nlags+1.)
+    elif robust_estimator == 'hc3':
+        V = np.diag(vals**2)/(1-np.diag(np.dot(X,np.dot(bread,X.T))))**2
+        meat = np.dot(np.dot(X.T,V),X)

-    #First compute lag 0
-    V = np.diag(vals**2)
-    meat = weights[0] * np.dot(np.dot(X.T,V),X)
+    elif robust_estimator == 'hac':
+        weights = 1 - np.arange(nlags+1.)/(nlags+1.)
-    #Now loop over additional lags
-    for l in range(1, nlags+1):
+        #First compute lag 0
+        V = np.diag(vals**2)
+        meat = weights[0] * np.dot(np.dot(X.T,V),X)

-        V = np.diag(vals[l:] * vals[:-l])
-        meat_1 = np.dot(np.dot(X[l:].T,V),X[:-l])
-        meat_2 = np.dot(np.dot(X[:-l].T,V),X[l:])
+        #Now loop over additional lags
+        for l in range(1, nlags+1):

-        meat += weights[l] * (meat_1 + meat_2)
+            V = np.diag(vals[l:] * vals[:-l])
+            meat_1 = np.dot(np.dot(X[l:].T,V),X[:-l])
+            meat_2 = np.dot(np.dot(X[:-l].T,V),X[l:])
+            meat += weights[l] * (meat_1 + meat_2)
+
+    # Then we make a sandwich
     vcv = np.dot(np.dot(bread,meat),bread)
+
     return np.sqrt(np.diag(vcv))
+
+
 def summarize_bootstrap(data, save_weights=False):
     """ Calculate summary of bootstrap samples
@@ -675,3 +669,132 @@ def summarize_bootstrap(data, save_weights=False):
     if save_weights:
         output['samples'] = data
     return output
+
+def regress(X,Y,mode='ols',**kwargs):
+    """ This is a flexible function to run several types of regression models given X and Y numpy arrays. Y can be a 1d or 2d numpy array. In the latter case, a separate regression model is fit to each column of Y, so outputs gain a final dimension of length Y.shape[1].
+
+    Does NOT add an intercept automatically to the X matrix before fitting, unlike some other software packages. This is left up to the user.
+
+    This function can compute regression in 3 ways:
+    1) Standard OLS
+    2) OLS with robust sandwich estimators for standard errors. 3 types of robust estimators exist:
+        1) 'hc0' - classic Huber-White estimator robust to heteroscedasticity (default)
+        2) 'hc3' - a variant on the Huber-White estimator that is slightly more conservative when sample sizes are small
+        3) 'hac' - an estimator robust to both heteroscedasticity and auto-correlation; the auto-correlation lag can be controlled with the 'nlags' keyword argument; default is 1
+    3) ARMA (auto-regressive moving-average) model. This model is fit through statsmodels.tsa.arima_model.ARMA, so more information about options can be found there. Any settings can be passed in as kwargs. By default fits a (1,1) model with starting lags of 2. This mode is **computationally intensive** and can take quite a while if Y has many columns. If Y is a 2d array, joblib.Parallel is used for faster fitting by parallelizing fits across columns of Y. Parallelization can be controlled by passing in kwargs. Defaults to multi-threading using 10 separate threads, as threads don't require large arrays to be duplicated in memory. Defaults are also set to enable memory-mapping for very large arrays if backend='multiprocessing' to prevent crashes and hangs. Various levels of progress can be monitored using the 'disp' (statsmodels) and 'verbose' (joblib) keyword arguments with integer values > 0.
+
+    Args:
+        X (ndarray): design matrix; assumes intercept is included
+        Y (ndarray): dependent variable array; if 2d, a model is fit to each column of Y separately
+        mode (str): kind of model to fit; must be one of 'ols' (default), 'robust', or 'arma'
+        robust_estimator (str,optional): kind of robust estimator to use if mode = 'robust'; default 'hc0'
+        nlags (int,optional): auto-correlation lag correction if mode = 'robust' and robust_estimator = 'hac'; default 1
+        order (tuple,optional): auto-regressive and moving-average orders for mode = 'arma'; default (1,1)
+        kwargs (dict): additional keyword arguments to statsmodels.tsa.arima_model.ARMA and joblib.Parallel
+
+    Returns:
+        b: coefficients
+        t: t-statistics (coef/stderr)
+        p: p-values
+        df: degrees of freedom
+        res: residuals
+
+    Examples:
+        Standard OLS
+
+        >>> results = regress(X,Y,mode='ols')
+
+        Robust OLS with heteroscedasticity (hc0) robust standard errors
+
+        >>> results = regress(X,Y,mode='robust')
+
+        Robust OLS with heteroscedasticity and auto-correlation (with lag 2) robust standard errors
+
+        >>> results = regress(X,Y,mode='robust',robust_estimator='hac',nlags=2)
+
+        ARMA model with auto-regressive and moving-average lags = 1
+
+        >>> results = regress(X,Y,mode='arma',order=(1,1))
+
+        ARMA model with auto-regressive lag = 2, moving-average lag = 3, and multi-processing instead of multi-threading using 8 cores (this can use a lot of memory if input arrays are very large!)
+
+        >>> results = regress(X,Y,mode='arma',order=(2,3),backend='multiprocessing',n_jobs=8)
+
+    """
+
+    if not isinstance(mode,six.string_types):
+        raise ValueError('mode must be a string')
+
+    assert mode in ['ols','robust','arma'], "Mode must be one of 'ols','robust' or 'arma'"
+
+    # Compute standard errors based on regression mode
+    if mode == 'ols' or mode == 'robust':
+
+        b = np.dot(np.linalg.pinv(X),Y)
+        res = Y - np.dot(X,b)
+
+        # Vanilla OLS
+        if mode == 'ols':
+            sigma = np.std(res,axis=0,ddof=X.shape[1])
+            stderr = np.sqrt(np.diag(np.linalg.pinv(np.dot(X.T,X))))[:,np.newaxis] * sigma[np.newaxis,:]
+
+        # OLS with robust sandwich estimator based standard-errors
+        elif mode == 'robust':
+            robust_estimator = kwargs.pop('robust_estimator','hc0')
+            nlags = kwargs.pop('nlags',1)
+            axis_func = [_robust_estimator,0,res,X,robust_estimator,nlags]
+            stderr = np.apply_along_axis(*axis_func)
+
+        from scipy.stats import t as t_dist  # local alias because `t` below holds the t-stat array
+        t = b / stderr
+        df = np.array([X.shape[0]-X.shape[1]] * t.shape[1])
+        p = 2*(1-t_dist.cdf(np.abs(t),df))
+
+    # ARMA regression
+    elif mode == 'arma':
+        n_jobs = kwargs.pop('n_jobs',-1)
+        backend = kwargs.pop('backend','threading')
+        max_nbytes = kwargs.pop('max_nbytes',1e8)
+        verbose = kwargs.pop('verbose',0)
+
+        # Use the enclosing function's scope to access X and Y
+        def _arma_func(idx=None,**kwargs):
+
+            """
+            Fit an ARMA(p,q) model. If Y is a matrix and not a vector, expects an idx argument that refers to columns of Y.
+            """
+            method = kwargs.pop('method','css-mle')
+            order = kwargs.pop('order',(1,1))
+
+            maxiter = kwargs.pop('maxiter',50)
+            disp = kwargs.pop('disp',-1)
+            start_ar_lags = kwargs.pop('start_ar_lags',order[0]+1)
+            transparams = kwargs.pop('transparams',False)
+            trend = kwargs.pop('trend','nc')
+
+            if len(Y.shape) == 2:
+                model = ARMA(endog=Y[:,idx],exog=X,order=order)
+            else:
+                model = ARMA(endog=Y,exog=X,order=order)
+            res = model.fit(trend=trend,method=method,transparams=transparams,
+                            maxiter=maxiter,disp=disp,start_ar_lags=start_ar_lags)
+
+            return (res.params[:-2], res.tvalues[:-2], res.pvalues[:-2], res.df_resid, res.resid)
+
+        # Parallelize if Y contains more than one column
+        if len(Y.shape) == 2:
+            if backend == 'threading' and n_jobs == -1:
+                n_jobs = 10
+            par_for = Parallel(n_jobs=n_jobs,verbose=verbose,backend=backend,max_nbytes=max_nbytes)
+            out_arma = par_for(delayed(_arma_func)(idx=i,**kwargs) for i in range(Y.shape[-1]))
+
+            b = np.column_stack([elem[0] for elem in out_arma])
+            t = np.column_stack([elem[1] for elem in out_arma])
+            p = np.column_stack([elem[2] for elem in out_arma])
+            df = np.array([elem[3] for elem in out_arma])
+            res = np.column_stack([elem[4] for elem in out_arma])
+
+        else:
+            b,t,p,df,res = _arma_func(**kwargs)
+
+    return b, t, p, df, res
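
Usage sketch for the new nltools.stats.regress interface. The X and Y arrays below are synthetic and made up purely for illustration; note that regress() does not add an intercept, so one is appended to the design matrix manually.

import numpy as np
from nltools.stats import regress

np.random.seed(0)
n_obs, n_pred, n_vox = 100, 3, 10

# regress() does NOT add an intercept, so append one to the design matrix
X = np.hstack([np.random.randn(n_obs, n_pred), np.ones((n_obs, 1))])
Y = np.random.randn(n_obs, n_vox)

# Standard OLS fit to all 10 columns of Y at once
b, t, p, df, res = regress(X, Y, mode='ols')

# Same fit with HC3 sandwich standard errors (coefficients are unchanged)
b, t, p, df, res = regress(X, Y, mode='robust', robust_estimator='hc3')

# HAC standard errors with a lag-2 auto-correlation correction
b, t, p, df, res = regress(X, Y, mode='robust', robust_estimator='hac', nlags=2)

# ARMA(1,1) fit per column of Y; slow, and requires statsmodels
# b, t, p, df, res = regress(X, Y, mode='arma', order=(1, 1))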
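The sandwich algebra in _robust_estimator is easy to sanity-check in isolation. Below is a self-contained HC0 sketch on synthetic heteroscedastic data, mirroring the bread/meat steps above; the numbers are made up for illustration.

import numpy as np

np.random.seed(0)
n, k = 100, 3
X = np.hstack([np.random.randn(n, k - 1), np.ones((n, 1))])
beta = np.array([0.5, -0.25, 1.0])
# Noise variance grows with the first regressor -> heteroscedasticity
y = np.dot(X, beta) + np.random.randn(n) * (1 + np.abs(X[:, 0]))

b = np.dot(np.linalg.pinv(X), y)
resid = y - np.dot(X, b)

bread = np.linalg.pinv(np.dot(X.T, X))      # (X'X)^-1
V = np.diag(resid**2)                       # squared residuals on the diagonal
meat = np.dot(np.dot(X.T, V), X)            # X' diag(e^2) X
vcv = np.dot(np.dot(bread, meat), bread)    # the sandwich: bread * meat * bread
hc0_se = np.sqrt(np.diag(vcv))              # one robust standard error per column of X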
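On the Brain_Data side the call is unchanged apart from the new mode argument. A sketch, where 'run1.nii.gz' and design_matrix are hypothetical stand-ins for a real 4D nifti file and a pandas DataFrame of regressors (including an intercept column):

from nltools.data import Brain_Data

dat = Brain_Data('run1.nii.gz')   # hypothetical 4D nifti file
dat.X = design_matrix             # hypothetical DataFrame of regressors incl. intercept

out = dat.regress(mode='robust', robust_estimator='hac', nlags=2)
out['t'].plot()                   # each value in `out` is itself a Brain_Data instance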