Merge pull request #152 from kyleabeauchamp/cleaner

Switched to int64, moved logsumexp to utils.
choderalab · Jan 10, 2015 · 10de035 · 10de035
2 parents 90ee48c + 62ed5dc
commit 10de035
Show file tree

Hide file tree

Showing 7 changed files with 121 additions and 113 deletions.
diff --git a/pymbar/bar.py b/pymbar/bar.py
@@ -49,7 +49,7 @@
 #=============================================================================================
 import numpy as np
 import numpy.linalg
-from pymbar.utils import _logsum, ParameterError, ConvergenceError, BoundsError
+from pymbar.utils import ParameterError, ConvergenceError, BoundsError, logsumexp
 from pymbar.exp import EXP
 
 
@@ -111,7 +111,7 @@ def BARzero(w_F, w_R, DeltaF):
         # give up; if there's overflow, return zero
         print("The input data results in overflow in BAR")
         return np.nan
-    log_numer = _logsum(log_f_F) - np.log(T_F)
+    log_numer = logsumexp(log_f_F) - np.log(T_F)
 
     # Compute log_denominator.
     # log_denom = log < f(-W) exp[-W] >_R
@@ -123,7 +123,7 @@ def BARzero(w_F, w_R, DeltaF):
     except:
         print("The input data results in overflow in BAR")
         return np.nan
-    log_denom = _logsum(log_f_R) - np.log(T_R)
+    log_denom = logsumexp(log_f_R) - np.log(T_R)
 
     # This function must be zeroed to find a root
     fzero = DeltaF - (log_denom - log_numer)

diff --git a/pymbar/exp.py b/pymbar/exp.py
@@ -46,7 +46,7 @@
 # IMPORTS
 #=============================================================================================
 import numpy as np
-from pymbar.utils import _logsum
+from pymbar.utils import logsumexp
 
 #=============================================================================================
 # One-sided exponential averaging (EXP).
@@ -96,7 +96,7 @@ def EXP(w_F, compute_uncertainty=True, is_timeseries=False):
     T = float(np.size(w_F))  # number of work measurements
 
     # Estimate free energy difference by exponential averaging using DeltaF = - log < exp(-w_F) >
-    DeltaF = - (_logsum(- w_F) - np.log(T))
+    DeltaF = - (logsumexp(- w_F) - np.log(T))
 
     if compute_uncertainty:
         # Compute x_i = np.exp(-w_F_i - max_arg)

diff --git a/pymbar/mbar.py b/pymbar/mbar.py
@@ -42,7 +42,7 @@
 import numpy as np
 import numpy.linalg as linalg
 from pymbar import mbar_solvers
-from pymbar.utils import _logsum, kln_to_kn, kn_to_n, ParameterError
+from pymbar.utils import kln_to_kn, kn_to_n, ParameterError, logsumexp
 
 DEFAULT_SOLVER_PROTOCOL = mbar_solvers.DEFAULT_SOLVER_PROTOCOL
 DEFAULT_SUBSAMPLING_PROTOCOL = mbar_solvers.DEFAULT_SUBSAMPLING_PROTOCOL
@@ -155,7 +155,7 @@ def __init__(self, u_kn, N_k, maximum_iterations=10000, relative_tolerance=1.0e-
 
         # Store local copies of necessary data.
         # N_k[k] is the number of samples from state k, some of which might be zero.
-        self.N_k = np.array(N_k, dtype=np.int32)
+        self.N_k = np.array(N_k, dtype=np.int64)
         self.N = np.sum(self.N_k)
 
         # Get dimensions of reduced potential energy matrix, and convert to KxN form if needed.
@@ -184,7 +184,7 @@ def __init__(self, u_kn, N_k, maximum_iterations=10000, relative_tolerance=1.0e-
         if x_kindices is not None:
             self.x_kindices = x_kindices
         else:
-            self.x_kindices = np.arange(N, dtype=np.int32)
+            self.x_kindices = np.arange(N, dtype=np.int64)
             Nsum = 0
             for k in range(K):
                 self.x_kindices[Nsum:Nsum+N_k[k]] = k
@@ -223,7 +223,7 @@ def __init__(self, u_kn, N_k, maximum_iterations=10000, relative_tolerance=1.0e-
 
         # Determine list of k indices for which N_k != 0
         self.states_with_samples = np.where(self.N_k != 0)[0]
-        self.states_with_samples = self.states_with_samples.astype(np.int32)
+        self.states_with_samples = self.states_with_samples.astype(np.int64)
 
         # Number of states with samples.
         self.K_nonzero = self.states_with_samples.size
@@ -611,7 +611,7 @@ def computeExpectationsInner(self, A_n, u_ln, state_map,
         msize = K + NL + S # augmented size; all of the states needed to calculate
                            # the observables, and the observables themselves.
         Log_W_nk = np.zeros([N, msize], np.float64) # log weight matrix
-        N_k = np.zeros([msize], np.int32)  # counts
+        N_k = np.zeros([msize], np.int64)  # counts
         f_k = np.zeros([msize], np.float64)  # free energies
 
         # <A> = A(x_n) exp[f_{k} - q_{k}(x_n)] / \sum_{k'=1}^K N_{k'} exp[f_{k'} - q_{k'}(x_n)]
@@ -625,7 +625,7 @@ def computeExpectationsInner(self, A_n, u_ln, state_map,
         for l in L_list:
             la = K+l  #l, augmented
             Log_W_nk[:, la] = self._computeUnnormalizedLogWeights(u_ln[l,:]) 
-            f_k[la] = -_logsum(Log_W_nk[:, la])
+            f_k[la] = -logsumexp(Log_W_nk[:, la])
             Log_W_nk[:, la] += f_k[la]
 
         # Compute the remaining rows/columns of W_nk, and calculate
@@ -635,7 +635,7 @@ def computeExpectationsInner(self, A_n, u_ln, state_map,
             l = K + state_map[0,s]
             i = state_map[1,s]
             Log_W_nk[:, sa] = np.log(A_n[i, :]) + Log_W_nk[:, l]
-            f_k[sa] = -_logsum(Log_W_nk[:, sa])
+            f_k[sa] = -logsumexp(Log_W_nk[:, sa])
             Log_W_nk[:, sa] += f_k[sa]    # normalize this row
 
         # Compute estimates of A_i[s]
@@ -1252,7 +1252,7 @@ def computePMF(self, u_n, bin_n, nbins, uncertainties='from-lowest', pmf_referen
         >>> x_n_sorted = np.sort(x_n) # unroll to n-indices
         >>> bins = np.append(x_n_sorted[0::(N_tot/nbins)], x_n_sorted.max()+0.1)
         >>> bin_widths = bins[1:] - bins[0:-1]
-        >>> bin_n = np.zeros(x_n.shape, np.int32)
+        >>> bin_n = np.zeros(x_n.shape, np.int64)
         >>> bin_n = np.digitize(x_n, bins) - 1
         >>> # Compute PMF for these unequally-sized bins.
         >>> [f_i, df_i] = mbar.computePMF(u_n, bin_n, nbins)
@@ -1291,10 +1291,10 @@ def computePMF(self, u_n, bin_n, nbins, uncertainties='from-lowest', pmf_referen
                 raise "WARNING: bin %d has no samples -- all bins must have at least one sample." % i
 
             # Compute dimensionless free energy of occupying state i.
-            f_i[i] = - _logsum(log_w_n[indices])
+            f_i[i] = - logsumexp(log_w_n[indices])
 
         # Compute uncertainties by forming matrix of W_nk.
-        N_k = np.zeros([self.K + nbins], np.int32)
+        N_k = np.zeros([self.K + nbins], np.int64)
         N_k[0:K] = self.N_k
         W_nk = np.zeros([self.N, self.K + nbins], np.float64)
         W_nk[:, 0:K] = np.exp(self.Log_W_nk)
@@ -1351,7 +1351,7 @@ def computePMF(self, u_n, bin_n, nbins, uncertainties='from-lowest', pmf_referen
             # Determine uncertainties from normalization that \sum_i p_i = 1.
 
             # Compute bin probabilities p_i
-            p_i = np.exp(-f_i - _logsum(-f_i))
+            p_i = np.exp(-f_i - logsumexp(-f_i))
 
             # todo -- eliminate triple loop over nbins!
             # Compute uncertainties in bin probabilities.
@@ -1442,7 +1442,7 @@ def _computeAsymptoticCovarianceMatrix(self, W, N_k, method=None):
         REQUIRED ARGUMENTS
           W (np.array of np.float of dimension [N,K]) - matrix of normalized weights (see Eq. 9 of [1]) - W[n,k] is the weight of snapshot n (n = 1..N) in state k
                                           Note that sum(W(:,k)) = 1 for any k = 1..K, and sum(N_k(:) .* W(n,:)) = 1 for any n.
-          N_k (np.array of np.int32 of dimension [K]) - N_k[k] is the number of samples from state K
+          N_k (np.array of np.int64 of dimension [K]) - N_k[k] is the number of samples from state K
 
         RETURN VALUES
           Theta (KxK np float64 array) - asymptotic covariance matrix (see Eq. 8 of [1])
@@ -1642,4 +1642,4 @@ def _computeUnnormalizedLogWeights(self, u_n):
         REFERENCE
           'log weights' here refers to \log [ \sum_{k=1}^K N_k exp[f_k - (u_k(x_n) - u(x_n)] ]
         """
-        return -1. * mbar_solvers.logsumexp(self.f_k + u_n[:, np.newaxis] - self.u_kn.T, b=self.N_k, axis=1)
+        return -1. * logsumexp(self.f_k + u_n[:, np.newaxis] - self.u_kn.T, b=self.N_k, axis=1)
diff --git a/pymbar/mbar_solvers.py b/pymbar/mbar_solvers.py
@@ -2,83 +2,14 @@
 import numpy as np
 import math
 import scipy.optimize
-from pymbar.utils import ensure_type
-
-
-try:  # numexpr used in logsumexp when available.
-    import numexpr
-    HAVE_NUMEXPR = True
-except ImportError:
-    HAVE_NUMEXPR = False
+from pymbar.utils import ensure_type, logsumexp
 
 # Below are the recommended default protocols (ordered sequence of minimization algorithms / NLE solvers) for solving the MBAR equations.
 # Note: we use tuples instead of lists to avoid accidental mutability.
 DEFAULT_SUBSAMPLING_PROTOCOL = (dict(method="L-BFGS-B"), )  # First use BFGS on subsampled data.
 DEFAULT_SOLVER_PROTOCOL = (dict(method="hybr"), )  # Then do fmin hybrid on full dataset.
 
 
-
-def logsumexp(a, axis=None, b=None, use_numexpr=True):
-    """Compute the log of the sum of exponentials of input elements.
-
-    Parameters
-    ----------
-    a : array_like
-        Input array.
-    axis : None or int, optional, default=None
-        Axis or axes over which the sum is taken. By default `axis` is None,
-        and all elements are summed.
-    b : array-like, optional
-        Scaling factor for exp(`a`) must be of the same shape as `a` or
-        broadcastable to `a`.
-    use_numexpr : bool, optional, default=True
-        If True, use the numexpr library to speed up the calculation, which
-        can give a 2-4X speedup when working with large arrays.
-
-    Returns
-    -------
-    res : ndarray
-        The result, ``log(sum(exp(a)))`` calculated in a numerically
-        more stable way. If `b` is given then ``log(sum(b*exp(a)))``
-        is returned.
-
-    See Also
-    --------
-    numpy.logaddexp, numpy.logaddexp2, scipy.misc.logsumexp
-
-    Notes
-    -----
-    This is based on scipy.misc.logsumexp but with optional numexpr
-    support for improved performance.
-    """
-
-    a = np.asarray(a)
-
-    a_max = np.amax(a, axis=axis, keepdims=True)
-
-    if a_max.ndim > 0:
-        a_max[~np.isfinite(a_max)] = 0
-    elif not np.isfinite(a_max):
-        a_max = 0
-
-    if b is not None:
-        b = np.asarray(b)
-        if use_numexpr and HAVE_NUMEXPR:
-            out = np.log(numexpr.evaluate("b * exp(a - a_max)").sum(axis))
-        else:
-            out = np.log(np.sum(b * np.exp(a - a_max), axis=axis))
-    else:
-        if use_numexpr and HAVE_NUMEXPR:
-            out = np.log(numexpr.evaluate("exp(a - a_max)").sum(axis))
-        else:
-            out = np.log(np.sum(np.exp(a - a_max), axis=axis))
-
-    a_max = np.squeeze(a_max, axis=axis)
-    out += a_max
-
-    return out
-
-
 def validate_inputs(u_kn, N_k, f_k):
     """Check types and return inputs for MBAR calculations.
 

diff --git a/pymbar/tests/test_mbar_solvers.py b/pymbar/tests/test_mbar_solvers.py
@@ -4,27 +4,6 @@
 import scipy.misc
 from nose import SkipTest
 
-def test_logsumexp():
-    a = np.random.normal(size=(200, 500, 5))
-
-    for axis in range(a.ndim):
-        ans_ne = pymbar.mbar_solvers.logsumexp(a, axis=axis)
-        ans_no_ne = pymbar.mbar_solvers.logsumexp(a, axis=axis, use_numexpr=False)
-        ans_scipy = scipy.misc.logsumexp(a, axis=axis)
-        eq(ans_ne, ans_no_ne)
-        eq(ans_ne, ans_scipy)
-
-def test_logsumexp_b():
-    a = np.random.normal(size=(200, 500, 5))
-    b = np.random.normal(size=(200, 500, 5)) ** 2.
-
-    for axis in range(a.ndim):
-        ans_ne = pymbar.mbar_solvers.logsumexp(a, b=b, axis=axis)
-        ans_no_ne = pymbar.mbar_solvers.logsumexp(a, b=b, axis=axis, use_numexpr=False)
-        ans_scipy = scipy.misc.logsumexp(a, b=b, axis=axis)
-        eq(ans_ne, ans_no_ne)
-        eq(ans_ne, ans_scipy)
-
 
 def load_oscillators(n_states, n_samples):
     name = "%dx%d oscillators" % (n_states, n_samples)

diff --git a/pymbar/tests/test_utils.py b/pymbar/tests/test_utils.py
@@ -0,0 +1,32 @@
+import numpy as np
+import pymbar
+from pymbar.utils_for_testing import eq
+import scipy.misc
+from nose import SkipTest
+
+def test_logsumexp():
+    a = np.random.normal(size=(200, 500, 5))
+
+    for axis in range(a.ndim):
+        ans_ne = pymbar.utils.logsumexp(a, axis=axis)
+        ans_no_ne = pymbar.utils.logsumexp(a, axis=axis, use_numexpr=False)
+        ans_scipy = scipy.misc.logsumexp(a, axis=axis)
+        eq(ans_ne, ans_no_ne)
+        eq(ans_ne, ans_scipy)
+
+def test_logsumexp_b():
+    a = np.random.normal(size=(200, 500, 5))
+    b = np.random.normal(size=(200, 500, 5)) ** 2.
+
+    for axis in range(a.ndim):
+        ans_ne = pymbar.utils.logsumexp(a, b=b, axis=axis)
+        ans_no_ne = pymbar.utils.logsumexp(a, b=b, axis=axis, use_numexpr=False)
+        ans_scipy = scipy.misc.logsumexp(a, b=b, axis=axis)
+        eq(ans_ne, ans_no_ne)
+        eq(ans_ne, ans_scipy)
+
+def test_logsum():
+    u = np.random.normal(size=(200))
+    y1 = pymbar.utils.logsumexp(u)
+    y2 = pymbar.utils._logsum(u)
+    eq(y1, y2, decimal=12)