Merge pull request #143 from mrshirts/master

Adding effective sample functionality
choderalab · Jan 5, 2015 · 9b6690f · 9b6690f
2 parents 84649b9 + 1421f88
commit 9b6690f
Show file tree

Hide file tree

Showing 3 changed files with 107 additions and 2 deletions.
diff --git a/examples/harmonic-oscillators.py b/examples/harmonic-oscillators.py
@@ -530,6 +530,41 @@ def GetAnalytical(beta,K,O,observables):
 print "Overlap scalar output"
 print O
 
+print "============================================"
+print "    Testing computeEffectiveSampleNumber    "
+print "============================================"
+
+N_eff = mbar.computeEffectiveSampleNumber(verbose = True)
+print "Effective Sample number"
+print N_eff
+print "Compare stanadrd estimate of <x> with the MBAR estimate of <x>"
+print "We should have that with MBAR, err_MBAR = sqrt(N_k/N_eff)*err_standard,"
+print "so standard (scaled) results should be very close to MBAR results."
+print "No standard estimate exists for states that are not sampled."
+A_kn = x_kn
+(val_mbar, err_mbar) = mbar.computeExpectations(A_kn)
+err_standard = numpy.zeros([K],dtype = numpy.float64)
+err_scaled = numpy.zeros([K],dtype = numpy.float64)
+
+for k in range(K):
+  if N_k[k] != 0:
+    # use position
+    err_standard[k] = numpy.std(A_kn[k,0:N_k[k]])/numpy.sqrt(N_k[k]-1)
+    err_scaled[k] = numpy.std(A_kn[k,0:N_k[k]])/numpy.sqrt(N_eff[k]-1)
+
+print "                    ",
+for k in range(K):
+  print "       %d   " %(k),
+print ""
+print "MBAR             :",
+print err_mbar
+print "standard         :",
+print err_standard
+print "sqrt N_k/N_eff   :",
+print numpy.sqrt(N_k/N_eff)
+print "Standard (scaled):",
+print err_standard * numpy.sqrt(N_k/N_eff)
+
 print "============================================"
 print "      Testing computePMF   "
 print "============================================"

diff --git a/pymbar/mbar.py b/pymbar/mbar.py
@@ -290,6 +290,63 @@ def getWeights(self):
 
         return np.exp(self.Log_W_nk)
 
+    #=========================================================================
+    def computeEffectiveSampleNumber(self, verbose = False):
+        """
+        Compute the effective sample number of each state;
+        essentially, an estimate of how many samples are contributing to the average
+        at given state.  See pymbar/examples for a demonstration.
+
+        It also counts the efficiency of the sampling, which is simply the ratio
+        of the effective number of samples at a given state to the total number
+        of samples collected.
+        
+        Returns
+        -------
+        N_eff : np.ndarray, float, shape=(K)
+                estimated number of states contributing to estimates at each 
+                state i. An estimate to how many samples collected just at state 
+                i would result in similar statistical efficiency as the MBAR 
+                simulation. Valid for both sampled states (in which the weight 
+                will be greater than N_k[i], and unsampled states.
+
+        Parameters
+        ----------
+        verbose : print out information about the effective number of samples
+
+        Notes
+        -----
+
+        # using Kish (1965) formula (Kish, Leslie (1965). Survey Sampling. New York: Wiley)
+        # As the weights become more concentrated in fewer observations, the effective sample size shrinks.
+        # from http://healthcare-economist.com/2013/08/22/effective-sample-size/
+        # effective # of samples contributing to averages carried out at state i 
+        #                        =  (\sum_{n=1}^N w_in)^2 / \sum_{n=1}^N w_in^2
+        #                        =  (\sum_{n=1}^N w_in^2)^-1
+        #
+        # the effective sample number is most useful to diagnose when there are only a few samples
+        # contributing to the averages.
+
+        Examples
+        --------
+        
+        >>> from pymbar import testsystems
+        >>> [x_kn, u_kln, N_k, s_n] = testsystems.HarmonicOscillatorsTestCase().sample()
+        >>> mbar = MBAR(u_kln, N_k)
+        >>> N_eff = mbar.computeEffectiveSampleNumber()
+        """
+
+        N_eff = np.zeros(self.K)
+        for k in range(self.K):
+            w = np.exp(self.Log_W_nk[:,k])
+            N_eff[k] = 1/np.sum(w**2)
+
+            if verbose:
+                print "Effective number of sample in state %d is %10.3f" % (k,N_eff[k])
+                print "Efficiency for state %d is %d/%d = %10.4f" % (k,N_eff[k],len(w),N_eff[k]/len(w))
+
+        return N_eff
+
     #=========================================================================
     def computeOverlap(self, output='scalar'):
         """Compute estimate of overlap matrix between the states.

diff --git a/pymbar/tests/test_mbar.py b/pymbar/tests/test_mbar.py
@@ -187,9 +187,22 @@ def test_mbar_computeEntropyAndEnthalpy():
         z = convert_to_differences(s_ij,ds_ij,sa)
         eq(z / z_scale_factor, np.zeros(np.shape(z)), decimal=0)
 
-def test_mbar_computeOverlap():
+def test_mbar_computeEffectiveSampleNumber():
+    """ testing computeEffectiveSampleNumber """
 
-    """ testing computeOverlap """
+    for system_generator in system_generators:
+        name, test = system_generator()
+        x_n, u_kn, N_k_output, s_n = test.sample(N_k, mode='u_kn')
+        eq(N_k, N_k_output)
+        mbar = MBAR(u_kn, N_k)
+
+        # one mathematical effective sample numbers should be between N_k and sum_k N_k
+        N_eff = mbar.computeEffectiveSampleNumber()
+        sumN = np.sum(N_k)
+        for k in range(len(N_eff)):
+            eq(np.bool(N_eff[k] > N_k[k] and N_eff[k] < sumN),True)
+
+def test_mbar_computeOverlap():
 
     # tests with identical states, which gives analytical results.