Save samples for inference methods (#282)

* Add draft version of progress bar * Add requirements. Split progress bar into own class * Add classification of samplers for progress bar * Fix crashing the tests related to BOLFI (temporary plug). Improve getting the samplers name from class attribute * Update completed progress bar * Fix bug for external samplers * Fix formatting in parameter_inference module. Add documentation in progress_bar module * Fix errata * Initial commit * Merge with dev branch * Add saving samples in csv and json formats * Fix documentation * Fix variable name * Update changelog * Reimplement saving samples with python internal modules * Optimize saveing samples in json format * Fix code review comments. Update changelog * Add pickle format to save data during inference. Extend data to save for json and pickle formats * Improve documentation. Add more information about saving objects. Improve saving to json * Split code related to processing sample objects into function. Split converting numpy types into python types into function. Reformat code for saving data in json format. Save the whole sample object into pickle format. * Add unit tests for general utility functions. Improve documentation * Change the number of simulations in testing functions * Add magic methods for saving self object properly * Improve tests to increase the running time
elfi-dev · Aug 23, 2018 · f628a37 · f628a37
1 parent 83be924
commit f628a37
Show file tree

Hide file tree

Showing 4 changed files with 175 additions and 1 deletion.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,7 @@ dev
 - Fix bug in plot_marginals which outputs empty plots in case where we have parameter more than 5
 - Fix crashing summary and plots for samples with multivariate priors
 - Add progress bar for inference methods
+- Add method save to Sample objects
 
 0.7.2 (2018-06-20)
 ------------------

diff --git a/elfi/methods/results.py b/elfi/methods/results.py
@@ -1,14 +1,18 @@
 """Containers for results from inference."""
 
 import io
+import itertools
 import logging
+import os
+import string
 import sys
 from collections import OrderedDict
 
 import numpy as np
 from matplotlib import pyplot as plt
 
 import elfi.visualization.visualization as vis
+from elfi.methods.utils import numpy_to_python_type, sample_object_to_dict
 
 logger = logging.getLogger(__name__)
 
@@ -203,6 +207,75 @@ def sample_means_array(self):
         """
         return np.array(list(self.sample_means.values()))
 
+    def __getstate__(self):
+        return self.meta, self.__dict__
+
+    def __setstate__(self, state):
+        self.meta, self.__dict__ = state
+
+    def save(self, fname=None):
+        """Save samples in csv, json or pickle file formats.
+
+        Clarification: csv saves only samples, json saves the whole object's dictionary except
+        `outputs` key and pickle saves the whole object.
+
+        Parameters
+        ----------
+        fname : str, required
+            File name to be saved. The type is inferred from extension ('csv', 'json' or 'pkl').
+
+        """
+        import csv
+        import json
+        import pickle
+
+        kind = os.path.splitext(fname)[1][1:]
+
+        if kind == 'csv':
+            with open(fname, 'w', newline='') as f:
+                w = csv.writer(f)
+                w.writerow(self.samples.keys())
+                w.writerows(itertools.zip_longest(*self.samples.values(), fillvalue=''))
+        elif kind == 'json':
+            with open(fname, 'w') as f:
+
+                data = OrderedDict()
+
+                data['n_samples'] = self.n_samples
+                data['discrepancies'] = self.discrepancies
+                data['dim'] = self.dim
+
+                # populations key exists in SMC-ABC sampler and contains the history of all
+                # inferences with different number of simulations and thresholds
+                populations = 'populations'
+                if populations in self.__dict__:
+                    # setting populations in the following form:
+                    # data = {'populations': {'A': dict(), 'B': dict()}, ...}
+                    # this helps to save all kind of populations
+                    pop_num = string.ascii_letters.upper()[:len(self.__dict__[populations])]
+                    data[populations] = OrderedDict()
+                    for n, elem in enumerate(self.__dict__[populations]):
+                        data[populations][pop_num[n]] = OrderedDict()
+                        sample_object_to_dict(data[populations][pop_num[n]], elem)
+
+                    # convert numpy types into python types in populations key
+                    for key, val in data[populations].items():
+                        numpy_to_python_type(val)
+
+                # skip populations because it was processed previously
+                sample_object_to_dict(data, self, skip='populations')
+
+                # convert numpy types into python types
+                numpy_to_python_type(data)
+
+                js = json.dumps(data)
+                f.write(js)
+        elif kind == 'pkl':
+            with open(fname, 'wb') as f:
+                pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
+        else:
+            print("Wrong file type format. Please use 'csv', 'json' or 'pkl'.")
+
     def plot_marginals(self, selector=None, bins=20, axes=None, **kwargs):
         """Plot marginal distributions for parameters.
 

diff --git a/elfi/methods/utils.py b/elfi/methods/utils.py
@@ -418,3 +418,65 @@ def gradient_logpdf(self, x, stepsize=None):
 
     def _to_batch(self, x):
         """Return a dict mapping each name in `self.parameter_names` to a column of `x`.

         The i-th name is paired with ``x[:, i]``.

         """
         batch = {}
         for column, name in enumerate(self.parameter_names):
             batch[name] = x[:, column]
         return batch
+
+
+def sample_object_to_dict(data, elem, skip=''):
+    """Process data from self object to data dictionary to prepare for json serialization.
+
+    Parameters
+    ----------
+    data : dict, required
+        Stores collected data for json
+    elem : dict, required
+        Default data from Sample object(s)
+    skip : str, optional
+        Some keys in the object should be skipped, such as `outputs` or `populations`. Latter
+        is skipped in case if it is already processed previously.
+
+    """
+    for key, val in elem.__dict__.items():
+        # skip `outputs` because its values are in `samples` and in `discrepancies`
+        if key in ['outputs', skip]:
+            continue
+        if key == 'meta':
+            for meta_key, meta_val in elem.__dict__[key].items():
+                data[meta_key] = meta_val
+            continue
+        data[key] = val
+
+
+def numpy_to_python_type(data):
+    """Convert numpy data types to python data type for json serialization.
+
+    Parameters
+    ----------
+    data : dict, required
+        Stores collected data for json
+
+    """
+    for key, val in data.items():
+        # in data there is keys as 'samples' which is actually a dictionary
+        if isinstance(val, dict):
+            for nested_key, nested_val in val.items():
+                is_numpy = type(nested_val)
+                data_type = str(is_numpy)
+                # check whether the current value has numpy data type
+                if is_numpy.__module__ == np.__name__:
+                    # it is enough to check that current value's name has one of these sub-strings
+                    # https://docs.scipy.org/doc/numpy-1.13.0/user/basics.types.html
+                    if 'array' in data_type:
+                        data[key][nested_key] = nested_val.tolist()
+                    elif 'int' in data_type:
+                        data[key][nested_key] = int(nested_val)
+                    elif 'float' in data_type:
+                        data[key][nested_key] = float(nested_val)
+
+        is_numpy = type(val)
+        data_type = str(is_numpy)
+        if is_numpy.__module__ == np.__name__:
+            if 'array' in data_type:
+                data[key] = val.tolist()
+            elif 'int' in data_type:
+                data[key] = int(val)
+            elif 'float' in data_type:
+                data[key] = float(val)
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
@@ -1,9 +1,14 @@
+import json
+from collections import OrderedDict
+
 import numpy as np
 import scipy.stats as ss
 
 import elfi
+from elfi.examples.ma2 import get_model
 from elfi.methods.bo.utils import minimize, stochastic_optimization
-from elfi.methods.utils import GMDistribution, ModelPrior, normalize_weights, numgrad, weighted_var
+from elfi.methods.utils import (GMDistribution, ModelPrior, normalize_weights, numgrad,
+                                numpy_to_python_type, sample_object_to_dict, weighted_var)
 
 
 def test_stochastic_optimization():
@@ -149,3 +154,36 @@ def test_numerical_grad_logpdf(self):
         prior_node = elfi.Prior('normal', loc, scale, model=elfi.ElfiModel())
         num_grad = ModelPrior(prior_node.model).gradient_logpdf(x)
         assert np.isclose(num_grad, analytical_grad_logpdf, atol=0.01)
+
+
+def test_sample_object_to_dict():
+    data_rej = OrderedDict()
+    data_smc = OrderedDict()
+    m = get_model(n_obs=100, true_params=[.6, .2])
+    batch_size, n = 1, 2
+    schedule = [0.7, 0.2, 0.05]
+    rej = elfi.Rejection(m['d'], batch_size=batch_size)
+    res_rej = rej.sample(n, threshold=0.1)
+    smc = elfi.SMC(m['d'], batch_size=batch_size)
+    res_smc = smc.sample(n, schedule)
+    sample_object_to_dict(data_rej, res_rej)
+    sample_object_to_dict(data_smc, res_smc, skip='populations')
+    assert any(x not in data_rej for x in ['meta', 'output']) is True
+    assert any(x not in data_smc for x in ['meta', 'output', 'populations']) is True
+
+
+def test_numpy_to_python_type():
+    data = dict(a=np.array([1, 2, 3, 4]), b=np.uint(5), c=np.float(10),
+                d=dict(a=np.array([0, 9, 8, 7]), b=np.uint(15), c=np.float(12)))
+    numpy_to_python_type(data)
+
+    # checking that our objects are jsonable is enough to be sure that numpy_to_python_type
+    # function works fine
+    def is_jsonable(x):
+        try:
+            json.dumps(x)
+            return True
+        except:
+            return False
+
+    assert is_jsonable(data) is True