Skip to content

Commit

Permalink
Save samples for inference methods (#282)
Browse files Browse the repository at this point in the history
* Add draft version of progress bar

* Add requirements. Split progress bar into own class

* Add classification of samplers for progress bar

* Fix crashing the tests related to BOLFI (temporary plug). Improve getting the samplers name from class attribute

* Update completed progress bar

* Fix bug for external samplers

* Fix formatting in parameter_inference module. Add documentation in progress_bar module

* Fix errata

* Initial commit

* Merge with dev branch

* Add saving samples in csv and json formats

* Fix documentation

* Fix variable name

* Update changelog

* Reimplement saving samples with python internal modules

* Optimize saving samples in json format

* Fix code review comments. Update changelog

* Add pickle format to save data during inference. Extend data to save for json and pickle formats

* Improve documentation. Add more information about saving objects. Improve saving to json

* Split code related to processing sample objects into function. Split converting numpy types into python types into function. Reformat code for saving data in json format. Save the whole sample object into pickle format.

* Add unit tests for general utility functions. Improve documentation

* Change the number of simulations in testing functions

* Add magic methods for saving self object properly

* Improve tests to increase the running time
  • Loading branch information
b5y authored and vuolleko committed Aug 23, 2018
1 parent 83be924 commit f628a37
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ dev
- Fix bug in plot_marginals which outputs empty plots when there are more than 5 parameters
- Fix crashing summary and plots for samples with multivariate priors
- Add progress bar for inference methods
- Add method save to Sample objects

0.7.2 (2018-06-20)
------------------
Expand Down
73 changes: 73 additions & 0 deletions elfi/methods/results.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
"""Containers for results from inference."""

import io
import itertools
import logging
import os
import string
import sys
from collections import OrderedDict

import numpy as np
from matplotlib import pyplot as plt

import elfi.visualization.visualization as vis
from elfi.methods.utils import numpy_to_python_type, sample_object_to_dict

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -203,6 +207,75 @@ def sample_means_array(self):
"""
return np.array(list(self.sample_means.values()))

def __getstate__(self):
    """Return the state used for pickling.

    Returns
    -------
    tuple
        A pair ``(meta, instance dictionary)``; `__setstate__` unpacks it in the same order.

    """
    meta = self.meta
    attributes = self.__dict__
    return meta, attributes

def __setstate__(self, state):
    """Restore the instance from the pair produced by `__getstate__`.

    Parameters
    ----------
    state : tuple
        A pair ``(meta, instance dictionary)``.

    """
    meta, attributes = state
    # keep the original order: `meta` is set first, then the instance dictionary is replaced
    self.meta = meta
    self.__dict__ = attributes

def save(self, fname=None):
    """Save samples in csv, json or pickle file formats.

    Clarification: csv saves only samples, json saves the whole object's dictionary except
    `outputs` key and pickle saves the whole object.

    Parameters
    ----------
    fname : str, required
        File name to be saved. The type is inferred from extension ('csv', 'json' or 'pkl');
        the extension is matched case-insensitively.

    Raises
    ------
    TypeError
        If `fname` is not given.

    """
    import csv
    import json
    import pickle

    if fname is None:
        raise TypeError("fname is required. Please give a file name ending in "
                        "'csv', 'json' or 'pkl'.")

    kind = os.path.splitext(fname)[1][1:].lower()

    if kind == 'csv':
        with open(fname, 'w', newline='') as f:
            w = csv.writer(f)
            w.writerow(self.samples.keys())
            # columns of unequal length are padded with empty cells
            w.writerows(itertools.zip_longest(*self.samples.values(), fillvalue=''))
    elif kind == 'json':
        data = OrderedDict()

        data['n_samples'] = self.n_samples
        data['discrepancies'] = self.discrepancies
        data['dim'] = self.dim

        # populations key exists in SMC-ABC sampler and contains the history of all
        # inferences with different number of simulations and thresholds
        populations = 'populations'
        if populations in self.__dict__:
            # setting populations in the following form:
            # data = {'populations': {'A': dict(), 'B': dict()}, ...}
            # Labels run A..Z, then AA, AB, ... so that any number of populations gets
            # a unique key (a fixed alphabet would repeat or run out after 26 entries).
            labels = (
                ''.join(letters)
                for width in itertools.count(1)
                for letters in itertools.product(string.ascii_uppercase, repeat=width)
            )
            data[populations] = OrderedDict()
            for label, elem in zip(labels, self.__dict__[populations]):
                data[populations][label] = OrderedDict()
                sample_object_to_dict(data[populations][label], elem)
                # convert numpy types into python types in this population
                numpy_to_python_type(data[populations][label])

        # skip populations because it was processed previously
        sample_object_to_dict(data, self, skip='populations')

        # convert numpy types into python types
        numpy_to_python_type(data)

        # serialize before opening the file, so that a serialization error does not
        # leave behind an empty or truncated file
        js = json.dumps(data)
        with open(fname, 'w') as f:
            f.write(js)
    elif kind == 'pkl':
        with open(fname, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
    else:
        print("Wrong file type format. Please use 'csv', 'json' or 'pkl'.")

def plot_marginals(self, selector=None, bins=20, axes=None, **kwargs):
"""Plot marginal distributions for parameters.
Expand Down
62 changes: 62 additions & 0 deletions elfi/methods/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,3 +418,65 @@ def gradient_logpdf(self, x, stepsize=None):

def _to_batch(self, x):
    """Map each name in `self.parameter_names` to the matching column of `x`.

    Column ``i`` of `x` (taken as ``x[:, i]``) is stored under the i-th parameter name.
    """
    batch = {}
    for column, name in enumerate(self.parameter_names):
        batch[name] = x[:, column]
    return batch


def sample_object_to_dict(data, elem, skip=''):
    """Copy the attributes of a sample object into a dictionary meant for json serialization.

    Parameters
    ----------
    data : dict, required
        Dictionary that is filled in place with the collected data
    elem : object, required
        Object whose instance dictionary (``__dict__``) is read, e.g. a Sample object
    skip : str, optional
        Name of one extra attribute to leave out, such as `populations` when it has
        already been processed separately.

    Notes
    -----
    The `outputs` attribute is always left out, and the entries of the `meta` attribute are
    stored directly in `data` instead of under a `meta` key.

    """
    # `outputs` is left out because its values are in `samples` and in `discrepancies`
    ignored = ('outputs', skip)
    for name, value in vars(elem).items():
        if name in ignored:
            continue
        if name == 'meta':
            # flatten: every meta entry becomes a top-level entry of data
            data.update(value.items())
        else:
            data[name] = value


def _numpy_value_to_python(val):
    """Return the builtin python equivalent of a numpy array or scalar, else `val` itself."""
    if isinstance(val, np.ndarray):
        return val.tolist()
    # np.bool_ is checked explicitly: it is neither an integer nor a floating numpy type
    if isinstance(val, np.bool_):
        return bool(val)
    if isinstance(val, np.integer):
        return int(val)
    if isinstance(val, np.floating):
        return float(val)
    return val


def numpy_to_python_type(data):
    """Convert numpy data types to python data type for json serialization.

    The conversion is done in place. Arrays become (nested) lists and numpy boolean,
    integer and floating scalars become `bool`, `int` and `float`. Dictionaries stored
    as values (such as `samples`) are processed recursively at any depth. All other
    values are left untouched.

    Parameters
    ----------
    data : dict, required
        Stores collected data for json

    """
    for key, val in data.items():
        if isinstance(val, dict):
            # nested dictionaries are converted in place as well
            numpy_to_python_type(val)
        else:
            # only values are replaced, so iterating over the dictionary stays valid
            data[key] = _numpy_value_to_python(val)
40 changes: 39 additions & 1 deletion tests/unit/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import json
from collections import OrderedDict

import numpy as np
import scipy.stats as ss

import elfi
from elfi.examples.ma2 import get_model
from elfi.methods.bo.utils import minimize, stochastic_optimization
from elfi.methods.utils import GMDistribution, ModelPrior, normalize_weights, numgrad, weighted_var
from elfi.methods.utils import (GMDistribution, ModelPrior, normalize_weights, numgrad,
numpy_to_python_type, sample_object_to_dict, weighted_var)


def test_stochastic_optimization():
Expand Down Expand Up @@ -149,3 +154,36 @@ def test_numerical_grad_logpdf(self):
prior_node = elfi.Prior('normal', loc, scale, model=elfi.ElfiModel())
num_grad = ModelPrior(prior_node.model).gradient_logpdf(x)
assert np.isclose(num_grad, analytical_grad_logpdf, atol=0.01)


def test_sample_object_to_dict():
    """Check that `meta`, `outputs` and the skipped key are left out of the collected data."""
    data_rej = OrderedDict()
    data_smc = OrderedDict()
    m = get_model(n_obs=100, true_params=[.6, .2])
    batch_size, n = 1, 2
    schedule = [0.7, 0.2, 0.05]
    rej = elfi.Rejection(m['d'], batch_size=batch_size)
    res_rej = rej.sample(n, threshold=0.1)
    smc = elfi.SMC(m['d'], batch_size=batch_size)
    res_smc = smc.sample(n, schedule)
    sample_object_to_dict(data_rej, res_rej)
    sample_object_to_dict(data_smc, res_smc, skip='populations')
    # Every listed key must be absent. The key is `outputs` (plural); checking with
    # `any` against a key that never exists would make the assertion always pass.
    assert all(x not in data_rej for x in ['meta', 'outputs'])
    assert all(x not in data_smc for x in ['meta', 'outputs', 'populations'])


def test_numpy_to_python_type():
    """Check that numpy values, including nested ones, become json serializable."""
    # np.float64 is used because the `np.float` alias was removed in NumPy 1.24
    data = dict(a=np.array([1, 2, 3, 4]), b=np.uint(5), c=np.float64(10),
                d=dict(a=np.array([0, 9, 8, 7]), b=np.uint(15), c=np.float64(12)))
    numpy_to_python_type(data)

    # checking that our objects are jsonable is enough to be sure that numpy_to_python_type
    # function works fine
    def is_jsonable(x):
        try:
            json.dumps(x)
        except (TypeError, ValueError):
            # json.dumps raises TypeError for unsupported types and ValueError for
            # circular references
            return False
        return True

    assert is_jsonable(data) is True

0 comments on commit f628a37

Please sign in to comment.