Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Msm submodels #39

Merged
merged 27 commits into from
Jan 24, 2020
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
64c133b
some refactoring
clonker Jan 14, 2020
7bfad88
refactor count model (wip)
clonker Jan 14, 2020
16f302c
[markovprocess/transition counting] count model can create submodels
clonker Jan 15, 2020
79bee93
[markovprocess/transition counting] pep conformity, removed unused pa…
clonker Jan 15, 2020
48ba2db
[markovprocess] dt_traj -> physical_time, mindist_connectivity -> con…
clonker Jan 15, 2020
7772acb
[markovprocess] ML-MSM refactor
clonker Jan 15, 2020
2ca3288
[markovprocess] ML-MSM tests, remove CK-test testing
clonker Jan 16, 2020
1570efa
[markovprocess] ML-MSM tests, remove CK-test testing
clonker Jan 16, 2020
8d16df2
[markovprocess] ML-MSM tests, remove CK-test testing
clonker Jan 16, 2020
53d4a06
[markovprocess] time unit instead of physical time or dt_traj, improv…
clonker Jan 16, 2020
973f7af
[markovprocess] fixing transition counting model input errorhandling
clonker Jan 16, 2020
cdfc902
[markovprocess] bayesian msm
clonker Jan 16, 2020
e966102
[markovprocess/scoring] provide lagtime for scoring
clonker Jan 16, 2020
45df35f
[markovprocess/ml-hmsm] remove cktest interface
clonker Jan 20, 2020
cc34296
[markovprocess/hmsm] submodel implementation does not take mincount-c…
clonker Jan 21, 2020
efc61bd
[markovprocess/ml-hmsm] fix hmsm tests
clonker Jan 22, 2020
b91c8fb
[markovprocess/ml-hmsm] fix hmsm tests cont.
clonker Jan 22, 2020
b806398
[markovprocess] doc fixes and more argument checking
clonker Jan 22, 2020
3641317
[markovprocess/counting] transition count model testing
clonker Jan 24, 2020
689ef73
add some tests on patological data
thempel Jan 24, 2020
8564df4
f
thempel Jan 24, 2020
5293280
[markovprocess/msm] testing
clonker Jan 24, 2020
5650b8e
Merge remote-tracking branch 'clonker/msm_submodels' into msm_submodels
thempel Jan 24, 2020
93815cc
[markovprocess/msm] further testing
clonker Jan 24, 2020
32eaab4
[markovprocess/bmsm] testing
clonker Jan 24, 2020
3a2f902
[markovprocess/msm] pathological cases -> test matrix
thempel Jan 24, 2020
a574349
Merge branch 'msm_submodels' of github.com:clonker/scikit-time into m…
thempel Jan 24, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 21 additions & 18 deletions sktime/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,19 @@


class _base_methods_mixin(object, metaclass=abc.ABCMeta):
""" defines common methods used by both Estimator and Model classes.
""" Defines common methods used by both Estimator and Model classes. These are mostly static and low-level
checking of conformity with respect to scikit-time conventions.
"""

def __repr__(self):
name = '{cls}-{id}:'.format(id=id(self), cls=self.__class__.__name__)
return '{name}{params}]'.format(name=name,
params=pprint_sklearn(self.get_params(), offset=len(name), )
params=pprint_sklearn(self.get_params(), offset=len(name), )
)

def get_params(self, deep=True):
"""Get parameters of this kernel.
Parameters
----------
deep : boolean, optional
If True, will return the parameters for this estimator and
contained subobjects that are estimators.
def get_params(self):
r"""Get parameters of this kernel.

Returns
-------
params : mapping of string to any
Expand Down Expand Up @@ -140,9 +137,9 @@ def data(self):

@data.setter
def data(self, value_):
import numpy as np
args, kwargs = value_
# store data as a list of ndarrays
import numpy as np
# handle optional y for supervised learning
y = kwargs.get('y', None)

Expand All @@ -165,21 +162,27 @@ def data(self, value_):
self._data.append(x)
else:
raise InputFormatError(f'Invalid input element in position {i}, only numpy.ndarrays allowed.')
elif isinstance(value, Model):
self._data.append(value)
else:
raise InputFormatError(f'Only ndarray or list/tuple of ndarray allowed. But was of type {type(value)}'
f' and looks like {value}.')
raise InputFormatError(f'Only model, ndarray or list/tuple of ndarray allowed. '
f'But was of type {type(value)}: {value}.')

def __enter__(self):
import numpy as np
self.old_writable_flags = []
for array in self.data:
self.old_writable_flags.append(array.flags.writeable)
# set ndarray writabe flags to false
array.flags.writeable = False
for d in self.data:
if isinstance(d, np.ndarray):
self.old_writable_flags.append(d.flags.writeable)
# set ndarray writabe flags to false
d.flags.writeable = False

def __exit__(self, exc_type, exc_val, exc_tb):
# restore ndarray writable flags to old state
for array, writable in zip(self.data, self.old_writable_flags):
array.flags.writeable = writable
import numpy as np
for d, writable in zip(self.data, self.old_writable_flags):
if isinstance(d, np.ndarray):
d.flags.writeable = writable

def __call__(self, *args, **kwargs):
# extract input data from args, **kwargs (namely x and y)
Expand Down
4 changes: 1 addition & 3 deletions sktime/clustering/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def __init__(self, n_clusters, max_iter=5, metric=None,
This is used to resume the kmeans iteration. Note, that if this is set, the init_strategy is ignored and
the centers are directly passed to the kmeans iteration algorithm.
"""

super(KmeansClustering, self).__init__()
if n_jobs is None:
# todo: sensible choice?
# todo in sklearn: None -> 1 job, -1 -> all cpus (logical)
Expand All @@ -105,8 +105,6 @@ def __init__(self, n_clusters, max_iter=5, metric=None,
self.n_jobs = n_jobs
self.initial_centers = initial_centers

super(KmeansClustering, self).__init__()

def fetch_model(self) -> KMeansClusteringModel:
return self._model

Expand Down
10 changes: 5 additions & 5 deletions sktime/data/double_well.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class DoubleWellDiscrete(object):
def __init__(self):
dtraj, msm = _load_double_well_discrete()
self._dtraj = dtraj
self._msm = msm
self._analytic_msm = msm

@property
def dtraj(self):
Expand Down Expand Up @@ -65,12 +65,12 @@ def dtraj_n(self, divides):
@property
def transition_matrix(self):
""" Exact transition matrix used to generate the data """
return self.msm.transition_matrix
return self.analytic_msm.transition_matrix

@property
def msm(self):
def analytic_msm(self):
""" Returns an MSM object with the exact transition matrix """
return self._msm
return self._analytic_msm

def simulate_trajectory(self, n_steps, start=None, stop=None, dt=1) -> _np.ndarray:
"""
Expand All @@ -85,7 +85,7 @@ def simulate_trajectory(self, n_steps, start=None, stop=None, dt=1) -> _np.ndarr
-------
a discrete trajectory
"""
return self.msm.simulate(n_steps, start=start, stop=stop, dt=dt)
return self.analytic_msm.simulate(n_steps, start=start, stop=stop, dt=dt)

def simulate_trajectories(self, n_trajectories: int, n_steps: int,
start=None, stop=None, dt=1) -> List[_np.ndarray]:
Expand Down
3 changes: 1 addition & 2 deletions sktime/markovprocess/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@
from .maximum_likelihood_msm import MaximumLikelihoodMSM
from .bayesian_msm import BayesianMSM
from .pcca import pcca
from .transition_counting import TransitionCountEstimator, TransitionCountModel

from .reactive_flux import ReactiveFlux

from ._base import score_cv
from .chapman_kolmogorov_validator import cktest

166 changes: 86 additions & 80 deletions sktime/markovprocess/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,87 +4,107 @@

from sktime.base import Estimator, Model
from sktime.markovprocess import MarkovStateModel
from sktime.markovprocess.transition_counting import blocksplit_dtrajs, cvsplit_dtrajs
# TODO: we do not need this anymore!
from sktime.util import confidence_interval, ensure_dtraj_list


class _MSMBaseEstimator(Estimator):
r"""Maximum likelihood estimator for MSMs given discrete trajectory statistics
def blocksplit_dtrajs(dtrajs, lag=1, sliding=True, shift=None, random_state=None):
""" Splits the discrete trajectories into approximately uncorrelated fragments

Will split trajectories into fragments of lengths lag or longer. These fragments
are overlapping in order to conserve the transition counts at given lag.
If sliding=True, the resulting trajectories will lead to exactly the same count
matrix as when counted from dtrajs. If sliding=False (sampling at lag), the
count matrices are only equal when also setting shift=0.

Parameters
----------
dtrajs : list of ndarray(int)
Discrete trajectories
lag : int
lag time at which transitions are counted and the transition matrix is
estimated.
Lag time at which counting will be done. If sh
clonker marked this conversation as resolved.
Show resolved Hide resolved
sliding : bool
True for splitting trajectories for sliding count, False if lag-sampling will be applied
shift : None or int
Start of first full tau-window. If None, shift will be randomly generated

reversible : bool, optional, default = True
If true compute reversible MarkovStateModel, else non-reversible MarkovStateModel
"""
from sklearn.utils.random import check_random_state
dtrajs_new = []
random_state = check_random_state(random_state)
for dtraj in dtrajs:
if len(dtraj) <= lag:
continue
if shift is None:
s = random_state.randint(min(lag, dtraj.size - lag))
else:
s = shift
if sliding:
if s > 0:
dtrajs_new.append(dtraj[0:lag + s])
for t0 in range(s, dtraj.size - lag, lag):
dtrajs_new.append(dtraj[t0:t0 + 2 * lag])
else:
for t0 in range(s, dtraj.size - lag, lag):
dtrajs_new.append(dtraj[t0:t0 + lag + 1])
return dtrajs_new

count_mode : str, optional, default='sliding'
mode to obtain count matrices from discrete trajectories. Should be
one of:

* 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
at time indexes
def cvsplit_dtrajs(dtrajs, random_state=None):
""" Splits the trajectories into a training and test set with approximately equal number of trajectories

.. math::
Parameters
----------
dtrajs : list of ndarray(int)
Discrete trajectories

(0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)
"""
from sklearn.utils.random import check_random_state
if len(dtrajs) == 1:
raise ValueError('Only have a single trajectory. Cannot be split into train and test set')
random_state = check_random_state(random_state)
I0 = random_state.choice(len(dtrajs), int(len(dtrajs) / 2), replace=False)
I1 = np.array(list(set(list(np.arange(len(dtrajs)))) - set(list(I0))))
dtrajs_train = [dtrajs[i] for i in I0]
dtrajs_test = [dtrajs[i] for i in I1]
return dtrajs_train, dtrajs_test

* 'effective' : Uses an estimate of the transition counts that are
statistically uncorrelated. Recommended when used with a
Bayesian MarkovStateModel.
* 'sample' : A trajectory of length T will have :math:`T/\tau` counts
at time indexes

.. math::
class _MSMBaseEstimator(Estimator):
r"""Maximum likelihood estimator for MSMs given discrete trajectory statistics

(0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)
Parameters
----------
reversible : bool, optional, default = True
If true compute reversible MarkovStateModel, else non-reversible MarkovStateModel

sparse : bool, optional, default = False
If true compute count matrix, transition matrix and all derived
quantities using sparse matrix algebra. In this case python sparse
matrices will be returned by the corresponding functions instead of
numpy arrays. This behavior is suggested for very large numbers of
states (e.g. > 4000) because it is likely to be much more efficient.

dt_traj : str, optional, default='1 step'
Description of the physical time of the input trajectories. May be used
by analysis algorithms such as plotting tools to pretty-print the axes.
By default '1 step', i.e. there is no physical time unit. Specify by a
number, whitespace and unit. Permitted units are (* is an arbitrary
string). E.g. 200 picoseconds or 200ps.

mincount_connectivity : float or '1/n'
minimum number of counts to consider a connection between two states.
Counts lower than that will count zero in the connectivity check and
may thus separate the resulting transition matrix. The default
evaluates to 1/n_states.

"""

def __init__(self, lagtime=1, reversible=True, count_mode='sliding', sparse=False,
dt_traj='1 step', mincount_connectivity='1/n'):
def __init__(self, reversible=True, sparse=False):
super(_MSMBaseEstimator, self).__init__()
self.lagtime = lagtime

# set basic parameters
self.reversible = reversible

# sparse matrix computation wanted?
self.sparse = sparse

# store counting mode (lowercase)
self.count_mode = count_mode
if self.count_mode not in ('sliding', 'effective', 'sample'):
raise ValueError('count mode ' + count_mode + ' is unknown.')
@property
def reversible(self) -> bool:
return self._reversible

# time step
self.dt_traj = dt_traj
@reversible.setter
def reversible(self, value: bool):
self._reversible = value

# connectivity
self.mincount_connectivity = mincount_connectivity
@property
def sparse(self) -> bool:
return self._sparse

@sparse.setter
def sparse(self, value: bool):
self._sparse = value


class BayesianPosterior(Model):
Expand Down Expand Up @@ -115,28 +135,6 @@ def gather_stats(self, quantity, store_samples=False, *args, **kwargs):
samples = [call_member(s, quantity, *args, **kwargs) for s in self]
return QuantityStatistics(samples, quantity=quantity, store_samples=store_samples)

def submodel_largest(self, strong=True, mincount_connectivity='1/n', observe_nonempty=True, dtrajs=None):
dtrajs = ensure_dtraj_list(dtrajs)
states = self.prior.states_largest(strong=strong, mincount_connectivity=mincount_connectivity)
obs = self.prior.nonempty_obs(dtrajs) if observe_nonempty else None
return self.submodel(states=states, obs=obs, mincount_connectivity=mincount_connectivity)

def submodel_populous(self, strong=True, mincount_connectivity='1/n', observe_nonempty=True, dtrajs=None):
dtrajs = ensure_dtraj_list(dtrajs)
states = self.prior.states_populous(strong=strong, mincount_connectivity=mincount_connectivity)
obs = self.prior.nonempty_obs(dtrajs) if observe_nonempty else None
return self.submodel(states=states, obs=obs, mincount_connectivity=mincount_connectivity)

def submodel(self, states=None, obs=None, mincount_connectivity='1/n'):
# restrict prior
sub_model = self.prior.submodel(states=states, obs=obs,
mincount_connectivity=mincount_connectivity)
# restrict reduce samples
count_model = sub_model.count_model
subsamples = [sample.submodel(states=count_model.active_set, obs=count_model.observable_set)
for sample in self]
return BayesianPosterior(sub_model, subsamples)


class QuantityStatistics(Model):
""" Container for statistical quantities computed on samples.
Expand Down Expand Up @@ -184,7 +182,8 @@ def __init__(self, samples: typing.List[np.ndarray], quantity, store_samples=Fal
self.R *= unit


def score_cv(estimator: _MSMBaseEstimator, dtrajs, n=10, score_method='VAMP2', score_k=10, random_state=None):
def score_cv(estimator: _MSMBaseEstimator, dtrajs, lagtime, n=10, count_mode="sliding", score_method='VAMP2',
score_k=10, random_state=None):
r""" Scores the MSM using the variational approach for Markov processes [1]_ [2]_ and cross-validation [3]_ .

Divides the data into training and test data, fits a MSM using the training
Expand All @@ -200,11 +199,16 @@ def score_cv(estimator: _MSMBaseEstimator, dtrajs, n=10, score_method='VAMP2', s
----------
estimator : MSMBaseEstimator like
estimator to produce models for CV.
dtrajs : list of arrays
dtrajs : list of array_like
Test data (discrete trajectories).
lagtime : int
lag time
n : number of samples
Number of repetitions of the cross-validation. Use large n to get solid
means of the score.
count_mode : str, optional, default='sliding'
counting mode of count matrix estimator, if sliding the trajectory is split in a sliding window fashion.
Supports 'sliding' and 'sample'.
score_method : str, optional, default='VAMP2'
Overwrite scoring method to be used if desired. If `None`, the estimators scoring
method will be used.
Expand Down Expand Up @@ -232,17 +236,19 @@ def score_cv(estimator: _MSMBaseEstimator, dtrajs, n=10, score_method='VAMP2', s
dynamics simulation. J. Chem. Theory Comput. 11, 5002-5011 (2015).

"""
from sktime.markovprocess import TransitionCountEstimator
from sktime.util import ensure_dtraj_list
dtrajs = ensure_dtraj_list(dtrajs) # ensure format
if estimator.count_mode not in ('sliding', 'sample'):
if count_mode not in ('sliding', 'sample'):
raise ValueError('score_cv currently only supports count modes "sliding" and "sample"')
sliding = estimator.count_mode == 'sliding'
sliding = count_mode == 'sliding'
scores = []
for fold in range(n):
dtrajs_split = blocksplit_dtrajs(dtrajs, lag=estimator.lagtime, sliding=sliding, random_state=random_state)
dtrajs_split = blocksplit_dtrajs(dtrajs, lag=lagtime, sliding=sliding, random_state=random_state)
dtrajs_train, dtrajs_test = cvsplit_dtrajs(dtrajs_split, random_state=random_state)
model = estimator.fit(dtrajs_train).fetch_model()

cc = TransitionCountEstimator(lagtime, count_mode).fit(dtrajs_train).fetch_model().submodel_largest()
model = estimator.fit(cc).fetch_model()
s = model.score(dtrajs_test, score_method=score_method, score_k=score_k)
scores.append(s)
return np.array(scores)

Loading