deeptime-ml · clonker · Jan 24, 2020 · Jan 14, 2020 · Jan 14, 2020 · Jan 15, 2020
diff --git a/sktime/base.py b/sktime/base.py
@@ -5,22 +5,19 @@
 
 
 class _base_methods_mixin(object, metaclass=abc.ABCMeta):
-    """ defines common methods used by both Estimator and Model classes.
+    """ Defines common methods used by both Estimator and Model classes. These are mostly static and low-level
+    checking of conformity with respect to scikit-time conventions.
     """
 
     def __repr__(self):
         name = '{cls}-{id}:'.format(id=id(self), cls=self.__class__.__name__)
         return '{name}{params}]'.format(name=name,
-            params=pprint_sklearn(self.get_params(), offset=len(name), )
+                                        params=pprint_sklearn(self.get_params(), offset=len(name), )
         )
 
-    def get_params(self, deep=True):
-        """Get parameters of this kernel.
-        Parameters
-        ----------
-        deep : boolean, optional
-            If True, will return the parameters for this estimator and
-            contained subobjects that are estimators.
+    def get_params(self):
+        r"""Get parameters of this kernel.
+
         Returns
         -------
         params : mapping of string to any
@@ -140,9 +137,9 @@ def data(self):
 
     @data.setter
     def data(self, value_):
+        import numpy as np
         args, kwargs = value_
         # store data as a list of ndarrays
-        import numpy as np
         # handle optional y for supervised learning
         y = kwargs.get('y', None)
 
@@ -165,21 +162,27 @@ def data(self, value_):
                     self._data.append(x)
                 else:
                     raise InputFormatError(f'Invalid input element in position {i}, only numpy.ndarrays allowed.')
+        elif isinstance(value, Model):
+            self._data.append(value)
         else:
-            raise InputFormatError(f'Only ndarray or list/tuple of ndarray allowed. But was of type {type(value)}'
-                                   f' and looks like {value}.')
+            raise InputFormatError(f'Only model, ndarray or list/tuple of ndarray allowed. '
+                                   f'But was of type {type(value)}: {value}.')
 
     def __enter__(self):
+        import numpy as np
         self.old_writable_flags = []
-        for array in self.data:
-            self.old_writable_flags.append(array.flags.writeable)
-            # set ndarray writabe flags to false
-            array.flags.writeable = False
+        for d in self.data:
+            if isinstance(d, np.ndarray):
+                self.old_writable_flags.append(d.flags.writeable)
+                # set ndarray writabe flags to false
+                d.flags.writeable = False
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         # restore ndarray writable flags to old state
-        for array, writable in zip(self.data, self.old_writable_flags):
-            array.flags.writeable = writable
+        import numpy as np
+        for d, writable in zip(self.data, self.old_writable_flags):
+            if isinstance(d, np.ndarray):
+                d.flags.writeable = writable
 
     def __call__(self, *args, **kwargs):
         # extract input data from args, **kwargs (namely x and y)

diff --git a/sktime/clustering/kmeans.py b/sktime/clustering/kmeans.py
@@ -85,7 +85,7 @@ def __init__(self, n_clusters, max_iter=5, metric=None,
             This is used to resume the kmeans iteration. Note, that if this is set, the init_strategy is ignored and
             the centers are directly passed to the kmeans iteration algorithm.
         """
-
+        super(KmeansClustering, self).__init__()
         if n_jobs is None:
             # todo: sensible choice?
             # todo in sklearn: None -> 1 job, -1 -> all cpus (logical)
@@ -105,8 +105,6 @@ def __init__(self, n_clusters, max_iter=5, metric=None,
         self.n_jobs = n_jobs
         self.initial_centers = initial_centers
 
-        super(KmeansClustering, self).__init__()
-
     def fetch_model(self) -> KMeansClusteringModel:
         return self._model
 

diff --git a/sktime/data/double_well.py b/sktime/data/double_well.py
@@ -28,7 +28,7 @@ class DoubleWellDiscrete(object):
     def __init__(self):
         dtraj, msm = _load_double_well_discrete()
         self._dtraj = dtraj
-        self._msm = msm
+        self._analytic_msm = msm
 
     @property
     def dtraj(self):
@@ -65,12 +65,12 @@ def dtraj_n(self, divides):
     @property
     def transition_matrix(self):
         """ Exact transition matrix used to generate the data """
-        return self.msm.transition_matrix
+        return self.analytic_msm.transition_matrix
 
     @property
-    def msm(self):
+    def analytic_msm(self):
         """ Returns an MSM object with the exact transition matrix """
-        return self._msm
+        return self._analytic_msm
 
     def simulate_trajectory(self, n_steps, start=None, stop=None, dt=1) -> _np.ndarray:
         """
@@ -85,7 +85,7 @@ def simulate_trajectory(self, n_steps, start=None, stop=None, dt=1) -> _np.ndarr
         -------
         a discrete trajectory
         """
-        return self.msm.simulate(n_steps, start=start, stop=stop, dt=dt)
+        return self.analytic_msm.simulate(n_steps, start=start, stop=stop, dt=dt)
 
     def simulate_trajectories(self, n_trajectories: int, n_steps: int,
                               start=None, stop=None, dt=1) -> List[_np.ndarray]:

diff --git a/sktime/markovprocess/__init__.py b/sktime/markovprocess/__init__.py
@@ -15,9 +15,8 @@
 from .maximum_likelihood_msm import MaximumLikelihoodMSM
 from .bayesian_msm import BayesianMSM
 from .pcca import pcca
+from .transition_counting import TransitionCountEstimator, TransitionCountModel
 
 from .reactive_flux import ReactiveFlux
 
 from ._base import score_cv
-from .chapman_kolmogorov_validator import cktest
-
diff --git a/sktime/markovprocess/_base.py b/sktime/markovprocess/_base.py
@@ -4,87 +4,107 @@
 
 from sktime.base import Estimator, Model
 from sktime.markovprocess import MarkovStateModel
-from sktime.markovprocess.transition_counting import blocksplit_dtrajs, cvsplit_dtrajs
-# TODO: we do not need this anymore!
 from sktime.util import confidence_interval, ensure_dtraj_list
 
 
-class _MSMBaseEstimator(Estimator):
-    r"""Maximum likelihood estimator for MSMs given discrete trajectory statistics
+def blocksplit_dtrajs(dtrajs, lag=1, sliding=True, shift=None, random_state=None):
+    """ Splits the discrete trajectories into approximately uncorrelated fragments
+
+    Will split trajectories into fragments of lengths lag or longer. These fragments
+    are overlapping in order to conserve the transition counts at given lag.
+    If sliding=True, the resulting trajectories will lead to exactly the same count
+    matrix as when counted from dtrajs. If sliding=False (sampling at lag), the
+    count matrices are only equal when also setting shift=0.
 
     Parameters
     ----------
+    dtrajs : list of ndarray(int)
+        Discrete trajectories
     lag : int
-        lag time at which transitions are counted and the transition matrix is
-        estimated.
+        Lag time at which counting will be done. If sh
+    sliding : bool
+        True for splitting trajectories for sliding count, False if lag-sampling will be applied
+    shift : None or int
+        Start of first full tau-window. If None, shift will be randomly generated
 
-    reversible : bool, optional, default = True
-        If true compute reversible MarkovStateModel, else non-reversible MarkovStateModel
+    """
+    from sklearn.utils.random import check_random_state
+    dtrajs_new = []
+    random_state = check_random_state(random_state)
+    for dtraj in dtrajs:
+        if len(dtraj) <= lag:
+            continue
+        if shift is None:
+            s = random_state.randint(min(lag, dtraj.size - lag))
+        else:
+            s = shift
+        if sliding:
+            if s > 0:
+                dtrajs_new.append(dtraj[0:lag + s])
+            for t0 in range(s, dtraj.size - lag, lag):
+                dtrajs_new.append(dtraj[t0:t0 + 2 * lag])
+        else:
+            for t0 in range(s, dtraj.size - lag, lag):
+                dtrajs_new.append(dtraj[t0:t0 + lag + 1])
+    return dtrajs_new
 
-    count_mode : str, optional, default='sliding'
-        mode to obtain count matrices from discrete trajectories. Should be
-        one of:
 
-        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
-          at time indexes
+def cvsplit_dtrajs(dtrajs, random_state=None):
+    """ Splits the trajectories into a training and test set with approximately equal number of trajectories
 
-          .. math::
+    Parameters
+    ----------
+    dtrajs : list of ndarray(int)
+        Discrete trajectories
 
-             (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)
+    """
+    from sklearn.utils.random import check_random_state
+    if len(dtrajs) == 1:
+        raise ValueError('Only have a single trajectory. Cannot be split into train and test set')
+    random_state = check_random_state(random_state)
+    I0 = random_state.choice(len(dtrajs), int(len(dtrajs) / 2), replace=False)
+    I1 = np.array(list(set(list(np.arange(len(dtrajs)))) - set(list(I0))))
+    dtrajs_train = [dtrajs[i] for i in I0]
+    dtrajs_test = [dtrajs[i] for i in I1]
+    return dtrajs_train, dtrajs_test
 
-        * 'effective' : Uses an estimate of the transition counts that are
-          statistically uncorrelated. Recommended when used with a
-          Bayesian MarkovStateModel.
-        * 'sample' : A trajectory of length T will have :math:`T/\tau` counts
-          at time indexes
 
-          .. math::
+class _MSMBaseEstimator(Estimator):
+    r"""Maximum likelihood estimator for MSMs given discrete trajectory statistics
 
-                (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)
+    Parameters
+    ----------
+    reversible : bool, optional, default = True
+        If true compute reversible MarkovStateModel, else non-reversible MarkovStateModel
 
     sparse : bool, optional, default = False
         If true compute count matrix, transition matrix and all derived
         quantities using sparse matrix algebra. In this case python sparse
         matrices will be returned by the corresponding functions instead of
         numpy arrays. This behavior is suggested for very large numbers of
         states (e.g. > 4000) because it is likely to be much more efficient.
-
-    dt_traj : str, optional, default='1 step'
-        Description of the physical time of the input trajectories. May be used
-        by analysis algorithms such as plotting tools to pretty-print the axes.
-        By default '1 step', i.e. there is no physical time unit. Specify by a
-        number, whitespace and unit. Permitted units are (* is an arbitrary
-        string). E.g. 200 picoseconds or 200ps.
-
-    mincount_connectivity : float or '1/n'
-        minimum number of counts to consider a connection between two states.
-        Counts lower than that will count zero in the connectivity check and
-        may thus separate the resulting transition matrix. The default
-        evaluates to 1/n_states.
-
     """
 
-    def __init__(self, lagtime=1, reversible=True, count_mode='sliding', sparse=False,
-                 dt_traj='1 step', mincount_connectivity='1/n'):
+    def __init__(self, reversible=True, sparse=False):
         super(_MSMBaseEstimator, self).__init__()
-        self.lagtime = lagtime
-
-        # set basic parameters
         self.reversible = reversible
-
-        # sparse matrix computation wanted?
         self.sparse = sparse
 
-        # store counting mode (lowercase)
-        self.count_mode = count_mode
-        if self.count_mode not in ('sliding', 'effective', 'sample'):
-            raise ValueError('count mode ' + count_mode + ' is unknown.')
+    @property
+    def reversible(self) -> bool:
+        return self._reversible
 
-        # time step
-        self.dt_traj = dt_traj
+    @reversible.setter
+    def reversible(self, value: bool):
+        self._reversible = value
 
-        # connectivity
-        self.mincount_connectivity = mincount_connectivity
+    @property
+    def sparse(self) -> bool:
+        return self._sparse
+
+    @sparse.setter
+    def sparse(self, value: bool):
+        self._sparse = value
 
 
 class BayesianPosterior(Model):
@@ -115,28 +135,6 @@ def gather_stats(self, quantity, store_samples=False, *args, **kwargs):
         samples = [call_member(s, quantity, *args, **kwargs) for s in self]
         return QuantityStatistics(samples, quantity=quantity, store_samples=store_samples)
 
-    def submodel_largest(self, strong=True, mincount_connectivity='1/n', observe_nonempty=True, dtrajs=None):
-        dtrajs = ensure_dtraj_list(dtrajs)
-        states = self.prior.states_largest(strong=strong, mincount_connectivity=mincount_connectivity)
-        obs = self.prior.nonempty_obs(dtrajs) if observe_nonempty else None
-        return self.submodel(states=states, obs=obs, mincount_connectivity=mincount_connectivity)
-
-    def submodel_populous(self, strong=True, mincount_connectivity='1/n', observe_nonempty=True, dtrajs=None):
-        dtrajs = ensure_dtraj_list(dtrajs)
-        states = self.prior.states_populous(strong=strong, mincount_connectivity=mincount_connectivity)
-        obs = self.prior.nonempty_obs(dtrajs) if observe_nonempty else None
-        return self.submodel(states=states, obs=obs, mincount_connectivity=mincount_connectivity)
-
-    def submodel(self, states=None, obs=None, mincount_connectivity='1/n'):
-        # restrict prior
-        sub_model = self.prior.submodel(states=states, obs=obs,
-                                        mincount_connectivity=mincount_connectivity)
-        # restrict reduce samples
-        count_model = sub_model.count_model
-        subsamples = [sample.submodel(states=count_model.active_set, obs=count_model.observable_set)
-                      for sample in self]
-        return BayesianPosterior(sub_model, subsamples)
-
 
 class QuantityStatistics(Model):
     """ Container for statistical quantities computed on samples.
@@ -184,7 +182,8 @@ def __init__(self, samples: typing.List[np.ndarray], quantity, store_samples=Fal
             self.R *= unit
 
 
-def score_cv(estimator: _MSMBaseEstimator, dtrajs, n=10, score_method='VAMP2', score_k=10, random_state=None):
+def score_cv(estimator: _MSMBaseEstimator, dtrajs, lagtime, n=10, count_mode="sliding", score_method='VAMP2',
+             score_k=10, random_state=None):
     r""" Scores the MSM using the variational approach for Markov processes [1]_ [2]_ and cross-validation [3]_ .
 
     Divides the data into training and test data, fits a MSM using the training
@@ -200,11 +199,16 @@ def score_cv(estimator: _MSMBaseEstimator, dtrajs, n=10, score_method='VAMP2', s
     ----------
     estimator : MSMBaseEstimator like
         estimator to produce models for CV.
-    dtrajs : list of arrays
+    dtrajs : list of array_like
         Test data (discrete trajectories).
+    lagtime : int
+        lag time
     n : number of samples
         Number of repetitions of the cross-validation. Use large n to get solid
         means of the score.
+    count_mode : str, optional, default='sliding'
+        counting mode of count matrix estimator, if sliding the trajectory is split in a sliding window fashion.
+        Supports 'sliding' and 'sample'.
     score_method : str, optional, default='VAMP2'
         Overwrite scoring method to be used if desired. If `None`, the estimators scoring
         method will be used.
@@ -232,17 +236,19 @@ def score_cv(estimator: _MSMBaseEstimator, dtrajs, n=10, score_method='VAMP2', s
         dynamics simulation. J. Chem. Theory Comput. 11, 5002-5011 (2015).
 
     """
+    from sktime.markovprocess import TransitionCountEstimator
     from sktime.util import ensure_dtraj_list
     dtrajs = ensure_dtraj_list(dtrajs)  # ensure format
-    if estimator.count_mode not in ('sliding', 'sample'):
+    if count_mode not in ('sliding', 'sample'):
         raise ValueError('score_cv currently only supports count modes "sliding" and "sample"')
-    sliding = estimator.count_mode == 'sliding'
+    sliding = count_mode == 'sliding'
     scores = []
     for fold in range(n):
-        dtrajs_split = blocksplit_dtrajs(dtrajs, lag=estimator.lagtime, sliding=sliding, random_state=random_state)
+        dtrajs_split = blocksplit_dtrajs(dtrajs, lag=lagtime, sliding=sliding, random_state=random_state)
         dtrajs_train, dtrajs_test = cvsplit_dtrajs(dtrajs_split, random_state=random_state)
-        model = estimator.fit(dtrajs_train).fetch_model()
+
+        cc = TransitionCountEstimator(lagtime, count_mode).fit(dtrajs_train).fetch_model().submodel_largest()
+        model = estimator.fit(cc).fetch_model()
         s = model.score(dtrajs_test, score_method=score_method, score_k=score_k)
         scores.append(s)
     return np.array(scores)
-