## Import Libraries

In [1]:
using PyCall
using Megafauna
using DelimitedFiles
include("../src/SegmentDistances.jl")
# import python library for ADP clustering using PyCall
Data = pyimport("dadapy.data")
# import python library for Deeptime using PyCall
deeptime = pyimport("deeptime")

PyObject <module 'deeptime' from '/Users/dcg/.julia/conda/3/x86_64/lib/python3.12/site-packages/deeptime/__init__.py'>

## Set Deeptime variables for VAMP scoring

In [2]:
decomposition = deeptime.decomposition
VAMP = decomposition.VAMP
vamp_score = decomposition.vamp_score

PyObject <function vamp_score at 0x1b39abec0>

## Load data

In [3]:
X = readdlm("../data/prinz/prinz.txt")[:,1];

100000-element Vector{Float64}:
  0.0
  0.051143667354386524
  0.08600807236266525
  0.007153524256075693
  0.14186588177505005
  0.14685408069336753
  0.2535870479970487
  0.18941749180005957
  0.3047502923621745
  0.18713377631989392
  0.21610093725491342
  0.2072503732460562
  0.34310215731208693
  ⋮
 -0.7744094981718489
 -0.7757513160218188
 -0.8781407678458266
 -0.9277912881824416
 -0.8307554860495173
 -0.6648800452189343
 -0.6931762294352226
 -0.6701892664526988
 -0.5470280921110552
 -0.8015793197195583
 -0.8804054232794996
 -0.9271776014688877

## Set Adv. Density Peaks variables, create and score model based on cps

In [5]:
# given a quantile cutoff q, a windowsize w, and a chunksize T, by default 10000
# this function 
# 1. computes change points in the time series according to q,w
# 2. computes pairwise segment distances using W2 distance
# 3. uses Advanced density peaks clustering to identify segment cluster labels in an unsupervised manner
# 4. labels the individual points of the segment according to their segment label
# 5. uses deeptime to fit a MSM model and output a vamp2

function score_parameters(q, w, T=10000)
    changes = compute_change_points(X[1:T], q, w)
    dists = pairwise_segment_distances_1d(X[1:T], changes)
    data = Data.Data(distances=dists)
    Z = 1.65
    halo = false
    labels = data.compute_clustering_ADP(Z=Z, halo=halo)
    pt_labels = label_series(X, changes, labels)
    model = VAMP(lagtime=5).fit(pt_labels).fetch_model()
    score = vamp_score(model, 2)
    return score
end

score_parameters (generic function with 2 methods)

## Grid search to find optimal parameters

In [6]:
N = 5
T = 10000
scores = zeros(N, N)
qs = LinRange(0.25, 0.95, N)
ws = LinRange(25, 100, N)
for i=1:N, j=1:N
    try
        scores[i,j] = score_parameters(qs[i], trunc(Int, ws[j]), T)
    catch err
        continue
    end
end
println(maximum(scores))

enumerating change points
number of dimensions: 1
Number of segments = 387
Computing 74691 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:03[39mm


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 253
Computing 31878 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 191
Computing 18145 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 149
Computing 11026 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 121
Computing 7260 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 357
Computing 63546 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 243
Computing 29403 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 167
Computing 13861 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 141
Computing 9870 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 121
Computing 7260 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 301
Computing 45150 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 197
Computing 19306 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 149
Computing 11026 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 129
Computing 8256 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 99
Computing 4851 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 233
Computing 27028 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 139
Computing 9591 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 97
Computing 4656 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 81
Computing 3240 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 67
Computing 2211 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 47
Computing 1081 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 41
Computing 820 segment distances
finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 31
Computing 465 segment distances
finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 23
Computing 253 segment distances
finished computing distances
enumerating change points
number of dimensions: 1
Number of segments = 19
Computing 171 segment distances
finished computing distances
1.9811666404450707


In [15]:
i, j = argmax(scores)[1], argmax(scores)[2];
q = qs[i];
w = trunc(Int, ws[j]);
println("Best q: $(q) and w: $(w)")

Best q: 0.95 and w: 43


## Reproduce the best model according to the VAMP2 score found above

In [16]:
changes = compute_change_points(X, q, w)
dists = pairwise_segment_distances_1d(X, changes)
data = Data.Data(distances=dists)
Z = 1.65
halo = false
labels = data.compute_clustering_ADP(Z=Z, halo=halo)
pt_labels = label_series(X, changes, labels)
model = VAMP(lagtime=5).fit(pt_labels).fetch_model()
score = vamp_score(model, 2)

enumerating change points
number of dimensions: 1
Number of segments = 351
Computing 61425 segment distances


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:12[39m


finished computing distances


1.9643225694624755