In [1]:
from collections.abc import Iterator

def word_generator(
    possible_observations,
    maxlen
) -> Iterator[str]:
    """Lazily enumerate every non-empty word of at most ``maxlen`` observations.

    Words are produced shortest first: each pass extends the words created by
    the previous pass with every observation, in the order the observations
    are given. Each distinct word is yielded exactly once and the empty word
    is never yielded.

    Parameters
    ----------
    possible_observations : iterable of str
        Symbols that are appended to build longer words.
    maxlen : int
        Maximum number of observations concatenated into one word. Values
        below 1 produce no words.

    Yields
    ------
    str
        The next word that has not been produced before.
    """
    # Materialise once so a one-shot iterable can be replayed for every prefix.
    symbols = tuple(possible_observations)

    # The empty word is the seed; marking it as seen keeps it out of the output.
    seen = {''}
    frontier = ['']

    for _ in range(maxlen):
        next_frontier = []

        for word in frontier:
            for obs in symbols:
                # Append another observation
                new_word = word + obs
                if new_word in seen:
                    continue

                # Record the word: without this, later passes would only ever
                # re-extend the empty word and repeat the one-symbol words.
                seen.add(new_word)
                next_frontier.append(new_word)
                yield new_word

        # Only the words created in this pass need extending in the next one.
        frontier = next_frontier

In [139]:
import re
import itertools

import numpy as np

In [459]:
def count_appearences(
    observation,
    *subwords
) -> int:
    """Count the (possibly overlapping) occurrences of a word in ``observation``.

    Parameters
    ----------
    observation : str
        Sequence that is searched.
    *subwords : str
        Pieces that are concatenated, in order, into the word to look for.

    Returns
    -------
    int
        Number of start positions in ``observation`` at which the word occurs.
        The empty word matches at every position, i.e.
        ``len(observation) + 1`` times.
    """
    word = "".join(subwords)
    # Search the sequence that was passed in (not a notebook-level global) and
    # escape the word so symbols such as "." or "*" are matched literally.
    # The zero-width lookahead lets overlapping occurrences all be counted.
    pattern = "(?=(" + re.escape(word) + "))"
    return len(re.findall(pattern, observation))


def f_estimate(
    observation,
    *subwords
):
    """Estimate the relative frequency of a word within ``observation``.

    Parameters
    ----------
    observation : str
        Sequence the estimate is taken from.
    *subwords : str
        Pieces that are concatenated, in order, into the word of interest.

    Returns
    -------
    float
        Occurrence count of the word (via ``count_appearences``) divided by
        ``len(observation) - len(word) + 1``, the number of positions at which
        a word of that length can start. ``0.0`` when the word is longer than
        the sequence.
    """
    word = "".join(subwords)
    n_windows = len(observation) - len(word) + 1

    # A word longer than the sequence cannot occur; returning early avoids a
    # division by zero (or by a negative window count).
    if n_windows <= 0:
        return 0.0

    count = count_appearences(observation, word)
    return count / n_windows


def get_matrix_estimates(
    observation,
    possible_observations,
    chr_w,
    ind_w,
) -> tuple[np.matrix, np.matrix, np.matrix, list[np.matrix]]:
    """Build the word-frequency matrices estimated from ``observation``.

    Rows are indexed by the words in ``chr_w`` and columns by the words in
    ``ind_w``; every entry is an ``f_estimate`` of a concatenated word.

    Parameters
    ----------
    observation : str
        Sequence the frequencies are estimated from.
    possible_observations : iterable of str
        Symbols ``z`` inserted between the two words for ``F_IzJ``.
    chr_w : sequence of str
        Words indexing the rows.
    ind_w : sequence of str
        Words indexing the columns.

    Returns
    -------
    F_0J : np.matrix, shape (1, len(ind_w))
        ``F_0J[0, j]`` is the estimate for ``ind_w[j]``.
    F_I0 : np.matrix, shape (len(chr_w), 1)
        ``F_I0[i, 0]`` is the estimate for ``chr_w[i]``.
    F_IJ : np.matrix, shape (len(chr_w), len(ind_w))
        ``F_IJ[i, j]`` is the estimate for ``ind_w[j] + chr_w[i]``.
    F_IzJ : list of np.matrix
        One matrix per symbol ``z`` (same order as ``possible_observations``);
        entry ``[i, j]`` is the estimate for ``ind_w[j] + z + chr_w[i]``.
    """
    # Materialise once: the symbols are iterated once to allocate the
    # matrices and again for every (row, column) pair.
    symbols = list(possible_observations)
    n_chr, n_ind = len(chr_w), len(ind_w)

    F_0J = np.asmatrix(np.zeros([1, n_ind]))
    F_I0 = np.asmatrix(np.zeros([n_chr, 1]))
    F_IJ = np.asmatrix(np.zeros([n_chr, n_ind]))
    F_IzJ = [np.asmatrix(np.zeros([n_chr, n_ind])) for _ in symbols]

    for cidx, cword in enumerate(chr_w):
        F_I0[cidx, 0] = f_estimate(observation, cword)

        for iidx, iword in enumerate(ind_w):
            # The column word comes first, then the row word.
            F_IJ[cidx, iidx] = f_estimate(observation, iword, cword)

            # Same pair with one symbol z inserted in between.
            for zidx, z in enumerate(symbols):
                F_IzJ[zidx][cidx, iidx] = f_estimate(observation, iword, z, cword)

    for iidx, iword in enumerate(ind_w):
        F_0J[0, iidx] = f_estimate(observation, iword)

    return F_0J, F_I0, F_IJ, F_IzJ

from sklearn.utils.extmath import randomized_svd
def get_CQ_by_svd(
    estimated_matrix,
    target_dimension,
    random_state=0,
    verbose=True,
):
    """Derive the matrices ``C`` and ``Q`` from a truncated randomized SVD.

    With ``estimated_matrix ~ U * S * Vt`` truncated to ``target_dimension``
    components, this returns ``C = U.T`` and ``Q = Vt.T * pinv(S)``.

    Parameters
    ----------
    estimated_matrix : np.matrix
        Matrix to decompose.
    target_dimension : int
        Number of singular components to keep.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``randomized_svd`` so repeated runs give the same
        factors. Pass ``None`` for a fresh random draw on every call.
    verbose : bool, default True
        Print the shapes of the input and of the SVD factors.

    Returns
    -------
    C : np.matrix
        ``U.T``; ``target_dimension`` rows, one column per row of the input.
    Q : np.matrix
        ``Vt.T * pinv(S)``; one row per column of the input,
        ``target_dimension`` columns.
    """
    F = estimated_matrix
    if verbose:
        print_shapes(F)
        print()

    U, S, Vt = randomized_svd(
        F, n_components=target_dimension, random_state=random_state
    )
    U = np.asmatrix(U)
    S = np.asmatrix(np.diag(S))
    Vt = np.asmatrix(Vt)

    # Computed once and shared by the diagnostic print and the result.
    S_pinv = np.linalg.pinv(S)

    if verbose:
        print_shapes(U, S, Vt)
        print()
        print_shapes(Vt.T, S_pinv)

    C = U.T
    Q = Vt.T * S_pinv

    return C, Q

In [460]:
def print_shapes(*mats):
    """Print the first two dimensions of each argument as ``(rows x cols)``.

    All shapes go on one line; each is followed by a single space and no
    newline is written.
    """
    for mat in mats:
        n_rows = mat.shape[0]
        n_cols = mat.shape[1]
        print("({} x {})".format(n_rows, n_cols), end=" ")

In [461]:
def estimate_OOM(
    observation,
    possible_observations,
    chr_w,
    ind_w,
    target_dimension
):
    """Estimate the triple ``(sigma, tau_z, omega)`` from an observed sequence.

    The frequency matrices come from ``get_matrix_estimates`` and the pair
    ``C``, ``Q`` from ``get_CQ_by_svd`` applied to ``F_IJ``.

    Returns
    -------
    (sigma, tau_z, omega) : tuple
        ``sigma`` is the linear functional ``F_0J * Q * inv(V)``, ``tau_z`` the
        list of observable operators ``C * F_IzJ[k] * Q * inv(V)`` (one per
        entry of ``F_IzJ``), and ``omega`` the state vector ``C * F_I0``,
        where ``V = C * F_IJ * Q``.
    F_IJ : np.matrix
        The estimated matrix that was decomposed.
    """
    estimates = get_matrix_estimates(observation, possible_observations, chr_w, ind_w)
    F_0J, F_I0, F_IJ, F_IzJ = estimates

    C, Q = get_CQ_by_svd(F_IJ, target_dimension)

    # Every quantity below is normalised by the inverse of V = C * F_IJ * Q.
    V_inv = np.linalg.inv(C * F_IJ * Q)

    # Linear functional
    sigma = F_0J * Q * V_inv

    # Observable operators, one per matrix in F_IzJ
    tau_z = [C * F_IkJ * Q * V_inv for F_IkJ in F_IzJ]

    # State vector
    omega = C * F_I0

    return (sigma, tau_z, omega), F_IJ

In [492]:
# Observed symbol sequence the model is estimated from.
sbar = "abbbaaaabaabbbabbbbb"
# Symbols that can occur in the sequence.
Sigma = ["a", "b"]

# Words indexing the rows (characteristic) and the columns (indicative) of
# the estimated frequency matrices.
characteristic = ["a", "b", "ab", "baa", "bbbb", "abb"]
indicative = ["a", "b", "ab", "bba", "aba", "abb", "baba"]

# Number of singular components kept by the truncated SVD.
d = 3

# Unpack the estimated triple (sigma, tau_z, omega) together with F_IJ.
(s, tau, w), F_IJ = estimate_OOM(sbar, Sigma, characteristic, indicative, d)

(6 x 7) 
(6 x 3) (3 x 3) (3 x 7) 
(7 x 3) (3 x 3) (3 x 6) (6 x 7) (7 x 3) 
(V)

(1 x 7) (7 x 3) (3 x 3) 
(sigma)



In [493]:
# Linear functional (sigma) returned by estimate_OOM.
s

matrix([[1.26559643, 0.10834783, 0.06432487]])

In [494]:
# Observable operators (tau_z), one matrix per symbol in Sigma.
tau

[matrix([[ 0.31516352,  0.50808543,  0.00149854],
         [ 0.21045622, -0.13676364,  0.73258352],
         [-0.00088359, -0.11535249, -0.09024112]]),
 matrix([[ 0.67045282, -0.28491454,  0.33098367],
         [-0.04495364, -0.31113318, -1.06394449],
         [-0.05085379,  0.40300997,  0.31762084]])]

In [495]:
# State vector (omega) returned by estimate_OOM.
w

matrix([[ 0.77235124],
        [ 0.14067197],
        [-0.04106427]])