In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from spectral_dagger.spectral.hankel import single_obs_basis
from spectral_dagger.utils.math import normalize
from spectral_dagger.hmm import HMM
from spectral_dagger.spectral import SpectralPSR

In [2]:
# Build a small random HMM and print one sample trajectory from it.
#
# Two seeds are involved: model_seed fixes the random draws that define the
# transition/observation matrices, while data_seed is drawn *before* the
# global RNG is seeded in this cell and is used to seed trajectory sampling.
model_seed = 1
data_seed = np.random.randint(10000)
np.random.seed(model_seed)

n_states = 5
n_obs = 6

observations = range(n_obs)
states = range(n_states)


def _random_normalized_matrix(n_rows, n_cols):
    """Draw a random 0/1 matrix and pass it through ``normalize``.

    Each entry is an independent Bernoulli(0.5) draw. Any row that comes out
    all zeros is overwritten with ones before normalization, so no row is
    empty when it reaches ``normalize``.
    """
    support = np.random.binomial(1, 0.5, (n_rows, n_cols))
    support[support.sum(axis=1) == 0] = 1
    # presumably rescales each row to sum to one -- confirm against
    # spectral_dagger.utils.math.normalize
    return normalize(support, ord=1, conservative=True)


# The observation matrix is drawn first, then the transition matrix; the
# order matters because both consume the same seeded global RNG stream.
O = _random_normalized_matrix(n_states, n_obs)
T = _random_normalized_matrix(n_states, n_states)

# Initial distribution built from an all-ones vector over the states.
init_dist = normalize(np.ones(n_states), ord=1, conservative=True)

hmm = HMM(observations, states, T, O, init_dist)

# Re-seed so that everything sampled from here on is driven by data_seed.
np.random.seed(data_seed)
print(hmm.sample_trajectory(10))

# Sweep over estimator type, model dimension and training-set size. For each
# combination, fit a SpectralPSR on freshly sampled trajectories and record
# its perplexity on a fixed set of held-out trajectories.
horizon = 3
test_samples = [hmm.sample_trajectory(horizon) for _ in range(1000)]
results = []

estimators = ['string', 'prefix', 'substring', 'single']
sizes = np.linspace(1000, 10000, 5).astype('i')
dimensions = np.linspace(n_states, 2 * n_states, 5).astype('i')

for estimator in estimators:
    for m in dimensions:
        for n_samples in sizes:

            print("*" * 80)
            samples = [
                hmm.sample_trajectory(horizon) for _ in range(n_samples)]
            psr = SpectralPSR(hmm.observations)

            # 'single' is fitted as the 'substring' estimator with a
            # single-observation basis. The name handed to fit() is kept in
            # its own variable: rebinding the loop variable `estimator` here
            # would persist across the inner iterations, so every later
            # (m, n_samples) combination would silently lose the basis and
            # be recorded under the wrong label.
            if estimator == 'single':
                fit_estimator = 'substring'
                basis = single_obs_basis(hmm.observations, True)
            else:
                fit_estimator = estimator
                basis = None

            psr.fit(samples, m, fit_estimator, basis=basis)

            # Log-likelihood is requested in base 2, so 2**(-llh) is the
            # matching perplexity.
            llh = psr.get_log_likelihood(test_samples, base=2)
            perplexity = 2**(-llh)

            # Single-argument print calls behave the same as a statement
            # (Python 2) and as a function (Python 3); the text matches the
            # original two-item print statements.
            print("$" * 80)
            print("n_samples:  %s" % n_samples)
            print("Perplexity:  %s" % perplexity)

            # Record the sweep label (including 'single') so that results
            # can be filtered by the names listed in `estimators`.
            results.append(dict(
                n_samples=n_samples, estimator=estimator,
                dimension=m, perplexity=perplexity))

results = pd.DataFrame.from_records(results)

[3, 2, 3, 3, 3, 3, 1, 5, 3, 5]
********************************************************************************
Generating basis...
Estimating Hankels...
Performing SVD...
Computing operators...
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
n_samples:  1000
Perplexity:  56.6468097089
********************************************************************************
Generating basis...
Estimating Hankels...
Performing SVD...
Computing operators...
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
n_samples:  3250
Perplexity:  55.6792379784
********************************************************************************
Generating basis...
Estimating Hankels...
Performing SVD...
Computing operators...
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
n_samples:  5500
Perplexity:  55.6453636049
********************************************************************************
Generating basis...
E

In [9]:
# Plot perplexity against training-set size, one curve per
# (estimator, dimension) pair. The line style identifies the estimator.
linestyles = {'string': '-', 'prefix': '-.', 'substring': '--', 'single': ':'}

fig, ax = plt.subplots()

for estimator in estimators:
    # The estimator mask does not depend on the dimension; compute it once.
    is_estimator = results['estimator'] == estimator
    for m in dimensions:
        data = results[is_estimator & (results['dimension'] == m)]
        ax.plot(
            data['n_samples'], data['perplexity'],
            linestyle=linestyles[estimator],
            label='%s, Dim=%d' % (estimator, m))

ax.set_xlabel('Number of training samples')
ax.set_ylabel('Test perplexity')
# Fixed y-range; curves outside (45, 80) are clipped from view.
ax.set_ylim((45, 80))

# Legend sits outside the axes on the right; the right margin is shrunk
# below to make room for it.
ax.legend(
    loc='center left', bbox_to_anchor=(1.05, 0.5), prop={'size': 10},
    handlelength=2.0, handletextpad=.5, shadow=False, frameon=False)
fig.subplots_adjust(wspace=0.3, left=0.1, right=0.75, bottom=0.15)

plt.show()