In [1]:
from matplotlib.collections import LineCollection
from matplotlib import pyplot as plt
from matplotlib import cm, colors
from sklearn.manifold import TSNE, MDS
from sklearn.decomposition import PCA
from hmmlearn import hmm
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import numpy as np

import os
import sys
# Make the parent directory importable; the `src.*` imports further down in this
# cell are issued after this path tweak (presumably the notebook lives one level
# below the repository root - confirm).
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Print arrays on wide lines and without scientific notation.
np.set_printoptions(linewidth=300, suppress=True)
# Globally silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

from src.experiments.util_experiments import *
from src.oom import DiscreteValuedOOM, ContinuousValuedOOM
from src.oom.discrete_observable import DiscreteObservable
from src.oom.util.few_step_prediction import quantify_distribution, kl_divergence, fix_pvec
from src.oom.util.random_sparse import _generate_sparse_full_rank_matrix, _generate_observable_compound
from src.oom.util.learning_discrete import estimate_matrices_discrete_fixed

In [7]:
def compute_cross_entropy(source_oom, sequence, F_YX, indexing, order=3, eps=1e-12):
    """
    Average negative log2-likelihood (bits per symbol) of `sequence` under a
    fixed-order lookup table `F_YX`.

    For every position ``t >= order`` the `order` preceding observations form
    the history. Their ``uid`` strings are concatenated into a key whose
    position in `indexing` is used as the FIRST index into `F_YX`; the position
    of the observation at ``t`` in ``source_oom.observables`` is used as the
    SECOND index. The looked-up value is treated as the probability of the next
    symbol given the history.

    NOTE(review): the cells that call this function normalise `F_YX` with
    ``F_YX.sum(axis=0)``, i.e. along the first axis, while the lookup here is
    ``F_YX[history, symbol]``. The recorded outputs of this notebook show
    H_q3 of roughly 10 bits for n=3, which is above log2(n); a correctly
    aligned in-sample conditional table should not exceed that bound.
    Confirm the orientation of `F_YX` and the ordering of `indexing` against
    `estimate_matrices_discrete_fixed` before relying on the result.

    Parameters
    ----------
    source_oom :
        Object exposing ``observables``, a list-like whose ``.index(symbol)``
        gives the second index into `F_YX`.
    sequence :
        Sliceable sequence of observations; each must have a string ``uid``.
    F_YX :
        2-D array indexed as ``F_YX[history_index, symbol_index]``.
    indexing :
        List of history keys (concatenated uids); ``indexing.index(key)`` gives
        the first index into `F_YX`.
    order : int, default 3
        Number of preceding observations that make up a history.
    eps : float, default 1e-12
        Value substituted for probabilities that are exactly zero so that the
        logarithm stays finite.

    Returns
    -------
    float
        Mean of ``-log2(prob)`` over all positions with a complete history.

    Raises
    ------
    ValueError
        If `order` is negative, if `sequence` has no position with a complete
        history (``len(sequence) <= order``), or if a history key / symbol is
        not found by ``list.index``.
    """
    if order < 0:
        raise ValueError(f"order must be non-negative, got {order}")

    n_terms = len(sequence) - order
    if n_terms <= 0:
        # The original code divided by zero here; fail with an explicit message instead.
        raise ValueError(
            f"sequence of length {len(sequence)} is too short for histories of length {order}"
        )

    total_nll = 0.0
    for t in range(order, len(sequence)):
        history, next_symbol = sequence[t - order:t], sequence[t]
        history_key = "".join(obs.uid for obs in history)
        history_idx = indexing.index(history_key)
        next_idx = source_oom.observables.index(next_symbol)

        prob = F_YX[history_idx, next_idx]
        if prob == 0:
            # Avoid log(0): unseen (history, symbol) pairs get a tiny floor probability.
            prob = eps

        total_nll += -np.log2(prob)

    # Average NLL in bits per symbol (cross-entropy estimate).
    return total_nll / n_terms

In [29]:
# Worst-case search, alphabet size n = 3.
# For each seed: build a sparse OOM, sample a sequence, estimate the 3rd-order table
# F_YX from that sequence and score the same sequence with it (H_q3). The seed with
# the largest gap H_q3 - H_f is kept per batch and its H_f is re-estimated on a
# much longer sample (H_f_long).
n = 3
d = 10
density = 0.4

N_BATCHES = 1              # repetitions of the full seed sweep
N_SEEDS = 50               # seeds 0 .. N_SEEDS-1 are tried in every batch
TRAIN_LENGTH = 100_000     # symbols generated per seed for estimation and scoring
LONG_LENGTH = 10_000_000   # symbols generated for the selected seed (H_f_long)

results_batches = []

for _ in range(N_BATCHES):
    results = []

    for seed in tqdm(range(N_SEEDS)):
        oom_initforseed = DiscreteValuedOOM.from_sparse(alphabet_size=n, dimension=d, density=density, seed=seed)
        generation = oom_initforseed.generate(TRAIN_LENGTH)

        estimate_matrices_3rdorder = estimate_matrices_discrete_fixed(
            sequence=generation.sequence,
            len_cwords=1,
            len_iwords=3,
            indexing=True
        )
        indexing = list(estimate_matrices_3rdorder[-1])
        F_YX_3rdorder = estimate_matrices_3rdorder[0][0]
        # NOTE(review): normalisation is along axis 0, whereas compute_cross_entropy
        # reads F_YX[history, symbol]. Recorded H_q3 values (~10 bits) exceed
        # log2(n) ~ 1.585, which an aligned in-sample conditional table should not
        # do - confirm the orientation of F_YX / ordering of `indexing`.
        F_YX_3rdorder /= F_YX_3rdorder.sum(axis=0)

        nll_3rdorder = compute_cross_entropy(oom_initforseed, generation.sequence, F_YX_3rdorder, indexing)
        H_f = generation.nll_list[-1]
        # Record the seed explicitly instead of recovering it from the row position,
        # so the bookkeeping stays correct if the seed range ever changes.
        result_thisseed = {
            'n': n,
            'd': d,
            'sparsity': 1 - density,
            'uniform': np.log2(n),
            'H_q3': nll_3rdorder,
            'H_f': H_f,
            'H_q3 - H_f': nll_3rdorder - H_f,
            'seed': seed,
        }
        results.append(result_thisseed)

    res_1 = pd.DataFrame.from_records(results)
    # Row with the largest gap; copy so that adding 'H_f_long' below writes to an
    # independent Series rather than to a possible view of res_1.
    results_thisbatch = res_1.sort_values('H_q3 - H_f', ascending=False).reset_index(drop=True).iloc[0].copy()

    results_thisbatch['H_f_long'] = DiscreteValuedOOM.from_sparse(
        alphabet_size=n, dimension=d, density=density,
        seed=int(results_thisbatch['seed'])  # the row is float-typed, so cast back
    ).generate(LONG_LENGTH).nll_list[-1]
    results_batches.append(results_thisbatch)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [07:07<00:00,  8.55s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [07:06<00:00,  8.53s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [07:06<00:00,  8.54s/it]


In [31]:
# One row per batch: the seed with the largest H_q3 - H_f gap, plus its H_f_long value.
pd.DataFrame(results_batches)

Unnamed: 0,n,d,sparsity,uniform,H_q3,H_f,H_q3 - H_f,seed,H_f_long
0,3.0,10.0,0.6,1.584963,10.87989,0.889746,9.990144,25.0,0.884897
0,3.0,10.0,0.6,1.584963,10.14393,0.886433,9.257497,25.0,0.884754
0,3.0,10.0,0.6,1.584963,10.240078,0.885592,9.354486,25.0,0.884945


In [32]:
# Worst-case search, alphabet size n = 5 (same procedure as the n = 3 cell above).
# For each seed: build a sparse OOM, sample a sequence, estimate the 3rd-order table
# F_YX from that sequence and score the same sequence with it (H_q3). The seed with
# the largest gap H_q3 - H_f is kept per batch and its H_f is re-estimated on a
# much longer sample (H_f_long).
n = 5
d = 10
density = 0.4

N_BATCHES = 1              # repetitions of the full seed sweep
N_SEEDS = 50               # seeds 0 .. N_SEEDS-1 are tried in every batch
TRAIN_LENGTH = 100_000     # symbols generated per seed for estimation and scoring
LONG_LENGTH = 10_000_000   # symbols generated for the selected seed (H_f_long)

results_batches = []

for _ in range(N_BATCHES):
    results = []

    for seed in tqdm(range(N_SEEDS)):
        oom_initforseed = DiscreteValuedOOM.from_sparse(alphabet_size=n, dimension=d, density=density, seed=seed)
        generation = oom_initforseed.generate(TRAIN_LENGTH)

        estimate_matrices_3rdorder = estimate_matrices_discrete_fixed(
            sequence=generation.sequence,
            len_cwords=1,
            len_iwords=3,
            indexing=True
        )
        indexing = list(estimate_matrices_3rdorder[-1])
        F_YX_3rdorder = estimate_matrices_3rdorder[0][0]
        # NOTE(review): normalisation is along axis 0, whereas compute_cross_entropy
        # reads F_YX[history, symbol]. The recorded H_q3 (~16 bits) exceeds
        # log2(n) ~ 2.32, which an aligned in-sample conditional table should not
        # do - confirm the orientation of F_YX / ordering of `indexing`.
        F_YX_3rdorder /= F_YX_3rdorder.sum(axis=0)

        nll_3rdorder = compute_cross_entropy(oom_initforseed, generation.sequence, F_YX_3rdorder, indexing)
        H_f = generation.nll_list[-1]
        # Record the seed explicitly instead of recovering it from the row position,
        # so the bookkeeping stays correct if the seed range ever changes.
        result_thisseed = {
            'n': n,
            'd': d,
            'sparsity': 1 - density,
            'uniform': np.log2(n),
            'H_q3': nll_3rdorder,
            'H_f': H_f,
            'H_q3 - H_f': nll_3rdorder - H_f,
            'seed': seed,
        }
        results.append(result_thisseed)

    res_1 = pd.DataFrame.from_records(results)
    # Row with the largest gap; copy so that adding 'H_f_long' below writes to an
    # independent Series rather than to a possible view of res_1.
    results_thisbatch = res_1.sort_values('H_q3 - H_f', ascending=False).reset_index(drop=True).iloc[0].copy()

    results_thisbatch['H_f_long'] = DiscreteValuedOOM.from_sparse(
        alphabet_size=n, dimension=d, density=density,
        seed=int(results_thisbatch['seed'])  # the row is float-typed, so cast back
    ).generate(LONG_LENGTH).nll_list[-1]
    results_batches.append(results_thisbatch)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [07:34<00:00,  9.09s/it]


KeyboardInterrupt: 

In [None]:
# Inspect the list of per-batch worst-case rows collected by the experiment cell above.
results_batches

In [33]:
# Inspect the worst-case row of the most recent batch. The recorded output below has
# no 'H_f_long' entry; the experiment cell appears to have been interrupted
# (KeyboardInterrupt) before that value was assigned.
results_thisbatch

n              5.000000
d             10.000000
sparsity       0.600000
uniform        2.321928
H_q3          16.137141
H_f            1.538206
H_q3 - H_f    14.598935
seed          46.000000
Name: 0, dtype: float64