In [11]:
pip install mchmm tokenizers datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.16.4->tokenizers)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0

In [None]:
import datasets
from itertools import islice

# Load the "train" split of the "wikitext-103-raw-v1" configuration of WikiText.
# `wikitext_iterator` reads this module-level `dataset`, so this line must run first.
dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
def wikitext_iterator(batch_size=1000, max_batches=1000):
    """Yield batches of the ``"text"`` column from the module-level ``dataset``.

    Args:
        batch_size: Number of rows requested per batch.
        max_batches: Maximum number of batches to yield. Defaults to 1000,
            the cap that used to be hard-coded here; pass ``None`` to iterate
            over the whole split.

    Yields:
        The ``"text"`` entry of each batch produced by ``dataset.iter``.
    """
    # Only keep the text column to avoid decoding the rest of the columns unnecessarily
    text_only = dataset.select_columns("text")
    # islice(..., None) imposes no limit, so max_batches=None streams everything.
    for batch in islice(text_only.iter(batch_size), max_batches):
        yield batch["text"]

In [4]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# BPE model with an explicit unknown token; no pre-tokenizer is configured here.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Trainer settings: pairs must occur at least 3 times, tokens are capped at 5 characters.
trainer = BpeTrainer(min_frequency=3, max_token_length=5, show_progress=True)
# Fit the vocabulary on the batches streamed by wikitext_iterator().
tokenizer.train_from_iterator(wikitext_iterator(), trainer=trainer)







In [41]:
def hmm_train_data_iter(dataiter, tokenizer, max_batch=10):
    """Stream tokens from the first ``max_batch`` batches of ``dataiter``.

    Each batch is an iterable of texts; every text is passed through
    ``tokenizer.encode`` and the resulting ``.tokens`` are yielded one by one,
    in order.
    """
    for batch in islice(dataiter, max_batch):
        for text in batch:
            yield from tokenizer.encode(text).tokens
def hmm_train_data(dataiter, tokenizer, max_length=1000, max_batch=10):
    """Return a list of at most ``max_length`` tokens.

    The tokens are taken, in order, from ``hmm_train_data_iter``, which reads
    at most ``max_batch`` batches from ``dataiter``.
    """
    token_stream = hmm_train_data_iter(dataiter, tokenizer, max_batch)
    capped_stream = islice(token_stream, max_length)
    return list(capped_stream)


In [37]:
# Peek at the first ten tokens; hmm_train_data already returns a list.
hmm_train_data(wikitext_iterator(), tokenizer, max_length=10)

[' = ', 'Val', 'kyri', 'a Ch', 'ron', 'ic', 'les ', 'III ', '= \n', ' S']

In [None]:
import mchmm as mc
# Build the training sequence: at most 4000 tokens taken from at most 100 batches.
mctrain = hmm_train_data(wikitext_iterator(), tokenizer, max_batch=100, max_length=4000)
# Fit a chain on the token sequence. Despite the variable name `hmm`, the object
# created here is an `mc.MarkovChain`.
hmm = mc.MarkovChain().from_data(mctrain)

In [50]:
# Simulate 100 steps; only the second element of the returned pair is used.
_, states = hmm.simulate(100)
# Concatenate the simulated states into one string for display.
"".join(states)

'the battlefield in Japanded to the battle Potentire squad 422 , also unvoiced text . The player orders . Ordern early unit " Calamity in Novements relating games , while : this escorted ment squad the battlefield map : once per echelons \' turns . Characters . After that he designer Raita Honjou , who seeks rejected in'

In [78]:
import string
import mchmm as mc
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

def run_hmm_train(ngram=1, min_frequency=2, max_batch=100, max_length=4000):
    """Train a BPE tokenizer on WikiText and fit a Markov chain on its tokens.

    Args:
        ngram: Passed to ``BpeTrainer`` as ``max_token_length``.
        min_frequency: Passed to ``BpeTrainer`` as ``min_frequency``.
        max_batch: Maximum number of batches read when building the chain's
            training sequence.
        max_length: Maximum number of tokens in the chain's training sequence.

    Returns:
        A ``(chain, tokenizer)`` pair: the fitted ``mc.MarkovChain`` and the
        trained ``Tokenizer``.
    """
    trainer = BpeTrainer(
        max_token_length=ngram,
        show_progress=True,
        min_frequency=min_frequency,
        # Seed the alphabet with every character of string.printable.
        initial_alphabet=list(string.printable),
    )
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.train_from_iterator(wikitext_iterator(), trainer=trainer)
    # A fresh iterator is needed here: the one above was consumed by training.
    mctrain = hmm_train_data(
        wikitext_iterator(), tokenizer, max_batch=max_batch, max_length=max_length
    )
    hmm = mc.MarkovChain().from_data(mctrain)
    return hmm, tokenizer

def hmm_generate(hmm, n=100):
    """Simulate ``n`` steps of ``hmm`` and return the states joined into one string.

    ``hmm.simulate(n)`` must return a pair; the second element is the sequence
    of state strings that gets concatenated.
    """
    simulated = hmm.simulate(n)
    _state_ids, state_labels = simulated
    return "".join(state_labels)

In [None]:
# Train one (chain, tokenizer) pair per maximum token length: 1, 3 and 5.
# NOTE: max_token_length does not seem to be respected, at least for ngram=1
# TODO: investigate; confirm after a fresh Restart & Run All, since the recorded
# outputs in this notebook come from out-of-order executions.
hmm_1, tokenizer_1 = run_hmm_train(ngram=1)
hmm_3, tokenizer_3 = run_hmm_train(ngram=3)
hmm_5, tokenizer_5 = run_hmm_train(ngram=5)












In [74]:
# Sample text (default n=100 states) from the chain trained with ngram=1.
hmm_generate(hmm_1)

"e was Comediffer withe Relel tletasof cked se 's theird batter rease plauppengameirser The platials arcor Regh at offic book @-@ precria seleto gameare syso a desioffer"

In [75]:
# Sample text (default n=100 states) from the chain trained with ngram=3.
hmm_generate(hmm_3)

"e Sect members Imperenals ' terson whifive gameplayers mation wher own ded at . \n Taking the pers servant th echele usignerson who withe playStake missiof th occurn Blits ple p"

In [76]:
# Sample text (default n=100 states) from the chain trained with ngram=5.
hmm_generate(hmm_5)

'the theme of the battlefield map : once to the same times place needed by both her hics and exemplified , developed unless was release , who is , concept Irving . The Nameless " , whose real individual @-@ specific was reture , and each characters cause certain heroinal played off to activate skills that role in the '

In [81]:
# Vocabulary size of the tokenizer trained with ngram=1.
tokenizer_1.get_vocab_size()

4928

In [67]:
# Show the states of the ngram=5 chain as a plain Python list.
# NOTE(review): this displays every state; consider slicing to keep the output short.
hmm_5.states.tolist()

['\n',
 ' ',
 ' , ',
 ' . ',
 ' . A',
 ' . G',
 ' = ',
 ' A',
 ' As ',
 ' Con',
 ' It ',
 ' Maj',
 ' O',
 ' Par',
 ' S',
 ' Sak',
 ' Tro',
 '" ',
 '" , ',
 '" Al',
 '" Di',
 '" M',
 '" N',
 '" re',
 "' ",
 "'n ",
 "'s ",
 "'s l",
 '( ',
 '( H',
 ') , ',
 ', ',
 ', Al',
 ', D',
 ', G',
 ', H',
 ', R',
 ', a ',
 ', ad',
 ', al',
 ', ex',
 ', h',
 ', im',
 ', s',
 ', su',
 ', wh',
 '. ',
 '. \n',
 '. " ',
 '. A',
 '. A ',
 '. Af',
 '. Al',
 '. Ch',
 '. D',
 '. E',
 '. Em',
 '. H',
 '. I',
 '. Li',
 '. N',
 '. O',
 '. Or',
 '. R',
 '. T',
 '. Th',
 '. W',
 '.G',
 '0 , ',
 '1 ',
 '13 ',
 '2 , ',
 '201',
 '2nd ',
 '3 ',
 '3 , ',
 '3 . ',
 '3 : ',
 '4 ',
 '4 . ',
 '42',
 '7 ',
 ': ',
 ': on',
 ': th',
 ': wh',
 '; ',
 '= ',
 '= \n',
 '= = ',
 '@-@ ',
 'AN',
 'AS ',
 'Abil',
 'Ace ',
 'Acti',
 'Af',
 'Ar',
 'Az',
 'B',
 'Batt',
 'Bli',
 'C',
 'Cal',
 'Ch',
 'Char',
 'Comm',
 'Crow',
 'D',
 'Dev',
 'E',
 'Emp',
 'Eng',
 'Euro',
 'For',
 'Fuji',
 'G',
 'Gam',
 'Hi',
 'Hon',
 'I . ',
 'II',
 'II 

In [62]:
# Number of states in the chain trained with ngram=1.
len(hmm_1.states)

316