In [1]:
# Dependencies: pip install mchmm tokenizers datasets transformers ipywidgets

In [2]:
from itertools import islice
import datasets

# Training split of the raw WikiText-103 configuration; dataset_iterator()
# reads this module-level object.
dataset = datasets.load_dataset(
    "wikitext",
    "wikitext-103-raw-v1",
    split="train",
)

def dataset_iterator(batch_size=1000, max_batches=1000):
    """Yield the text column of the module-level ``dataset`` batch by batch.

    Args:
        batch_size: Batch size passed to ``dataset.iter``.
        max_batches: Maximum number of batches to yield. Defaults to 1000;
            pass ``None`` to iterate without a cap.

    Yields:
        The value stored under the ``"text"`` key of each batch.
    """
    text_only = dataset.select_columns("text")
    batches = text_only.iter(batch_size)
    # islice treats a stop value of None as "no limit".
    for batch in islice(batches, max_batches):
        yield batch["text"]

In [3]:
import string
import mchmm as mc
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from transformers import logging

# Turn on progress bars and INFO-level messages through the `logging`
# helper module imported from transformers above.
logging.enable_progress_bar()
logging.set_verbosity_info()

def hmm_train_data_iter(dataiter, tokenizer, max_batch=10):
    """Stream tokens from the first ``max_batch`` batches of ``dataiter``.

    Args:
        dataiter: Iterable of batches; each batch is an iterable of texts.
        tokenizer: Object whose ``encode(text)`` result exposes ``.tokens``.
        max_batch: Number of batches consumed from ``dataiter``.

    Yields:
        Tokens one at a time, in batch order, then text order, then token order.
    """
    for batch in islice(dataiter, max_batch):
        for text in batch:
            yield from tokenizer.encode(text).tokens
def hmm_train_data(dataiter, tokenizer, max_length=1000, max_batch=10):
    """Collect up to ``max_length`` tokens into a list.

    Tokens come from ``hmm_train_data_iter`` applied to ``dataiter`` and
    ``tokenizer`` with the given ``max_batch``; the stream is truncated
    after ``max_length`` tokens.
    """
    token_stream = hmm_train_data_iter(dataiter, tokenizer, max_batch)
    capped_stream = islice(token_stream, max_length)
    return list(capped_stream)

def hmm_ascii(max_batch=100, max_length=5000):
    """Fit a Markov chain on tokens from a printable-ASCII tokenizer.

    The tokenizer wraps a BPE model that is never trained here; its only
    explicitly added tokens are the characters of ``string.printable``.

    Args:
        max_batch: Forwarded to ``hmm_train_data``.
        max_length: Forwarded to ``hmm_train_data``.

    Returns:
        A ``(chain, tokenizer)`` tuple.
    """
    tokenizer = Tokenizer(BPE())
    # One added token per character of string.printable.
    tokenizer.add_tokens(list(string.printable))
    training_tokens = hmm_train_data(
        dataset_iterator(),
        tokenizer,
        max_length=max_length,
        max_batch=max_batch,
    )
    chain = mc.MarkovChain().from_data(training_tokens)
    return chain, tokenizer

def run_hmm_train(ngram=1, max_batch=100, max_length=5000):
    """Train a BPE tokenizer on the dataset, then fit a Markov chain on its tokens.

    Args:
        ngram: Passed to ``BpeTrainer`` as ``max_token_length``.
        max_batch: Forwarded to ``hmm_train_data``.
        max_length: Forwarded to ``hmm_train_data``.

    Returns:
        A ``(chain, tokenizer)`` tuple.
    """
    bpe_trainer = BpeTrainer(
        initial_alphabet=list(string.printable),
        max_token_length=ngram,
        min_frequency=2,
        show_progress=True,
    )
    tokenizer = Tokenizer(BPE())
    # First pass over the data trains the tokenizer; a fresh iterator is
    # created below to produce the chain's training tokens.
    tokenizer.train_from_iterator(dataset_iterator(), trainer=bpe_trainer)
    corpus_tokens = hmm_train_data(
        dataset_iterator(),
        tokenizer,
        max_length=max_length,
        max_batch=max_batch,
    )
    return mc.MarkovChain().from_data(corpus_tokens), tokenizer

def hmm_generate(hmm, n=100):
    """Simulate ``n`` steps of ``hmm`` and concatenate the resulting states.

    Args:
        hmm: Object whose ``simulate(n)`` returns a pair; the second item is
            the sequence of state strings.
        n: Value passed to ``simulate``.

    Returns:
        The simulated states joined into one string with no separator.
    """
    _state_ids, state_labels = hmm.simulate(n)
    return "".join(state_labels)

In [None]:
# The single-character chain is built with hmm_ascii() rather than
# run_hmm_train(ngram=1), presumably because max_token_length does not seem
# to be respected by the trainer, at least for ngram=1.
hmm_1, tokenizer_1 = hmm_ascii()
# Chain over tokens from a BPE tokenizer trained with max_token_length=5.
hmm_5, tokenizer_5 = run_hmm_train(ngram=5)

In [5]:
# Vocabulary size of the tokenizer returned by hmm_ascii().
tokenizer_1.get_vocab_size()

100

In [6]:
# Vocabulary size of the tokenizer returned by run_hmm_train(ngram=5).
tokenizer_5.get_vocab_size()

30000

In [7]:
# Sample text from the hmm_ascii() chain (hmm_generate defaults to n=100).
hmm_generate(hmm_1)

' find e outh 4 thropelky Vinth geriond n blthe fin anlo tache , wotid . thiraneniedelked . pldicanec'

In [8]:
# Sample text from the run_hmm_train(ngram=5) chain (hmm_generate defaults to n=100).
hmm_generate(hmm_5)

"the Nameless are No.1 in that he deserent Valkyire , the first game some invaddition , attached . Multiple turn , but a fan rest enemy forces , along with returned to Imperform . In its predecessorporatonality in orded the public was faced by May 'n on the characters released of the battle system second overall "