In [37]:
from tokenizers import Tokenizer, models, trainers
import collections


def unigram(tokens):
    model = collections.defaultdict(lambda: 0.01)
    for f in tokens:
        model[f] += 1
    N = float(sum(model.values()))
    for word in model:
        model[word] = model[word] / N
    return model

def perplexity(testset, model):
    perplexity = 1
    N = len(testset)
    for i in range(80):
        word = testset[i]
        perplexity = perplexity * (1/model[word])
    perplexity = pow(perplexity, 1/float(N))
    return perplexity

def main():

    tokenizer = Tokenizer(models.Unigram())
    trainer = trainers.UnigramTrainer(vocab_size=25000)
    tokenizer.train(files=["titoli_per_morfessor.txt"], trainer=trainer)

    print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    encoding = tokenizer.encode("Merhaba bu bir kelime")
    print("Encoded string: {}".format(encoding.tokens))

    decoded = tokenizer.decode(encoding.ids)
    print("Decoded string: {}".format(decoded))

    vocab = tokenizer.get_vocab()

    # Perplexity:
    # Tokenize training and test corpus:
    with open("titoli_per_morfessor.txt", encoding="utf-8") as f:
        train_contents = f.read()
    

    train_tokenized = tokenizer.encode(train_contents)
    test_tokenized = tokenizer.encode(test_contents)

    # Get unigram model to calculate perplexity
    model = unigram(train_tokenized.tokens)
    print('Perplexity:', perplexity(test_tokenized.tokens, model))

if __name__ == '__main__':
    main()



Trained vocab size: 25000
Encoded string: ['M', 'er', 'ha', 'ba', ' b', 'u', ' b', 'ir', ' ', 'ke', 'li', 'me']
Decoded string: M er ha ba  b u  b ir   ke li me


NameError: name 'test_contents' is not defined