In [1]:
import markovify
from tqdm import tqdm
from random import randint
from sklearn.model_selection import train_test_split
import evaluate

In [2]:
with open("./lotr.txt", "r", encoding='utf-8') as file:
    text = file.read()

data = markovify.Text(text).parsed_sentences

train, test = train_test_split(data, train_size=0.9)
train_text = " ".join([" ".join(sts) for sts in train])

print(f"Length of traingin text: {len(train_text)}")   
print(f"Sentences:\n    In train: {len(train)}\n    In test:  {len(test)}")

models = [None, None]
MIN_SIZE = 2
MAX_SIZE = 5

for size in tqdm(range(MIN_SIZE, MAX_SIZE + 1)):
    models.append(markovify.Text(train_text, state_size=size))


Length of traingin text: 1908131
Sentences:
    In train: 22460
    In test:  2496


100%|██████████| 4/4 [00:03<00:00,  1.19it/s]


In [16]:
def get_uniqness_measure(sentence, n=5): # number of different n-grams
    ngrams = [sentence[i:i+n] for i in range(len(sentence)-n+1)]
    return len(set(ngrams)) / len(ngrams)

In [20]:
MAX_GENERATIONS_ATTEMPTS = 10

results = [[], []]
references = [[], []]

for size in range(MIN_SIZE, MAX_SIZE + 1):
    print(f"====================== Chain size: {size} ======================")
    results.append([])
    references.append([])
    for ref in tqdm(test):
        if len(ref) < 2:
            continue
        for _ in range(MAX_GENERATIONS_ATTEMPTS):
            try:
                sentence = models[size].make_sentence_with_start(f"{ref[0]} {ref[1]}")
                if sentence:
                    results[size].append(sentence)
                    references[size].append(" ".join(ref))
                break
            except:
                continue
    
    scores = []

    for (ref, result) in tqdm(list(zip(references[size], results[size]))):
        scores.append(get_uniqness_measure(result) / get_uniqness_measure(ref))
    
    # mean 
    score = sum(scores) / len(scores)
    
    print(f"Successful generations: {len(results[-1])}")
    print(f"Uniqness score: {score}")
            



100%|██████████| 2496/2496 [00:11<00:00, 224.30it/s]
100%|██████████| 1635/1635 [00:00<00:00, 39279.93it/s]


Successful generations: 1635
Uniqness score: 1.0040521361246943


100%|██████████| 2496/2496 [00:16<00:00, 150.01it/s]
100%|██████████| 1483/1483 [00:00<00:00, 31102.16it/s]


Successful generations: 1483
Uniqness score: 0.9991766548010568


100%|██████████| 2496/2496 [00:45<00:00, 55.18it/s]
100%|██████████| 1071/1071 [00:00<00:00, 30908.16it/s]


Successful generations: 1071
Uniqness score: 0.9959202130340875


100%|██████████| 2496/2496 [01:04<00:00, 38.81it/s]
100%|██████████| 279/279 [00:00<00:00, 41766.39it/s]

Successful generations: 279
Uniqness score: 0.9939975485222575





In [22]:
model_size = 5

for result, reference in zip(results[model_size], references[model_size]):
    print("===================================================================================================")
    print(f"Reference: {reference}\nGenerated: {result}")
    print(f"Uniqness score: {get_uniqness_measure(result) / get_uniqness_measure(ref)}")
    print("measure result:", get_uniqness_measure(result))
    print("measure reference:", get_uniqness_measure(ref))

Reference: There was another burst of song, and then suddenly, hopping and dancing along the path, there appeared above the reeds an old battered hat with a tall crown and a long blue feather stuck in the band.
Generated: There was another smaller door on the other side of the hearth he heard Bilbo's voice speaking.
Uniqness score: 1.0024038461538463
measure result: 0.9230769230769231
measure reference: 0.920863309352518
Reference: When Sam got back he found the whole village roused.
Generated: When Sam awoke, he found that he was trembling. said Legolas.
Uniqness score: 1.0859375
measure result: 1.0
measure reference: 0.920863309352518
Reference: There was a wide arch leading to a courtyard between the two wings, and on the left under the arch there was a large doorway reached by a few broad steps.
Generated: There was a pale figure hurrying away in and out of the shadows on the other side of the mountains.
Uniqness score: 1.028782894736842
measure result: 0.9473684210526315
measure r