In [1]:
import markovify
from tqdm import tqdm
from random import randint
from sklearn.model_selection import train_test_split
import evaluate

In [2]:
with open("./lotr.txt", "r", encoding='utf-8') as file:
    text = file.read()

data = markovify.Text(text).parsed_sentences

train, test = train_test_split(data, train_size=0.9)
train_text = " ".join([" ".join(sts) for sts in train])

print(f"Length of traingin text: {len(train_text)}")   
print(f"Sentences:\n    In train: {len(train)}\n    In test:  {len(test)}")

models = [None, None]
MIN_SIZE = 2
MAX_SIZE = 5

for size in tqdm(range(MIN_SIZE, MAX_SIZE + 1)):
    models.append(markovify.Text(train_text, state_size=size))


Length of traingin text: 1904289
Sentences:
    In train: 22460
    In test:  2496


100%|██████████| 4/4 [00:03<00:00,  1.31it/s]


In [3]:
# import sentence transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the BERT model. Various models trained on Natural Language Inference (NLI)

model = SentenceTransformer('bert-base-nli-mean-tokens')

def get_similarity(text1, text2):
    embeddings1 = model.encode(text1, convert_to_tensor=True).cpu().reshape(1, -1)
    embeddings2 = model.encode(text2, convert_to_tensor=True).cpu().reshape(1, -1)
    cosine_scores = cosine_similarity(embeddings1, embeddings2)
    return cosine_scores[0][0]

# example

print(get_similarity("I like apples", "I do not like apples"))

0.47659826


In [13]:
MAX_GENERATIONS_ATTEMPTS = 10

results = [[], []]
references = [[], []]

number_of_words_as_start = 3

for size in range(MIN_SIZE, MAX_SIZE + 1):
    print(f"====================== Chain size: {size} ======================")
    results.append([])
    references.append([])
    for ref in tqdm(test):
        if len(ref) < number_of_words_as_start:
            continue
        for _ in range(MAX_GENERATIONS_ATTEMPTS):
            try:
                sentence = models[size].make_sentence_with_start(" ".join(ref[:number_of_words_as_start]))
                if sentence:
                    results[size].append(sentence)
                    references[size].append(" ".join(ref))
                break
            except:
                continue
    
    scores = []

    for (ref, result) in tqdm(list(zip(references[size], results[size]))):
        scores.append(get_similarity(ref, result))
    
    # mean 
    score = sum(scores) / len(scores)
    
    print(f"Successful generations: {len(results[-1])}")
    print(f"Similarity score: {score}")
            



100%|██████████| 2496/2496 [00:10<00:00, 228.26it/s]
100%|██████████| 1631/1631 [00:44<00:00, 36.53it/s]


Successful generations: 1631
Similarity score: 0.39108193433169186


100%|██████████| 2496/2496 [00:17<00:00, 138.97it/s]
100%|██████████| 1451/1451 [01:14<00:00, 19.42it/s]


Successful generations: 1451
Similarity score: 0.38866925362669585


100%|██████████| 2496/2496 [00:47<00:00, 52.79it/s]
100%|██████████| 1005/1005 [01:01<00:00, 16.29it/s]


Successful generations: 1005
Similarity score: 0.37243771057260866


100%|██████████| 2496/2496 [01:06<00:00, 37.26it/s]
100%|██████████| 283/283 [00:13<00:00, 20.30it/s]

Successful generations: 283
Similarity score: 0.3577421232399797





In [14]:
model_size = 5

for result, reference in zip(results[model_size], references[model_size]):
    print("===================================================================================================")
    print(f"Reference: {reference}\nGenerated: {result}")
    print(f"Similarity score: {get_similarity(reference, result)}")

Reference: But as for throwing it away, that was obviously wrong.
Generated: But as they drew near to the borders of the Road.
Similarity score: 0.31715232133865356
Reference: I have no doubt that Smjagol's grandmother was a matriarch, a great ####-person in her way, but to talk of her possessing many Elven-rings was absurd, and as for giving them away, it was a lie.
Generated: I have thought of a nice ending for it: and he lived happily ever afterwards to the end of his days.
Similarity score: 0.27375468611717224
Reference: He had a good deal to think about.
Generated: He had a long white beard and bushy eyebrows that stuck out further than the brim of his shady hat.
Similarity score: 0.3647698163986206
Reference: They were less than a day's ride from the out-walls of Minas Tirith that encircled the townlands.
Generated: They were sick and weary, and they could not go much further, if the snow continued.
Similarity score: 0.39012575149536133
Reference: For a moment Faramir's restraint