# Markovify

This code is for training and testing the markov chain model for word recommendation

In [2]:
import random
import re

import markovify
import spacy

!python -m spacy download en_core_web_sm

2023-05-01 18:47:21.840622: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m272.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


Training the markov model based on our training dataset

In [7]:
file_name = "output"

with open(f"../data/{file_name}_train.txt") as f:
    text = f.read()

with open(f"../data/{file_name}_test.txt") as f:
    test_text = f.read().split("\n")

training_model = markovify.NewlineText(text, state_size=3)

Testing the initial markov model for correct prediction

In [8]:
correct = 0.0
total_runs = 5000
completed_runs = 0

for i in range(total_runs):
    rand = random.randint(0, len(test_text) - 180)
    test_sentence = test_text[rand].split()
    start_words = " ".join(test_sentence[:2])

    try:
        output = training_model.make_sentence_with_start(
            start_words, max_chars=180
        )
        if output.split()[2] == test_sentence[2]:
            correct += 1
            # print(f'Test words: {start_words}\nModel output: {output}')
        completed_runs += 1
    except:
        continue

print(f"Correct: {correct / total_runs}")
print(f"Completed runs: {(completed_runs - correct) / total_runs}")
print(f"Invalid runs: {(total_runs - completed_runs) / total_runs}")

Correct: 0.0222
Completed runs: 0.1512
Invalid runs: 0.8266


Now we'll test spaCy's POS tagging with markovify to see if it improves text predictions.

In [9]:
nlp = spacy.load("en_core_web_sm")


class POSifiedText(markovify.Text):
    def word_split(self, sentence):
        return ["::".join((word.orth_, word.pos_)) for word in nlp(sentence)]

    def word_join(self, words):
        sentence = " ".join(word.split("::")[0] for word in words)
        return sentence

    def sentence_split(self, text):
        return re.split(r"\s*\n\s*", text)

In [10]:
training_model_2 = POSifiedText(text, state_size=3)

KeyboardInterrupt: 

In [None]:
correct = 0.0
completed_runs = 0

for i in range(total_runs):
    rand = random.randint(0, len(test_text) - 180)
    test_sentence = test_text[rand].split()
    start_words = " ".join(test_sentence[:2])

    try:
        output = training_model_2.make_sentence_with_start(
            start_words, max_chars=180
        )
        if output.split()[2] == test_sentence[2]:
            correct += 1
            # print(f'Test words: {start_words}\nModel output: {output}')
        completed_runs += 1
    except:
        continue

print(f"Correct: {correct / total_runs}")
print(f"Completed runs: {(completed_runs - correct) / total_runs}")
print(f"Invalid runs: {(total_runs - completed_runs) / total_runs}")

Correct: 0.0204
Completed runs: 0.1606
Invalid runs: 0.819
