# Generating text with a Markov chain model built from a text corpus

In [2]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import os
import re
import string
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
#nltk.download('punkt_tab')
#nltk.download('gutenberg')

## Loading books from Project Gutenberg (or any other source)

In [4]:
# Three Jane Austen novels read from NLTK's Gutenberg corpus as word sequences.
text1 = gutenberg.words('austen-persuasion.txt')
text2 = gutenberg.words('austen-emma.txt')
text3 = gutenberg.words('austen-sense.txt')

# Extra books that are not available through nltk.corpus.gutenberg.words.
# Save each one as a local .txt file, read it in full, and split it with
# word_tokenize so it has the same list-of-words shape as the texts above
# before everything is passed to the Tokenize function below.

# Jane Austen's "Northanger Abbey"
with open("bookJA.txt", encoding="utf-8") as file:
    texttemp1 = file.read()
text4 = word_tokenize(texttemp1)

# Jane Austen's "Mansfield Park"
with open("bookJA1.txt", encoding="utf-8") as file:
    texttemp2 = file.read()
text5 = word_tokenize(texttemp2)

# Jane Austen's "Love and Freindship"
with open("bookJA2.txt", encoding="utf-8") as file:
    texttemp3 = file.read()
text6 = word_tokenize(texttemp3)

## Parameters of the model

In [6]:
n = 3    # Number of words per n-gram (one n-gram is one state of the chain)
L = 10   # Minimum number of n-grams to generate per sentence
M = 100  # Maximum number of n-grams to generate per sentence
punctuation_marks = {"!", "?", "."}  # Marks that end a sentence

## Main Tokenizing function

In [34]:
def Tokenize(txt):
    """Lowercase, clean and re-tokenize a text.

    Parameters
    ----------
    txt : str or iterable of str
        Either the raw text as one string, or a sequence of word tokens
        (for example the output of ``word_tokenize``). A sequence is joined
        with single spaces before cleaning.

    Returns
    -------
    list of str
        Lowercase tokens that are purely alphabetic, plus the sentence-ending
        marks ".", "!" and "?" as separate tokens.
    """
    # A plain string is used as-is: joining it would put a space between
    # every character and destroy the words.
    if not isinstance(txt, str):
        txt = " ".join(txt)

    txt = txt.lower()
    # Strip symbols and in-sentence punctuation; ".", "!" and "?" are kept on
    # purpose so that sentence boundaries survive the cleaning.
    txt = re.sub(r"[,\"\'@#$%^&*(){}/;`~:<>+=\-\\]", "", txt)

    # Set membership, not a substring test against ".!?": a substring test
    # would also accept multi-character tokens such as "!?".
    sentence_enders = {".", "!", "?"}
    return [
        word
        for word in word_tokenize(txt)
        if word.isalpha() or word in sentence_enders
    ]

# Flatten the six books into one plain list of words, then clean that list
# with Tokenize to get the training tokens.
combined_text = [*text1, *text2, *text3, *text4, *text5, *text6]
Tokens = Tokenize(combined_text)

True

## Defining the model

In [10]:
class MarkovModel:
    """Markov chain whose states are n-grams (``n_gram`` consecutive tokens).

    Each state is the space-joined string of ``n_gram`` tokens. A transition
    goes from one n-gram to the n-gram made of the ``n_gram`` tokens that
    immediately follow it, so the two never overlap.
    """

    def __init__(self, n_gram):
        """Create an empty model.

        Parameters
        ----------
        n_gram : int
            Number of tokens per state; must be at least 1.

        Raises
        ------
        ValueError
            If ``n_gram`` is smaller than 1.
        """
        if n_gram < 1:
            raise ValueError("n_gram must be a positive integer")
        self.n_gram = n_gram
        # state -> {next_state: probability}; this is what get_model() returns
        self.markov_model = {}
        # state -> {next_state: count}. Kept apart from the probabilities so
        # that build_model can be called again without adding new counts on
        # top of already-normalised values.
        self._transition_counts = {}

    def build_model(self, text):
        """Count n-gram transitions in ``text`` and refresh the probabilities.

        May be called several times; counts accumulate across calls and the
        probabilities are recomputed from the accumulated counts.

        Parameters
        ----------
        text : sequence of str
            Tokens in reading order. A text shorter than ``2 * n_gram``
            tokens contributes no transitions.
        """
        size = self.n_gram
        for i in range(len(text) - 2 * size + 1):
            curr_state = " ".join(text[i:i + size])
            next_state = " ".join(text[i + size:i + 2 * size])
            followers = self._transition_counts.setdefault(curr_state, {})
            followers[next_state] = followers.get(next_state, 0) + 1

        # Turn counts into transition probabilities. The outer dict is updated
        # in place so references handed out by get_model() stay valid.
        for curr_state, followers in self._transition_counts.items():
            total = sum(followers.values())
            self.markov_model[curr_state] = {
                state: count / total for state, count in followers.items()
            }

    def get_model(self):
        """Return the ``{state: {next_state: probability}}`` dictionary."""
        return self.markov_model

## Implementing the model on our tokens

In [32]:
# Train an n-gram Markov model on the cleaned tokens, then show how many
# distinct states it has and the first 50 of them.
markov = MarkovModel(n_gram=n)
markov.build_model(text=Tokens)
print("number of states = ", len(markov.get_model()))
print(list(markov.get_model())[:50])

number of states =  464868
['persuasion by jane', 'by jane austen', 'jane austen chapter', 'austen chapter sir', 'chapter sir walter', 'sir walter elliot', 'walter elliot of', 'elliot of kellynch', 'of kellynch hall', 'kellynch hall in', 'hall in somersetshire', 'in somersetshire was', 'somersetshire was a', 'was a man', 'a man who', 'man who for', 'who for his', 'for his own', 'his own amusement', 'own amusement never', 'amusement never took', 'never took up', 'took up any', 'up any book', 'any book but', 'book but the', 'but the baronetage', 'the baronetage there', 'baronetage there he', 'there he found', 'he found occupation', 'found occupation for', 'occupation for an', 'for an idle', 'an idle hour', 'idle hour and', 'hour and consolation', 'and consolation in', 'consolation in a', 'in a distressed', 'a distressed one', 'distressed one there', 'one there his', 'there his faculties', 'his faculties were', 'faculties were roused', 'were roused into', 'roused into admiration', 'into a

False

In [13]:
def generate_sentences(markov, start, min_length, max_length):
    """Generate one sentence by a random walk over a Markov model.

    The walk first takes ``min_length`` transitions unconditionally. It then
    continues until a sampled state contains ".", "?" or "!", or until
    ``max_length`` transitions have been taken in total, or until it reaches
    a state with no successors.

    Parameters
    ----------
    markov : dict
        Mapping ``{state: {next_state: probability}}``.
    start : str
        State to start from; it becomes the beginning of the sentence.
    min_length : int
        Number of transitions taken before punctuation may end the sentence.
    max_length : int
        Upper bound on the total number of transitions.

    Returns
    -------
    str
        The generated text. It is cut right after the first sentence-ending
        mark found once ``min_length`` is reached; otherwise it ends with a
        trailing space. If ``start`` is not a state of the model, or the walk
        reaches a dead end before ``min_length`` transitions, the message
        "N-gram has the wrong size or has no next state" is returned instead.
    """
    error_message = "N-gram has the wrong size or has no next state"
    punctuation_marks = {".", "?", "!"}

    def pick_next(state):
        """Sample a successor of ``state``; None if it has no successors."""
        transitions = markov.get(state)
        if not transitions:
            return None
        return random.choices(
            list(transitions.keys()), list(transitions.values())
        )[0]

    # Validate the start state up front so that min_length == 0 cannot reach
    # the second loop with an unknown state.
    if not markov.get(start):
        return error_message

    steps = 0
    curr_state = start
    story = curr_state + " "

    # Phase 1: always take min_length transitions, ignoring punctuation.
    while steps < min_length:
        next_state = pick_next(curr_state)
        if next_state is None:
            return error_message
        curr_state = next_state
        story += curr_state + " "
        steps += 1

    # Phase 2: keep going until a sentence-ending mark or max_length.
    while steps < max_length:
        next_state = pick_next(curr_state)
        if next_state is None:
            # Dead end after the minimum length: keep what has been generated
            # instead of failing with a KeyError.
            break
        for i, char in enumerate(next_state):
            if char in punctuation_marks:
                # Cut the state right after the mark and finish the sentence.
                return story + next_state[:i + 1]
        curr_state = next_state
        story += curr_state + " "
        steps += 1

    return story

In [36]:
# Print ten numbered sentences, all started from the same opening n-gram
for i in range(10):
    print(
        f"{i}. ",
        generate_sentences(markov.get_model(), start='his faculties were', min_length=L, max_length=M),
    )

0.  his faculties were roused into admiration and respect by contemplating the limited remnant of the earliest patents there any unwelcome sensations arising from domestic affairs changed naturally into pity and contempt as he turned over the almost endless creations of the last epistolary uses she could put them to .
1.  his faculties were roused into admiration and respect by contemplating the limited remnant of the earliest patents there any unwelcome sensations arising from domestic affairs changed naturally into pity and contempt as he turned over the almost endless creations of the last fortnight and the present situation of matters at mansfield were known to him .
2.  his faculties were roused into admiration and respect by contemplating the limited remnant of the earliest patents there any unwelcome sensations arising from domestic affairs changed naturally into pity and contempt as he turned over the almost endless creations of the last who would wish to represent it on the st