# Importing Libraries

In [None]:
import numpy as np
import pandas as pd 
import os 
import re 
import string 
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

# Reading all Data

In [33]:
file_path = 'shakespeare.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    data_content = file.read()

# Split into paragraphs based on '----------'
paragraphs = data_content.split('----------')

def read_all_data(paragraphs):
    txt = []
    for paragraph in paragraphs:
        lines = paragraph.strip().split('\n')
        for line in lines:
            line = line.strip()
            if line != '':
                txt.append(line)
    return txt

text_file = read_all_data(paragraphs)
print("Number of lines:", len(text_file))

Number of lines: 105652


# Cleaning Data 

In [37]:
def clean_txt(txt):
    cleaned_txt = []
    for line in txt :
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~<>+=-\\]","",line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt += words
    return cleaned_txt

cleaned_text_file =  clean_txt(text_file)
print("number of words = ", len(cleaned_text_file)) 

number of words =  820589


# Generating Markov-Chain Model

In this section, we will see how markov chains are created.

A Markov model is a stochastic model used to predict the probability of a sequence of possible states based on the current state. It assumes that the future state only depends on the present state and not on the sequence of events that preceded it. This property is known as the Markov property. Markov models are widely used in various fields, including natural language processing, for tasks such as text generation, language modeling, and speech recognition.

### Key Concepts

- **State**: In the context of text, a state can be a sequence of words or characters.
- **Transition**: The movement from one state to another.
- **N-gram**: A contiguous sequence of `n` items from a given sample of text or speech. In this case, an n-gram represents a sequence of `n` words.

### Function: `making_markov_model`

The `making_markov_model` function constructs a Markov model from a list of cleaned text. The model is represented as a dictionary where each key is a current state (sequence of words), and the value is another dictionary that maps possible next states to their transition probabilities.

### Diagram of a Markov Model

Below is a visual representation of a Markov model for a better understanding of how states and transitions work.

<img src="MarkovRepresentation.png" alt="Markov Model Diagram" width="400"/>



In [47]:
def making_markov_model(cleaned_text_file, n_gram =2):
    markov_model = {}
    for i in range(len(cleaned_text_file)-n_gram-1):
        curr_state , next_state = "" , "" 
        for j in range(n_gram):
            curr_state += cleaned_text_file[i+j] + " "
            next_state += cleaned_text_file[i+j+n_gram] + " "
        curr_state = curr_state[: -1]
        next_state = next_state[: -1]

        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    #calculating state transition probabilities
    for curr_state , transition in markov_model.items():
        total = sum(transition.values())
        for state , count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model    

In [48]:
markov_model = making_markov_model(cleaned_text_file)

In [49]:
print("number of states = ", len(markov_model.keys()))

number of states =  328736


In [72]:
print("All transition probabilites from  word 'they' state: \n")
print(markov_model['should not'])

## Making Predictions

In [61]:
def generate_predictions(markov_model , limit = 100 , start ='there is'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story += curr_state +" "
    while n <limit:
        next_state = random.choices(list(markov_model[curr_state].keys()) , 
                                    list(markov_model[curr_state].values()))
        curr_state =next_state[0]
        story += curr_state+" "
        n += 1
    return story

In [73]:
for i in range(20):
    print(str(i)+" ", generate_predictions(markov_model, start = "there was" , limit = 8))

In [74]:
print(generate_predictions(markov_model , start= "once upon" , limit = 100))