In [40]:
import os
import re
import nltk
import string
import random
import numpy as np
import scipy.linalg
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/franciscofurey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:
# State representation
def get_states() -> dict:
    """Build the lookup table from integer state index to food name.

    Returns:
        dict: ``{0: "Burger", 1: "Pizza", 2: "Hotdog"}``. A new dict is
        created on every call.
    """
    # Position in this tuple is the state index.
    names = ("Burger", "Pizza", "Hotdog")
    return dict(enumerate(names))

# Transition Matrix
def get_transition_matrix() -> np.ndarray:
    """Build the 3x3 transition matrix A of the food Markov chain.

    Returns:
        np.ndarray: Matrix whose row ``i`` holds the probabilities of moving
        from state ``i`` to states 0, 1 and 2 (indices as in ``get_states``).
    """
    from_burger = [0.2, 0.6, 0.2]
    from_pizza = [0.3, 0.0, 0.7]
    from_hotdog = [0.5, 0.0, 0.5]
    return np.array([from_burger, from_pizza, from_hotdog])


In [42]:
def random_walk(start_state: int, steps: int) -> None:
    """Print a random walk over the Markov chain.

    The walk is printed as ``<state> ---> <state> ---> ... stop``.

    Args:
        start_state: Index of the state the walk starts in.
        steps: Number of states to print, counting the start state. For
            values below 1 only the start state is printed.

    Returns:
        None. The walk is written to stdout.
    """
    A = get_transition_matrix()
    state = get_states()
    curr_state = start_state
    print(state[curr_state], "--->", end=" ")

    # range() is empty for steps <= 1, so the loop always terminates; a
    # `while steps - 1` countdown would spin forever for steps <= 0.
    for _ in range(steps - 1):
        # Sample the next state index from the current state's row of A; the
        # number of states is taken from the matrix instead of hard-coded.
        curr_state = np.random.choice(len(A), p=A[curr_state])
        print(state[curr_state], "--->", end=" ")
    print("stop")


In [43]:
def monte_carlo(start_state: int, steps: int) -> None:
    """Estimate steady state probabilities using the Monte Carlo approach.

    Simulates ``steps`` transitions of the chain, counts how often each state
    is visited, and prints the visit frequencies as the estimate of π.

    Args:
        start_state: Index of the state the simulation starts in.
        steps: Number of transitions to simulate.

    Returns:
        None. The estimate is written to stdout.
    """
    A = get_transition_matrix()
    curr_state = start_state
    # One counter per state, sized from the matrix rather than hard-coded.
    visits = np.zeros(len(A), dtype=int)
    visits[start_state] = 1

    for _ in range(steps):
        curr_state = np.random.choice(len(A), p=A[curr_state])
        visits[curr_state] += 1

    # steps + 1 visits were recorded (the start state counts too), so divide
    # by the actual total: the estimate then sums to 1 and steps == 0 is safe.
    print("π = ", visits / visits.sum())


In [44]:
def repeated_matrix_multiplication(steps: int) -> None:
    """Approximate the steady state by multiplying the transition matrix by itself.

    Starts from A and right-multiplies by A ``steps`` times, so the printed
    matrix is A raised to the power ``steps + 1``. Its first row is printed
    as the estimate of π.

    Args:
        steps: Number of multiplications by A to perform.

    Returns:
        None. The matrix and its first row are written to stdout.
    """
    A = get_transition_matrix()
    power = A

    for _ in range(steps):
        power = power @ A

    first_row = power[0]
    print("A^n = \n", power, "\n")
    print("π = ", first_row)


In [45]:
def find_left_eigen_vectors() -> list:
    """Find the left eigenvectors of A and derive the steady state from them.

    Prints every left eigenvector and eigenvalue, then normalizes the
    eigenvector belonging to the eigenvalue closest to 1 so that its entries
    sum to 1.

    Returns:
        list: The normalized steady state probabilities (real parts), one
        entry per state.
    """
    A = get_transition_matrix()
    values, left = scipy.linalg.eig(A, right=False, left=True)

    print("left eigen vectors = \n", left, "\n")
    print("eigen values = \n", values)

    # The stationary distribution is the left eigenvector for eigenvalue 1.
    # The order in which eig() returns eigenvalues is not guaranteed, so find
    # that eigenvalue explicitly instead of assuming it is column 0.
    stationary_idx = int(np.argmin(np.abs(values - 1)))
    pi = left[:, stationary_idx]

    # Dividing by the sum rescales the vector to a probability distribution
    # and cancels any common sign or complex phase.
    total = np.sum(pi)
    pi_normalized = [(x / total).real for x in pi]
    print("Normalized π = ", pi_normalized)
    return pi_normalized


In [46]:
def find_prob(seq: list, A: np.ndarray, pi: list) -> float:
    """Compute the probability of observing a given sequence of states.

    The result is the initial probability of the first state multiplied by
    the transition probability of every consecutive pair in the sequence.

    Args:
        seq: State indices in the order they are visited.
        A: Transition matrix, indexed as ``A[from_state][to_state]``.
        pi: Probability of starting in each state, indexed by state.

    Returns:
        float: Probability of the whole sequence.
    """
    likelihood = pi[seq[0]]
    # Pair each state with its successor: (s0, s1), (s1, s2), ...
    for src, dst in zip(seq, seq[1:]):
        likelihood *= A[src][dst]
    return likelihood


In [47]:
# Initial setup: the index -> name mapping and the transition matrix A.
# `states` is not read again in this cell; `transition_matrix` feeds find_prob below.
states = get_states()
transition_matrix = get_transition_matrix()

# Perform a random walk starting from state 0 (the printed path is sampled with np.random)
random_walk(start_state=0, steps=15)

# Monte Carlo approach: estimate π from 10**6 simulated transitions
monte_carlo(start_state=0, steps=10**6)

# Repeated matrix multiplication: multiply by A 10**3 times and print the first row as π
repeated_matrix_multiplication(steps=10**3)

# Find left eigen vectors and their normalized steady state probabilities
pi_normalized = find_left_eigen_vectors()

# Calculate probability for a sequence: state indices 1 -> 2 -> 2 -> 0,
# using the eigenvector-based π as the probability of the first state
sequence_prob = find_prob(seq=[1, 2, 2, 0], A=transition_matrix, pi=pi_normalized)
print("Sequence Probability: ", sequence_prob)


Burger ---> Pizza ---> Hotdog ---> Hotdog ---> Burger ---> Pizza ---> Burger ---> Pizza ---> Hotdog ---> Hotdog ---> Burger ---> Pizza ---> Burger ---> Pizza ---> Burger ---> stop
π =  [0.351911 0.210957 0.437133]
A^n = 
 [[0.35211268 0.21126761 0.43661972]
 [0.35211268 0.21126761 0.43661972]
 [0.35211268 0.21126761 0.43661972]] 

π =  [0.35211268 0.21126761 0.43661972]
left eigen vectors = 
 [[-0.58746336+0.j         -0.16984156-0.35355339j -0.16984156+0.35355339j]
 [-0.35247801+0.j          0.67936622+0.j          0.67936622-0.j        ]
 [-0.72845456+0.j         -0.50952467+0.35355339j -0.50952467-0.35355339j]] 

eigen values = 
 [ 1.  +0.j        -0.15+0.3122499j -0.15-0.3122499j]
Normalized π =  [0.3521126760563379, 0.2112676056338029, 0.43661971830985913]
Sequence Probability:  0.036971830985915506


In [3]:
# The statespace
# NOTE(review): this list is ordered Sleep, Icecream, Run, whereas the rows and
# columns of transitionName / transitionMatrix below follow Sleep, Run, Icecream.
states = ["Sleep","Icecream","Run"]

# Possible sequences of events. Each code is <from><to>, using the initials
# S (Sleep), R (Run) and I (Icecream); row i lists the moves out of one state.
transitionName = [["SS","SR","SI"],["RS","RR","RI"],["IS","IR","II"]]

# Probabilities matrix (transition matrix): entry [i][j] is the probability of
# the move named transitionName[i][j].
transitionMatrix = [[0.2,0.6,0.2],[0.1,0.6,0.3],[0.2,0.7,0.1]]

In [4]:
# Sanity check: every row of a transition matrix must sum to 1.
# Each row is checked on its own, so an error in one row cannot be masked by
# another. A small tolerance is used because floating-point sums are not
# guaranteed to equal 1 exactly.
if any(abs(sum(row) - 1.0) > 1e-9 for row in transitionMatrix):
    print("Somewhere, something went wrong. Transition matrix, perhaps?")
else:
    print("All is gonna be okay, you should move on!! ;)")

All is gonna be okay, you should move on!! ;)


In [5]:
# A function that implements the Markov model to forecast the state/mood. 
def activity_forecast(days):
    """Simulate the activity Markov chain for ``days`` transitions, starting from "Sleep".

    Each day the next activity is sampled from the row of ``transitionMatrix``
    that belongs to the current activity. The visited activities, the final
    activity and the probability of the sampled path are printed.

    Parameters:
        days (int): Number of transitions to simulate. Zero or a negative
            value simulates nothing and reports only the start state.

    Returns:
        None. Results are written to stdout.
    """
    # Row/column order shared by transitionName and transitionMatrix. The
    # module-level `states` list is ordered differently, so it is not used here.
    state_order = ["Sleep", "Run", "Icecream"]

    # Choose the starting state
    activityToday = "Sleep"
    print("Start state: " + activityToday)
    # Stores the sequence of states taken; only the starting state for now.
    activityList = [activityToday]
    # Probability of the sampled activityList
    prob = 1

    # A bounded loop terminates for any integer `days`, including negatives.
    for _ in range(days):
        row = state_order.index(activityToday)
        change = np.random.choice(transitionName[row], replace=True, p=transitionMatrix[row])
        col = transitionName[row].index(change)
        # Take the probability from the matrix the move was sampled with, so
        # the reported path probability cannot drift from the model.
        prob = prob * transitionMatrix[row][col]
        activityToday = state_order[col]
        activityList.append(activityToday)

    print("Possible states: " + str(activityList))
    print("End state after "+ str(days) + " days: " + activityToday)
    print("Probability of the possible sequence of states: " + str(prob))

# Run the forecast for 2 days, i.e. two sampled transitions from the "Sleep" start state
activity_forecast(2)

Start state: Sleep
Possible states: ['Sleep', 'Run', 'Icecream']
End state after 2 days: Icecream
Probability of the possible sequence of states: 0.18


# Sherlock Holmes Example using Markov Chains

In [9]:
import os
import re
import nltk
import string
import random
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/franciscofurey/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
# Directory containing the story text files, given relative to the notebook's
# working directory. It is passed to read_all_stories below.
story_path = './data/sherlock/sherlock/'

def read_all_stories(story_path: str) -> list:
    """
    Reads every file found under a directory tree into one list of lines.

    Each file is read line by line. Lines are stripped of surrounding
    whitespace, empty lines are skipped, and reading of a file stops at the
    first line that is exactly '----------'.

    Parameters:
        story_path (str): The path to the directory containing story files.
            Sub-directories are searched as well.

    Returns:
        list: The non-empty, stripped lines of all files, in sorted traversal
              order, excluding the delimiter line and everything after it.
    """
    lines = []
    for dirpath, dirnames, files in os.walk(story_path):
        # Sorting in place fixes the order in which os.walk descends, so the
        # resulting line order is the same on every run and platform.
        dirnames.sort()
        for file in sorted(files):
            # Join with dirpath, the directory currently being walked; joining
            # with story_path would break for files inside sub-directories.
            with open(os.path.join(dirpath, file)) as f:
                for line in f:
                    line = line.strip()
                    if line == '----------':
                        break
                    if line:
                        lines.append(line)
    return lines
        
# Load the lines of every story file and report how many were kept.
stories = read_all_stories(story_path=story_path)
print("Number of lines = ", len(stories))

def clean_txt(txt: list) -> list:
    """
    Turns raw text lines into one flat list of lowercase alphabetic words.

    Every line is lowercased, stripped of the punctuation matched by the
    pattern below, tokenized with word_tokenize, and filtered down to the
    tokens made of letters only.

    Parameters:
        txt (list): A list of text lines to be cleaned.

    Returns:
        list: The cleaned words of all lines, in their original order.
    """
    # NOTE(review): inside this character class `=-\\` is a range from '=' to
    # '\', so a literal '-' is not removed here. Tokens that still contain a
    # hyphen are dropped by the isalpha() filter below.
    punctuation = r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\\[\]]"

    cleaned_words = []
    for raw_line in txt:
        stripped = re.sub(punctuation, "", raw_line.lower())
        # Keep purely alphabetic tokens only (drops numbers and leftovers).
        cleaned_words.extend(
            token for token in word_tokenize(stripped) if token.isalpha()
        )
    return cleaned_words

# Flatten all story lines into one list of cleaned words and report its size.
cleaned_stories = clean_txt(txt=stories)
print("Number of words = ", len(cleaned_stories))


Number of lines =  215021
Number of words =  2332247


In [22]:
def make_markov_model(cleaned_stories: list, n_gram: int = 2) -> dict:
    """
    Constructs a Markov model from a list of words.

    A state is ``n_gram`` consecutive words joined by single spaces. For every
    position in the word list, the state starting there is linked to the state
    made of the following ``n_gram`` words. Transition counts are then
    converted to probabilities.

    Parameters:
        cleaned_stories (list): A list of cleaned and tokenized words from stories.
        n_gram (int): The number of words in the state used for the Markov model.

    Returns:
        dict: A dictionary representing the Markov model where keys are current states
              and values are dictionaries of next possible states and their probabilities.
              Empty when the input has fewer than ``2 * n_gram`` words.

    Raises:
        ValueError: If ``n_gram`` is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError("n_gram must be a positive integer")

    transition_counts = {}

    # A window needs 2 * n_gram words (current state + next state), so the
    # last valid start index is len - 2 * n_gram. A bound of len - n_gram - 1
    # matches this only when n_gram == 2: it overruns the list for larger
    # values and drops the final transition for n_gram == 1.
    last_start = len(cleaned_stories) - 2 * n_gram
    for i in range(last_start + 1):
        curr_state = " ".join(cleaned_stories[i:i + n_gram])
        next_state = " ".join(cleaned_stories[i + n_gram:i + 2 * n_gram])
        followers = transition_counts.setdefault(curr_state, {})
        followers[next_state] = followers.get(next_state, 0) + 1

    # Convert the counts of each state into transition probabilities.
    markov_model = {}
    for curr_state, followers in transition_counts.items():
        total = sum(followers.values())
        markov_model[curr_state] = {
            state: count / total for state, count in followers.items()
        }

    return markov_model

# Example usage: build the model with the default n_gram (two-word states),
# then inspect the transitions recorded for one state.
markov_model = make_markov_model(cleaned_stories=cleaned_stories)
print("Number of states = ", len(markov_model.keys()))
print("All possible transitions from 'the game' state: \n")
# Guard the lookup so a corpus without this state prints a message instead of raising KeyError.
if 'the game' in markov_model:
    print(markov_model['the game'])
else:
    print("The state 'the game' does not exist in the model.")

Number of states =  208717
All possible transitions from 'the game' state: 

{'is up': 0.06306306306306306, 'is and': 0.036036036036036036, 'was afoot': 0.036036036036036036, 'for the': 0.036036036036036036, 'was whist': 0.036036036036036036, 'would have': 0.036036036036036036, 'in their': 0.036036036036036036, 'was up': 0.09009009009009009, 'in that': 0.036036036036036036, 'the lack': 0.036036036036036036, 'for all': 0.06306306306306306, 'is afoot': 0.036036036036036036, 'was in': 0.02702702702702703, 'is hardly': 0.02702702702702703, 'may wander': 0.02702702702702703, 'now a': 0.02702702702702703, 'my own': 0.02702702702702703, 'at any': 0.02702702702702703, 'mr holmes': 0.02702702702702703, 'ay whats': 0.02702702702702703, 'my friend': 0.02702702702702703, 'fairly by': 0.02702702702702703, 'is not': 0.02702702702702703, 'was not': 0.02702702702702703, 'worth it': 0.02702702702702703, 'you are': 0.02702702702702703, 'i am': 0.02702702702702703, 'now count': 0.02702702702702703, 'your

In [32]:
def generate_story(markov_model: dict, limit: int = 100, start: str = 'my god') -> str:
    """
    Generates text by walking a Markov model from a starting state.

    Parameters:
        markov_model (dict): Maps each state to a dict of next states and
            their probabilities.
        limit (int): The maximum number of states sampled after the start
            state. Each sampled state is appended whole, so a state made of
            several words adds several words to the story.
        start (str): The starting state (words) for the story.

    Returns:
        str: The start state followed by the sampled states, separated by
             single spaces. Generation stops early when the current state
             has no entry in the model.
    """
    pieces = [start]
    curr_state = start
    sampled = 0

    while True:
        # Stop at the limit, or at a state with no recorded transitions.
        if sampled >= limit or curr_state not in markov_model:
            break

        # Draw one successor, weighted by its transition probability.
        options = markov_model[curr_state]
        curr_state = random.choices(list(options), weights=list(options.values()))[0]
        pieces.append(curr_state)
        sampled += 1

    return " ".join(pieces).strip()

# Example usage: Generate and print 20 short stories.
# limit=8 allows up to 8 sampled states after the start state; with the
# two-word states built above, each sampled state adds two words.
for i in range(20):
    story = generate_story(markov_model=markov_model, start="dear holmes", limit=8)
    print(f"{i}. {story}")

print("-------------------")
print("A longer story:")
# Same generator with a different start state and up to 100 sampled states.
print(generate_story(markov_model, start="the case", limit=100))


0. dear holmes i fear that this second man that has to be in better form both mental and
1. dear holmes that i should register and interject if i could have laughed when i realized it as
2. dear holmes and tell her that he loved but the path ran right on to kilburn there was
3. dear holmes he has had his love but now i asked he has given us a yellow bar
4. dear holmes am i sir he wasnt a dead man i heard a cry and supported himself against
5. dear holmes i ejaculated as a child but when chance threw you in my thoughts standing there on
6. dear holmes i ejaculated my dear holmes said he laying down his pipe and skipping over the pages
7. dear holmes i fear a very stout portly man in the world for thoroughness and method of it
8. dear holmes i have not seen this lonely silent house and the fury of the man blessington approached
9. dear holmes i have as you have those ive been dreaming of the bright spring sunshine behind one
10. dear holmes i ejaculated precisely so said mr holmes and yo

# Second example of Normalized Nerd