### Import libraries and set up random number generator

In [2]:
import numpy as np
from numpy.random import default_rng
# Fixed seed so the random draws (and therefore the generated text) can be
# reproduced when the cells are re-run in the same order.
seed = 23
# Module-level NumPy generator; get_next_state samples transitions from it.
rng = default_rng(seed)

In [29]:
def text_to_list(text,history_size=1):
    """
    Description: Split a string of text into a list of tokens, keeping
                 punctuation as tokens of its own (these are the states of the
                 markov chain). Optionally also build multitoken elements so
                 that a history of tokens can be considered (this breaks the
                 markov property but is interesting to explore).

                 Tokenization rules:
                   - '.', '!' and '?' always become separate tokens.
                   - ',', ';' and ':' become separate tokens only when they
                     are followed by a space.
                   - double quotes are removed.
                   - tabs and newlines become the tokens <TAB> and <NL>.

    Input: text - A string of text.
           history_size - The number of tokens per multitoken element
                          (must be at least 1).

    Returns: - A list of the individual tokens in chronological order.
             - A list of the multitoken elements in chronological order
               (None if history_size is 1).

    Raises: ValueError if history_size is smaller than 1.
    """
    # a window of zero or negative width has no meaning and could make the
    # window loop below spin forever, so reject it up front
    if history_size < 1:
        raise ValueError("history_size must be at least 1")

    # Pad punctuation with spaces so that split() keeps it as its own token.
    # The pairs are applied in order. The tab replacement is padded on both
    # sides so that <TAB> never fuses with the word that follows it.
    replacements = [
        (".", " . "), ("!", " ! "), ("?", " ? "),
        (", ", " , "), ("; ", " ; "), (": ", " : "),
        ("\"", ""),
        ("\t", " <TAB> "), ("\n", " <NL> "),
    ]
    for old, new in replacements:
        text = text.replace(old, new)

    # split on whitespace; the padding above makes every token stand alone
    text_list = text.split()

    # if each token is a state then we are done
    if history_size == 1:
        return text_list, None

    # tokens that close a sentence
    end_tokens = (".", "?", "!")

    # slide a window of history_size tokens over the token list
    multitoken_items = []
    idx = 0
    while idx + history_size <= len(text_list):
        window = text_list[idx:idx + history_size]
        multitoken_items.append(' '.join(window))

        # When the window closes a sentence, jump past it so that the next
        # element does not start with the end punctuation (e.g. '. It').
        # Otherwise advance by a single token.
        idx += history_size if window[-1] in end_tokens else 1

    return text_list, multitoken_items

In [30]:
# Quick check of the tokenizer with two-token states (history_size=2);
# the returned tuple is the value displayed by this cell.
text = "My name is Geffen Cooper. What is yours?"
text_to_list(text,2)

(['My', 'name', 'is', 'Geffen', 'Cooper', '.', 'What', 'is', 'yours', '?'],
 ['My name',
  'name is',
  'is Geffen',
  'Geffen Cooper',
  'Cooper .',
  'What is',
  'is yours',
  'yours ?'])

In [326]:
def gen_word_dist(token_list, history_size=1,multitoken_list=None):
    """
    Description: Build a markov chain from an ordered list of tokens. Every
                 unique token is a state. The chain is a nested dictionary:
                 the outer keys are the states, and each inner dictionary maps
                 a token that directly followed that state in the list to the
                 fraction of times it did so. A token that is never followed
                 by anything maps to an empty dictionary.

    Input: token_list - An ordered list of individual text tokens.
           history_size - Number of past tokens to consider. Accepted for
                          interface compatibility; not read by this function.
           multitoken_list - List of multitoken elements for history_size > 1.
                             Accepted for interface compatibility; not read by
                             this function.

    Returns: A nested dictionary representing the Markov Chain.
    """
    # one (initially empty) table of followers per distinct token
    chain = {state: {} for state in set(token_list)}

    # walk the consecutive (token, following token) pairs and count them
    for current, following in zip(token_list, token_list[1:]):
        followers = chain[current]
        followers[following] = followers.get(following, 0) + 1

    # turn each table of counts into relative frequencies
    for followers in chain.values():
        total = sum(followers.values())
        for following, count in followers.items():
            followers[following] = count / total

    return chain

In [327]:
def get_next_state(current_state):
    """
    Description: Take a given state in the markov chain and select the next
                 state probabilistically, using the module-level generator
                 rng.

    Input: current_state - The current state as a dictionary that maps each
                           possible next state to its probability.

    Returns: The next state, wrapped in a NumPy array with a single element
             (rng.choice is called with size=1).

    Raises: ValueError if current_state has no outgoing states.
    """
    # a state without followers cannot be sampled; fail with a clear message
    # instead of an IndexError on the rounding fix below
    if not current_state:
        raise ValueError("current_state has no outgoing states to sample from")

    out_states = list(current_state.keys())
    probs = list(current_state.values())

    # Fold any floating-point rounding error into the first probability so
    # that the values passed to rng.choice sum to 1.
    probs[0] += 1-sum(probs)
    return rng.choice(a=out_states,size=1,p=probs)

In [328]:
def generate_text(first_word,num_sentences,markov_chain):
    """
    Description: Step through the markov chain to produce sentences and
                 return them as formatted text. A sentence is counted each
                 time '.', '?' or '!' is generated. Generation also stops
                 early if it reaches a state with no outgoing transitions
                 (for example a token that only occurs at the very end of the
                 corpus). If no sentence-ending token and no such dead end
                 is ever reached, the loop does not terminate.

    Input: first_word - The first word (must be a key of markov_chain).
           num_sentences - The number of desired sentences.
           markov_chain - The markov chain dictionary (state -> dictionary of
                          next state -> probability).

    Returns: The formatted output text.

    Raises: KeyError if first_word is not a key of markov_chain.
    """
    end_tokens = (".", "?", "!")

    # get the state from the desired first word
    curr_state = markov_chain[first_word]

    # start the text sequence
    text = [first_word]

    sentence_count = 0
    # keep adding words until we reach the sentence count
    while sentence_count < num_sentences:
        # nothing can follow a state without outgoing transitions, so stop
        # here and return what has been generated so far
        if not curr_state:
            break

        # get_next_state returns a one-element array; unwrap the plain string
        next_token = get_next_state(curr_state).tolist()[0]
        text.append(next_token)

        # move to the state of the word that was just generated
        curr_state = markov_chain[next_token]

        # check if the sentence ended
        if next_token in end_tokens:
            sentence_count += 1

    # format the text sequence: re-attach punctuation to the preceding word
    text_string = " ".join(text)
    text_string = text_string.replace(" . ",". ").replace(" , ",", ").replace(" ; ","; ").replace(" ! ","! ").replace(" ? ","? ").replace(" : ",": ")
    # turn the special tokens back into the characters they stand for
    text_string = text_string.replace("<TAB> ","\t").replace("<NL> ","\n")

    # The closing punctuation has no trailing space, so the replacements above
    # miss it. Drop the space in front of it, but only when it is really
    # there: the text may be a lone first word or may have stopped early.
    if text_string.endswith((" .", " ?", " !")):
        text_string = text_string[:-2] + text_string[-1]
    return text_string

In [426]:
# Corpora that can be used to build the chain. Only the selected one is read;
# change corpus_path to try another.
corpus_paths = [
    '1789-04-30-first-inaugural-address.txt',
    '2017-01-20-inaugural-address.txt',
    'email.txt',
]
corpus_path = corpus_paths[-1]

with open(corpus_path,encoding='utf-8') as f:
    contents = f.read()

# text_to_list returns (tokens, multitoken elements); the chain is built from
# the single tokens only, so the second item is discarded
tokens, _ = text_to_list(contents)
d = gen_word_dist(tokens)
print(generate_text("Hi",2,d))


Hi all, 
Hi all, 
Hi all, 
Best, 


Hi all, 
I think it would be good to extend the office is going to answer questions over zoom since my office hour on Monday: 1) implementing ML algorithm(s) learned in the midterm exam. The final project report.
