### Import libraries and set up random number generator

In [31]:
import numpy as np
from numpy.random import default_rng
seed = 23
rng = default_rng(seed)

In [32]:
'''
Description: This function takes a string of text and turns it into a list where
             each element represents  a single text 'token' including punctuation
             (These are the states of the markov chain). If desired, we can also
             split the text into multitoken elements so that we can consider a
             history of tokens (this breaks the markov property but is interesting
             to explore). 

Input: text - A string of text
       history_size - the number of tokens per element.

Returns: - A list of individual tokens elements in chronological order,
         - A list of all the multitoken elements in chronological order
           (None if history size is 1).
'''

def text_to_list(text,history_size=1):
    # insert spaces so we can use the split function while keeping punctuation
    text = text.replace(". "," . ").replace("! "," ! ").replace("? "," ? ").replace(", "," , ")\
               .replace("; "," ; ").replace(": "," : ").replace("\"","").replace(".\n"," . <NL> ")\
               .replace(",\n"," , <NL> ").replace("!\n"," ! <NL> ").replace("?\n"," ? <NL> ")\
               .replace(":\n"," : <NL> ")

    # replace special characters with tokens
    text = text.replace("\t", " <TAB>").replace("\n", " <NL> ")

    # save the end tokens
    end_tokens = [".", "?", "!"]

    # now split the text using a space as the delimeter
    text_list = text.split()

    # if each token is a state then we are done
    if history_size == 1:
        return text_list, None

    # for multitoken states, combine items in the list
    multitoken_items = []
    idx = 0
    while idx < len(text_list):
        # handle ending punctuation to avoid cases where not at end (e.g. [' . It was'])
        if idx+history_size-1 >= len(text_list):
            break
        # if text_list[idx+history_size-1] in end_tokens:
        #     multitoken_items.append(' '.join(text_list[idx:idx+history_size]))
        #     idx = idx+history_size
        #     continue

        # combine items based on state size
        multitoken_items.append(' '.join(text_list[idx:idx+history_size]))
        idx += 1

    return text_list, multitoken_items

In [33]:
text = "My name is Geffen Cooper. What is yours? "
text_to_list(text,2)

(['My', 'name', 'is', 'Geffen', 'Cooper', '.', 'What', 'is', 'yours', '?'],
 ['My name',
  'name is',
  'is Geffen',
  'Geffen Cooper',
  'Cooper .',
  '. What',
  'What is',
  'is yours',
  'yours ?'])

In [34]:
'''
Description: This function takes the ordered text list and creates
             a markov chain representation from it. Each token represents a state 
             and the token immediately following represents a potential
             next state. We can represent this as a nested dictionary where at
             the first level we have all the unique words in the corpus (i.e. the states)
             and at the second level we have all the potential next states and
             their relative probability of following.
             
             When the history size is greater than one, we will use multiple words to 
             determine the next word. This technically breaks the markov property but
             will enable more realistic text by considering the history of words instead
             of only the current one.

Input: token_list - An ordered list of individual text tokens.
       history_size - Number of past tokens to consider
       multitoken_list - List of multitoken elements for history_size > 1

Returns: A nested dictionary representing the Markov Chain. In the case where the
         history size is greater than 1, the first level will be all the unique
         multitoken elements and the second level will be the possible next individual words.
'''
def gen_word_dist(token_list, history_size=1,multitoken_list=None):

    # case when only consider the current token, markov property is true
    if history_size == 1:
        # create the first level from the unique tokens (all the states)
        unique_tokens = set(token_list)
        text_dict = dict.fromkeys(unique_tokens)

        # create a nested dictionary for each unique token (all the outgoing states)
        for token in text_dict.keys():
            text_dict[token] = {}
        
        # now add the words that follow each unique token
        # where the key is the following word and the value is the count
        for idx,token in enumerate(token_list[1:]):
            try: # try to increment the count of the token
                text_dict[token_list[idx]][token] += 1
            except KeyError: # otherwise set it as the first occurence
                text_dict[token_list[idx]][token] = 1

        # now we convert the counts to probabilities
        for state in text_dict.keys():
            total = sum(text_dict[state].values())
            for out_state in text_dict[state].keys():
                text_dict[state][out_state] = text_dict[state][out_state]/total
                
        return text_dict
    
    # case when past tokens are considered, markov property is broken
    else:
        end_tokens = [".", "?", "!"]
        # create the first level from the unique tokens (all the states)
        unique_sequences = set(multitoken_list)
        text_dict = dict.fromkeys(unique_sequences)

        # create a nested dictionary for each unique token (all the outgoing states)
        for sequence in text_dict.keys():
            text_dict[sequence] = {}
        
        # now add the words that follow each unique sequence
        # where the key is the following word and the value is the count
        token_idx = history_size
        for idx,sequence in enumerate(multitoken_list[:-1]):
            try: # try to increment the count of the token
                text_dict[multitoken_list[idx]][token_list[token_idx]] += 1
            except KeyError: # otherwise set it as the first occurence
                text_dict[multitoken_list[idx]][token_list[token_idx]] = 1
            # if multitoken_list[idx][-1] in end_tokens:
            #     token_idx += history_size
            # else:
            #     token_idx += 1
            token_idx += 1

        # now we convert the counts to probabilities
        for state in text_dict.keys():
            total = sum(text_dict[state].values())
            for out_state in text_dict[state].keys():
                text_dict[state][out_state] = text_dict[state][out_state]/total
                
        return text_dict

In [41]:
text = "He is very tall.\nHe is very happy. "
tl, ml = text_to_list(text,3)
gen_word_dist(tl,3,ml)

{'tall . <NL>': {'He': 1.0},
 '<NL> He is': {'very': 1.0},
 'very happy .': {},
 'He is very': {'tall': 0.5, 'happy': 0.5},
 'is very happy': {'.': 1.0},
 'very tall .': {'<NL>': 1.0},
 '. <NL> He': {'is': 1.0},
 'is very tall': {'.': 1.0}}

In [36]:
'''
Description: This function will take a given state in the markov chain and
             select the next state probabilistically.

Input: The current state as a dictionary.

Returns: The next state.
'''

def get_next_state(current_state):
    out_states = list(current_state.keys())
    probs = list(current_state.values())
    
    probs[0] += 1-sum(probs)
    return rng.choice(a=out_states,size=1,p=probs)

In [37]:
'''
Description: This function will step through the generated markov chain
             to produce sentences based on the input parameters. It will
             format the text as well.

Input: The first word (must be in the corpus), the number of desired sentences,
       the markov chain dictionary

Returns: The formatted output text.
'''

def generate_text(first_word,num_sentences,markov_chain, history_size=1):
    if history_size == 1:
        # get the state from the desired first word
        curr_state = markov_chain[first_word]

        # start the text sequence
        text = [first_word]

        sentence_count = 0
        # keep adding words until we reach the sentence count
        while sentence_count < num_sentences:
            # get the next word
            next_state = get_next_state(curr_state)

            # add this word to the text sequence (only get the string)
            text.append(next_state.tolist()[0])

            # set the current state to the next state (only get the string)
            curr_state = markov_chain[next_state[0]]

            # check if the sentence ended
            if next_state[0] == '.' or next_state[0] == '?' or next_state[0] == '!':
                sentence_count += 1

        # format the text sequence into a sentence
        text_string = " ".join(text)
        text_string = text_string.replace(" . ",". ").replace(" , ",", ").replace(" ; ","; ").replace(" ! ","! ").replace(" ? ","? ").replace(" : ",": ")
        text_string = text_string.replace("<TAB> ","\t").replace("<NL> ","\n")
        text_string_final = text_string[:-2]+text_string[-1]
        return text_string_final
    else:
        end_tokens = [".", "?", "!"]
        # get the state from the desired first word
        curr_state = markov_chain[first_word]

        # start the text sequence
        text = first_word.split()

        sentence_count = 0
        token_idx = history_size
        # keep adding words until we reach the sentence count
        while sentence_count < num_sentences:
            # get the next word
            next_word = get_next_state(curr_state)

            # add this word to the text sequence (only get the string)
            text.append(next_word.tolist()[0])

            # set the current state to the next state (only get the string)
            # if text[token_idx] in end_tokens: # when reach end, go to the start of the sentence
            #     token_idx += history_size

            try:
                curr_state = markov_chain[" ".join(text[token_idx-history_size+1:token_idx+1])]

            except KeyError:
                print("==== Exception ====")
                print(token_idx,text)
                print("\n\n")
                print(text[token_idx-history_size+1])
                print("\n\n")
                print(text[token_idx-history_size+1:token_idx+1])
                break
            # check if there is no next state
            if curr_state == {}:
                curr_state = markov_chain[first_word]

            # check if the sentence ended
            if next_word[0] == '.' or next_word[0] == '?' or next_word[0] == '!':
                sentence_count += 1
            
            token_idx += 1

        # format the text sequence into a sentence
        text_string = " ".join(text)
        text_string = text_string.replace(" . ",". ").replace(" , ",", ").replace(" ; ","; ").replace(" ! ","! ").replace(" ? ","? ").replace(" : ",": ")
        text_string = text_string.replace("<TAB> ","\t").replace("<NL> ","\n")
        text_string_final = text_string[:-2]+text_string[-1]
        return text_string_final

In [39]:
with open('1789-04-30-first-inaugural-address.txt',encoding='utf-8') as f:
    contents = f.read()

with open('2017-01-20-inaugural-address.txt',encoding='utf-8') as f:
    contents = f.read()

with open('email.txt',encoding='utf-8') as f:
    contents = f.read()

#d = gen_word_dist(text_to_list(contents)[0])
tl,ml = text_to_list(contents,3)
# print(tl)
# print("\n\n")
# print(ml)
d = gen_word_dist(tl,2,ml)
print(d)
# print(generate_text("Hi all ,",2,d,3))


{'not have a': {'a': 1.0}, 'over email if': {'if': 1.0}, 'then move to': {'to': 1.0}, ': <NL> <NL>': {'<NL>': 1.0}, 'office hour at': {'at': 1.0}, 'the first 3': {'3': 1.0}, 'a holiday .': {'.': 1.0}, 'Monday is a': {'a': 1.0}, 'still hold the': {'the': 1.0}, 'a class today': {'today': 1.0}, 'holiday , we': {'we': 1.0}, 'email if needed': {'needed': 1.0}, 'hours on Tuesday': {'Tuesday': 1.0}, 'on Monday and': {'and': 1.0}, '. You can': {'can': 1.0}, 'The final project': {'project': 1.0}, 'plots , and': {'and': 1.0}, '. The final': {'final': 1.0}, 'a real data': {'data': 1.0}, 'share your code': {'code': 1.0}, 'notes up to': {'to': 1.0}, 'material in lecture': {'lecture': 1.0}, 'laptop but you': {'you': 1.0}, 'will be attending': {'attending': 1.0}, 'set you find': {'find': 1.0}, 'survey paper discussing': {'discussing': 1.0}, 'continue on Wednesday': {'Wednesday': 1.0}, 'a 3-4 page': {'page': 1.0}, 'have office hours': {'hours': 1.0}, '. We are': {'are': 1.0}, 'show plots ,': {',': 1.0