In [119]:
import numpy as np
from numpy.random import default_rng
seed = 23
rng = default_rng(seed)

In [144]:
values = ['a','b','c']
p = [0.1,0.5,0.39]
p[0] += 1-sum(p)
rng.choice(a=values,size=1,p=p)

array(['b'], dtype='<U1')

In [120]:
'''
Description: This function takes a string of text and turns it into a list where
             each element is a word or punctuation.

Input: A string of text.

Returns: A list of the individual words and periods in order.
'''

def text_to_list(text):
    # insert spaces so we can use the split function while keeping punctuation
    text = text.replace(". "," . ").replace(", "," , ").replace("; "," ; ").replace("! "," ! ").replace("? "," ? ").replace(": "," : ").replace("\"","")

    # replace special characters with tokens
    text = text.replace("\t", " <TAB>").replace("\n", " <NL> ")

    # now split the text using a space as the delimeter
    items = text.split()
    return items

In [122]:
'''
Description: This function takes the ordered list of text 'tokens' and creates
             a markov chain representation from it. Each token represents a 
             state and the token immediately following represents a potential
             next state. We can represent this as a nested dictionary where at
             the first level we have all the unique words in the corpus (states) and at
             the second level we have all the potential next states and their
             relative probability of following.

Input: A list of text tokens.

Returns: A nested dictionary representing the Markov Chain.
'''
def gen_word_dist(token_list):
    # create the first level from the unique tokens (all the states)
    unique_tokens = set(token_list)
    text_dict = dict.fromkeys(unique_tokens)

    # create a nested dictionary for each unique token (all the outgoing states)
    for token in text_dict.keys():
        text_dict[token] = {}
    
    # now add the words that follow each unique token
    # where the key is the following word and the value is the count
    for idx,token in enumerate(token_list[1:]):
        try: # try to increment the count of the token
            text_dict[token_list[idx]][token] += 1
        except KeyError: # otherwise set it as the first occurence
            text_dict[token_list[idx]][token] = 1

    # now we convert the counts to probabilities
    for state in text_dict.keys():
        total = sum(text_dict[state].values())
        for out_state in text_dict[state].keys():
            text_dict[state][out_state] = text_dict[state][out_state]/total
            
    return text_dict

In [156]:
'''
Description: This function will take a given state in the markov chain and
             select the next state probabilistically.

Input: The current state as a dictionary.

Returns: The next state.
'''

def get_next_state(current_state):
    out_states = list(current_state.keys())
    probs = list(current_state.values())
    
    probs[0] += 1-sum(probs)
    return rng.choice(a=out_states,size=1,p=probs)

In [250]:
'''
Description: This function will step through the generated markov chain
             to produce sentences based on the input parameters. It will
             format the text as well.

Input: The first word (must be in the corpus), the number of desired sentences,
       the markov chain dictionary

Returns: The formatted output text.
'''

def generate_text(first_word,num_sentences,markov_chain):
    # get the state from the desired first word
    curr_state = markov_chain[first_word]

    # start the text sequence
    text = [first_word]

    sentence_count = 0
    # keep adding words until we reach the sentence count
    while sentence_count < num_sentences:
        # get the next word
        next_state = get_next_state(curr_state)

        # add this word to the text sequence (only get the string)
        text.append(next_state.tolist()[0])

        # set the current state to the next state (only get the string)
        curr_state = markov_chain[next_state[0]]

        # check if the sentence ended
        if next_state[0] == '.' or next_state[0] == '?' or next_state[0] == '!':
            sentence_count += 1

    # format the text sequence into a sentence
    text_string = " ".join(text)
    text_string = text_string.replace(" . ",". ").replace(" , ",", ").replace(" ; ","; ").replace(" ! ","! ").replace(" ? ","? ").replace(" : ",": ")
    text_string = text_string.replace("<TAB>","\t").replace("<NL>","\n")
    text_string_final = text_string[:-2]+text_string[-1]
    return text_string_final

In [252]:
text = "Hello, my name is Geffen! What is Yours?\n hi"
with open('1789-04-30-first-inaugural-address.txt') as f:
    contents = f.read()

d = gen_word_dist(text_to_list(contents))
print(generate_text("The",2,d))


The circumstances, on another, who, and advantageously promoted. Instead of this transcendent proof, nor those of public summons, are none under which I must depend.


In [70]:
with open('1789-04-30-first-inaugural-address.txt') as f:
    contents = f.read()
#print(text_to_list(contents))
d = dict.fromkeys(set(text_to_list(contents)))
print(id(d['the']))

140713966737536
