In [8]:
import nltk
import re
import numpy as np

# Simple generation of random text based on bigrams

In [9]:
# Plain-text corpus reader rooted at "lovecraft"; the ".*" pattern is the file
# selector, so every file found under that root is part of the corpus.
# NOTE(review): the root is a relative path, so this presumably expects the
# notebook to be run from the folder that contains "lovecraft" — confirm.
lovecraft = nltk.corpus.PlaintextCorpusReader("lovecraft", ".*")

In [15]:

def generate_model_from_max_probable_word(cfdist, word, num=15):
    """Greedily build a word chain by always following the most frequent successor.

    Args:
        cfdist: Mapping from a word to the frequency distribution of the words
            that follow it; each distribution must provide ``max()``
            (e.g. an ``nltk.ConditionalFreqDist`` built from bigrams).
        word: Seed word; it is always the first element of the result.
        num: Maximum number of successors to append after the seed word.

    Returns:
        A list holding the seed word followed by up to ``num`` generated words.
        The chain ends early when the current word has no recorded successor.
    """
    words = [word]
    for _ in range(num):
        successors = cfdist[word]
        # A word that was never followed by anything has an empty distribution;
        # asking it for max() would raise, so the chain simply ends here.
        if not successors:
            break
        word = successors.max()
        words.append(word)
    return words


def get_random_word_from_fd(fd):
    """Draw one word at random, weighted by its frequency in ``fd``.

    Args:
        fd: Frequency distribution exposing ``items()`` (word, count pairs)
            and ``N()`` (total count), e.g. an ``nltk.FreqDist``.

    Returns:
        A single word from ``fd``; a word with count ``c`` is returned with
        probability ``c / fd.N()``.

    Raises:
        ValueError: If ``fd`` holds no samples, so there is nothing to draw.
    """
    # Read the total once: it is the same for every word, and recomputing it
    # per item makes the function quadratic in the vocabulary size.
    total = fd.N()
    if total <= 0:
        raise ValueError("cannot draw a word from an empty frequency distribution")

    # Split the distribution into parallel sequences: candidates and probabilities.
    words, counts = zip(*fd.items())
    probabilities = [count / total for count in counts]

    # A single multinomial trial yields a one-hot vector; the position of the 1
    # is the index of the chosen word.
    one_hot = np.random.multinomial(1, probabilities)
    return words[int(np.argmax(one_hot))]


def generate_model_from_word(cfdist, word, num=15):
    """Build a random word chain that starts at ``word``.

    Each next word is drawn from the successors of the current word, weighted
    by how often each successor followed it.

    Args:
        cfdist: Mapping from a word to the frequency distribution of the words
            that follow it (e.g. an ``nltk.ConditionalFreqDist`` of bigrams).
        word: First word of the chain.
        num: Number of words to generate, including ``word`` itself.

    Returns:
        A list of at most ``num`` words beginning with ``word``. The list is
        shorter than ``num`` only when a word with no recorded successor is
        reached, because the chain cannot continue past it.
    """
    words = []
    for position in range(num):
        words.append(word)
        if position == num - 1:
            # The requested length is reached; drawing another word would be
            # wasted work and could fail on a word that has no successors.
            break
        successors = cfdist[word]
        if not successors:
            # Dead end: nothing ever followed this word.
            break
        word = get_random_word_from_fd(successors)
    return words


def generate_model(cfdist, fd, num=15):
    """Generate a random word chain whose first word is itself drawn at random.

    Args:
        cfdist: Mapping from a word to the frequency distribution of the words
            that follow it.
        fd: Frequency distribution of all words, used to draw the first word.
        num: Number of words to generate.

    Returns:
        The list of words produced by ``generate_model_from_word`` when started
        from the randomly drawn first word.
    """
    # Seed the chain from the overall word frequencies, then reuse the shared
    # chain builder instead of repeating its loop here.
    first_word = get_random_word_from_fd(fd)
    return generate_model_from_word(cfdist, first_word, num)

In [16]:
# Tokens of the whole corpus, keeping only the purely alphabetic ones
# (numbers, punctuation symbols, etc. are dropped).
text = [token for token in lovecraft.words() if token.isalpha()]

# Frequency of each word: how many times it appears in the filtered text.
fd = nltk.FreqDist(text)

# Bigrams of the text, i.e. the pairs of adjacent words that appear in it.
# NOTE(review): in NLTK 3 `nltk.bigrams` returns a generator, so `bigrams` is
# presumably exhausted once the next statement has consumed it — confirm
# before reusing it in a later cell.
bigrams = nltk.bigrams(text)

# For each word, the frequency distribution of the words that follow it.
cfd = nltk.ConditionalFreqDist(bigrams)

In [28]:
# Fix the NumPy seed so that re-running this cell reproduces the same random
# text (the random draws below go through the global np.random state).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Generate 200 words, each time drawing a random word from the possible
# successors of the previous one according to their probability distribution.
random_words = generate_model_from_word(cfd, "Cthulhu", num=200)

# Generate a chain that always takes the most probable next word (num=200 steps).
random_text_with_max_probability = generate_model_from_max_probable_word(cfd, "Cthulhu", num=200)

print("Generate 200 words selecting each time a random word from the next possible ones acccording to its probability distribution:\n\n{}\n\n\n".format(random_words))

print("Generate 200 words based on the max probability of the next word:\n\n{}\n\n\n".format(random_text_with_max_probability))

Generate 200 words selecting each time a random word from the next possible ones acccording to its probability distribution:

['Cthulhu', 'R', 'lyehian', 'which', 'were', 'many', 'of', 'one', 'ever', 'saw', 'that', 'I', 'am', 'crazy', 'things', 'which', 'atoned', 'for', 'this', 'abhorred', 'and', 'down', 'the', 'stony', 'cliff', 'the', 'companionship', 'of', 'its', 'outlines', 'of', 'suction', 'With', 'Sunne', 'in', 'Greek', 'inscription', 'above', 'the', 'high', 'rock', 'when', 'we', 'floated', 'from', 'shots', 'broke', 'off', 'the', 'James', 'Manning', 'was', 'a', 'white', 'fungous', 'growths', 'predominated', 'some', 'day', 'you', 'were', 'like', 'the', 'South', 'who', 'assisted', 'in', 'the', 'early', 'existence', 'At', 'twenty', 'seventh', 'of', 'the', 'creaking', 'sound', 'of', 'the', 'solution', 'was', 'said', 'but', 'the', 'Gugs', 'kingdom', 'would', 'be', 'made', 'Asenath', 's', 'more', 'The', 'living', 'denizen', 'of', 'any', 'known', 'till', 'it', 'Drawing', 'inside', 'coat'

In [25]:
def format_random_text(generated_words):
    """Join generated words into text, inserting sentence-ending periods.

    A period is placed after a word when the word that follows it starts with
    an uppercase letter, is not the pronoun "I", and is not tagged "NNP" or
    "POS" by ``nltk.tag.pos_tag``. The last word always gets a period.

    Args:
        generated_words: Iterable of word strings, in order.

    Returns:
        The words joined by single spaces with the periods added, or an empty
        string when there are no words.
    """
    words = list(generated_words)
    if not words:
        # Nothing to tag or join.
        return ""

    # Tags are kept by position: a word that occurs several times may be tagged
    # differently at each occurrence, which a word -> tag dict would collapse
    # into the tag of its last occurrence.
    tags = [tag for _, tag in nltk.tag.pos_tag(words)]

    pieces = []
    last_index = len(words) - 1
    for index, current in enumerate(words):
        if index == last_index:
            pieces.append(current + ".")
            continue
        following = words[index + 1]
        # Slicing instead of indexing keeps an empty string from raising.
        starts_sentence = (
            following[:1].isupper()
            and tags[index + 1] not in ("NNP", "POS")
            and following != "I"
        )
        pieces.append((current + ".") if starts_sentence else current)

    return " ".join(pieces).strip()

In [30]:
# Show both generated word lists as readable text, with the sentence-ending
# periods added by format_random_text.
print(
    "Generate 200 word text selecting each time a random word from the next possible ones acccording to its probability distribution:\n\n{}\n\n\n".format(
        format_random_text(random_words)
    )
)

print(
    "Generate 200 words based on the max probability of the next word:\n\n{}\n\n\n".format(
        format_random_text(random_text_with_max_probability)
    )
)


Generate 200 word text selecting each time a random word from the next possible ones acccording to its probability distribution:

Cthulhu R lyehian which were many of one ever saw that I am crazy things which atoned for this abhorred and down the stony cliff the companionship of its outlines of suction. With Sunne in Greek inscription above the high rock when we floated from shots broke off the James Manning was a white fungous growths predominated some day you were like the South who assisted in the early existence. At twenty seventh of the creaking sound of the solution was said but the Gugs kingdom would be made Asenath s more. The living denizen of any known till it Drawing inside coat lapel and was a notably ferns calamites whose windings of the torture myself and means as little windows at face that you at the library of her I saw the most of their linkage of the vegetation in William J C IVLIVS VERVS MAXIMINVS THE NAMELESS CITY. When it was a development occurred and confront th