In [1]:
import nltk
import re
import numpy as np

# Simple generation of random text based on bigrams

In [2]:
lovecraft = nltk.corpus.PlaintextCorpusReader("lovecraft", ".*")

In [3]:

def generate_model_from_max_probable_word(cfdist, word, num=15):
    """Greedily extend ``word`` with the most frequent successor at each step.

    Args:
        cfdist: Mapping from a word to the frequency distribution of the words
            that follow it (e.g. an ``nltk.ConditionalFreqDist`` built from
            bigrams). Each distribution must provide ``max()``.
        word: Seed word; it is always the first element of the result.
        num: Maximum number of successors to append after the seed word.

    Returns:
        A list starting with the seed word followed by up to ``num`` generated
        words (``num + 1`` entries when no dead end is reached). Generation
        stops early if the current word has no recorded successor.
    """
    words = [word]
    for _ in range(num):
        # Test membership before indexing: nltk.ConditionalFreqDist behaves
        # like a defaultdict, so a bare cfdist[word] on an unseen word would
        # silently insert an empty distribution into the model.
        if word not in cfdist or not cfdist[word]:
            # Dead end (e.g. the last word of the corpus): calling max() on an
            # empty distribution would raise, so stop with what we have.
            break
        word = cfdist[word].max()
        words.append(word)
    return words


def get_random_word_from_fd(fd):
    """Draw one word from ``fd`` with probability proportional to its count.

    Args:
        fd: Mapping from word to its (non-negative) number of occurrences,
            e.g. an ``nltk.FreqDist`` or a ``collections.Counter``.

    Returns:
        One key of ``fd``, chosen at random using NumPy's global RNG.

    Raises:
        ValueError: If ``fd`` is empty or all of its counts are zero, since no
            probability distribution can be formed.
    """
    # keys() and values() of the same unmodified mapping iterate in the same
    # order, so words[i] corresponds to counts[i].
    words = list(fd.keys())
    counts = np.array(list(fd.values()), dtype=float)

    total = counts.sum()
    if total <= 0:
        raise ValueError("cannot sample a word from an empty frequency distribution")

    # Sample an index rather than the words themselves so the function returns
    # the original Python object instead of a NumPy string scalar.
    index = np.random.choice(len(words), p=counts / total)
    return words[index]


def generate_model_from_word(cfdist, word, num=15):
    """Generate a random word chain that starts at ``word``.

    Each following word is sampled from the successors of the current word,
    weighted by how often each successor was observed.

    Args:
        cfdist: Mapping from a word to the frequency distribution of the words
            that follow it (e.g. an ``nltk.ConditionalFreqDist`` of bigrams).
        word: Seed word; it is the first element of the result when ``num > 0``.
        num: Maximum number of words to return, seed word included.

    Returns:
        A list of at most ``num`` words. It is shorter than ``num`` only when
        a word with no recorded successor is reached.
    """
    words = []
    for _ in range(num):
        words.append(word)
        if len(words) == num:
            # The chain is complete; sampling another successor would be
            # wasted work and could fail needlessly on a dead-end last word.
            break
        # Test membership before indexing: nltk.ConditionalFreqDist behaves
        # like a defaultdict, so indexing an unseen word would insert an
        # empty distribution into the model.
        if word not in cfdist or not cfdist[word]:
            # Dead end: nothing to sample from, return the chain built so far.
            break
        word = get_random_word_from_fd(cfdist[word])
    return words


def generate_model(cfdist, fd, num=15):
    """Generate a random word chain whose first word is itself drawn at random.

    Args:
        cfdist: Mapping from a word to the frequency distribution of the words
            that follow it (e.g. an ``nltk.ConditionalFreqDist`` of bigrams).
        fd: Frequency distribution of single words, used to pick the first
            word with probability proportional to its count.
        num: Number of words to generate.

    Returns:
        The list of generated words; empty when ``num`` is not positive.
    """
    if num <= 0:
        # Nothing to generate, so do not draw a start word that would be
        # thrown away (and do not fail on an empty ``fd``).
        return []

    start_word = get_random_word_from_fd(fd)
    # The chain itself is exactly what generate_model_from_word builds; reuse
    # it instead of keeping a second copy of the same loop.
    return generate_model_from_word(cfdist, start_word, num=num)

In [4]:
# Keep only purely alphabetic tokens; numbers, punctuation and other symbols
# are dropped before any statistics are computed.
text = list(filter(str.isalpha, lovecraft.words()))

# Unigram counts: how many times each word occurs in the filtered text.
fd = nltk.FreqDist(text)

# Consecutive word pairs (w_i, w_i+1) of the filtered text.
bigrams = nltk.bigrams(text)

# For every word, the frequency distribution of the words that follow it.
cfd = nltk.ConditionalFreqDist(bigrams)

In [5]:
# Seed NumPy's global random generator so the sampled text is identical on
# every "Restart Kernel -> Run All" and when this cell is re-run on its own.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Generate 200 words, each time drawing the next word at random from the
# possible successors according to their probability distribution.
random_words = generate_model_from_word(cfd, "Cthulhu", num=200)

# Generate text by always taking the most probable next word. The returned
# list starts with the seed word itself, followed by the generated successors.
random_text_with_max_probability = generate_model_from_max_probable_word(cfd, "Cthulhu", num=200)

print("Generate 200 words selecting each time a random word from the next possible ones acccording to its probability distribution:\n\n{}\n\n\n".format(random_words))

print("Generate 200 words based on the max probability of the next word:\n\n{}\n\n\n".format(random_text_with_max_probability))

Generate 200 words selecting each time a random word from the next possible ones acccording to its probability distribution:

['Cthulhu', 'still', 'be', 'that', 'the', 'investigations', 'Late', 'in', 'a', 'comets', 'suns', 'and', 'cut', 'from', 'the', 'lake', 'itself', 'into', 'that', 'if', 'as', 'loudly', 'and', 'Atal', 'was', 'then', 'jest', 'then', 'remain', 'as', 'low', 'sun', 'rose', 'and', 'had', 'predicted', 'that', 'these', 'men', 'have', 'said', 'so', 'sensationally', 'reported', 'seeing', 'hearing', 'voices', 'in', 'the', 'crowd', 'fancied', 'that', 'which', 'I', 'spent', 'and', 'with', 'their', 'more', 'than', 'Mr', 'Brown', 'amp', 'O', 'course', 'determined', 'to', 'find', 'the', 'great', 'problem', 'is', 'not', 'any', 'message', 'came', 'to', 'divert', 'my', 'talk', 'of', 'chemicals', 'which', 'I', 'fled', 'from', 'things', 'in', 'the', 'decayed', 'louvre', 'boarded', 'up', 'paper', 'Kuranes', 'was', 'less', 'explicable', 'by', 'another', 'bulletin', 'announced', 'the', 's

In [6]:
# Turn a list of generated words into sentence-like text
def format_random_text(generated_words, tagger=None):
    """Join generated words into text, inserting sentence-ending periods.

    A "." is appended to a word when the following word starts with an
    uppercase letter, is not the pronoun "I", and is not tagged "NNP" or
    "POS" at that position. The last word always gets a ".".

    Args:
        generated_words: Iterable of word strings.
        tagger: Optional callable that takes the list of words and returns a
            list of ``(word, tag)`` pairs in the same order. Defaults to
            ``nltk.tag.pos_tag``.

    Returns:
        The formatted text, or an empty string when there are no words.
    """
    # Materialize once; re-creating the list inside the loop made the
    # original quadratic in the number of words.
    words = list(generated_words)
    if not words:
        return ""

    if tagger is None:
        tagger = nltk.tag.pos_tag

    # Keep the tags by position. Going through a {word: tag} dict would give
    # every occurrence of a repeated word the tag of its last occurrence.
    tags = [tag for _, tag in tagger(words)]

    last_index = len(words) - 1
    pieces = []
    for i, word in enumerate(words):
        if i == last_index:
            pieces.append(word + ".")
            continue

        next_word = words[i + 1]
        # [:1] instead of [0] so an empty token cannot raise IndexError.
        next_starts_sentence = (
            next_word[:1].isupper()
            and tags[i + 1] not in ("NNP", "POS")
            and next_word != "I"
        )
        pieces.append((word + ".") if next_starts_sentence else word)

    return " ".join(pieces).strip()

In [7]:
# Examples: print each generated word list as formatted text under its heading
for _heading, _generated in (
    (
        "Generate 200 word text selecting each time a random word from the next possible ones acccording to its probability distribution:\n\n{}\n\n\n",
        random_words,
    ),
    (
        "Generate 200 words based on the max probability of the next word:\n\n{}\n\n\n",
        random_text_with_max_probability,
    ),
):
    print(_heading.format(format_random_text(_generated)))


Generate 200 word text selecting each time a random word from the next possible ones acccording to its probability distribution:

Cthulhu still be that the investigations Late in a comets suns and cut from the lake itself into that if as loudly and Atal was then jest then remain as low sun rose and had predicted that these men have said so sensationally reported seeing hearing voices in the crowd fancied that which I spent and with their more than Mr Brown amp O course determined to find the great problem is not any message came to divert my talk of chemicals which I fled from things in the decayed louvre boarded up paper Kuranes was less explicable by another bulletin announced the same things that in for the cliff as brought from Nahum s death of the witch s books ranged along deserted brick blocks loose upon the rest tall lighthouse keeper of it until one of Celephais in the horror was frankly that others. Viscous obstacles. It was a frontage of a frenzied and were unnerving influen