In [1]:
import nltk
import re
import operator
from collections import defaultdict
import numpy as np

import matplotlib.pyplot as plt

The idea is generate more common sentences according to their word tagging. So the sentences will have the real structure written by lovecraft and composed by a list of most common words in that kind of sentence.

The result should be a somewhat real phrase.

In [2]:
lovecraft = nltk.corpus.PlaintextCorpusReader("lovecraft", ".*")

In [45]:
class TaggedWord(object):
    
    def __init__(self, words, count):
        self.word_hash = {}
        self.words = words
        self.count = count
        index = 0
        for word in words:
            self.word_hash[word] = index
            index += 1

    def update(self, word):
        word_index = self.word_hash.get(word)
        if word_index is not None:
            self.count[word_index] += 1
        else:
            self.words.append(word)
            self.count.append(1)
            word_index = len(self.words) - 1
            self.word_hash[word] = word_index
    
    def get_random(self, seed):
        np.random.seed(seed=seed)
        total_count = sum(self.count)
        probabilities = [word_count/total_count for word_count in self.count]
        random_word_chose = np.random.multinomial(1, probabilities)
        random_word_index = list(random_word_chose).index(1)
        return self.words[random_word_index]


class Sentence(object):
    
    def __init__(self, words, tags):
        self.tags = tags
        self.words = []
        for word in words:
            self.words.append(TaggedWord(words=[word.lower()], count=[1]))
        
    def update(self, words):
        word_index = 0
        for word in words:
            self.words[word_index].update(word.lower())
            word_index += 1
    
    def generate(self, seed):
        return [word.get_random(seed) for word in self.words]

In [46]:
lovecraft_sentences = lovecraft.sents()
sentences = {}
sentence_count = defaultdict(int)
for tokenized_sentence in lovecraft_sentences:
    sentence_with_tagged_words = nltk.pos_tag(tokenized_sentence)
    
    sentence_words = list(zip(*sentence_with_tagged_words))[0]
    sentence_tags = list(zip(*sentence_with_tagged_words))[1]
    
    sentence_checksum = "-".join(sentence_tags)
    
    if sentence_checksum in sentences:
        sentences[sentence_checksum].update(sentence_words)
    else:
        sentences[sentence_checksum] = Sentence(words=sentence_words, tags=sentence_tags)
    
    sentence_count[sentence_checksum] += 1

In [47]:
total_count = sum(sentence_count.values())
sentence_tags = [_sentence_tags for _sentence_tags in sentences.keys()]
sentence_probabilities = [sentence_count[sentence_tag]/total_count for sentence_tag in sentence_tags]


for i in range(0, 3):
    random_sentence_chose = np.random.multinomial(1, sentence_probabilities)
    random_sentence_index = list(random_sentence_chose).index(1)
    print(sentences[sentence_tags[random_sentence_index]].generate(0))

['the', 'bus', ',', 'rather', 'early', ',', 'rattled', 'in', 'with', 'three', 'passengers', 'somewhat', 'before', 'eight', ',', 'and', 'an', 'evil', '-', 'looking', 'fellow', 'on', 'the', 'sidewalk', 'muttered', 'a', 'few', 'indistinguishable', 'words', 'to', 'the', 'driver', '.']
['there', 'seemed', 'virtually', 'nothing', 'to', 'do', 'to', 'calm', 'them', ',', 'and', 'when', 'nahum', 'opened', 'the', 'stable', 'door', 'they', 'all', 'bolted', 'out', 'like', 'frightened', 'woodland', 'deer', '.']
['there', 'seemed', 'virtually', 'nothing', 'to', 'do', 'to', 'calm', 'them', ',', 'and', 'when', 'nahum', 'opened', 'the', 'stable', 'door', 'they', 'all', 'bolted', 'out', 'like', 'frightened', 'woodland', 'deer', '.']


The problem with that approach is that if the author uses a rich grammar (as it is the case of Lovecraft), not many phrases are gramatically repeated,
so we get many unique tagged sentences as it happens here.

In [59]:
print("{} sentences are available and there are {} unique sentences (almost all)".format(len(sentences), len([s for s, c in sentence_count.items() if c == 1])))

print("Sentences with more than one occurrence:")
for cs, count in sentence_count.items():
    if count > 1:
        print("{}: {} times".format(cs, count))

18893 sentences are available and there are 18811 unique sentences (almost all)
Sentences with more than one occurrence:
DT-NNP: 8 times
PRP-.: 8 times
PRP: 8 times
NN: 56 times
DT-NNP-NNP-NNP-NNP: 8 times
NNP-VBD-VBN-.: 3 times
RB-DT-NN-VBD-.: 2 times
PRP-VBD-DT-NN-.: 3 times
DT-NN-VBD-RB-JJ-.: 2 times
IN-NNP-NNP-NNP-NNP: 2 times
DT-NN-.: 3 times
DT-NNP-NNP-NNP-NNP-NNP: 3 times
:-NN: 2 times
DT-NNP-CC-DT-NNP-NNP-.: 3 times
DT-NNP-CC-DT-NN: 4 times
CD: 25 times
NNP-.: 36 times
PRP-VBD-:: 3 times
NNP-VBD-RB-JJ-.: 2 times
PRP-VBP-TO-PRP-VB-,-VB-RB-VB-RP-NNP-IN-PRP-MD-RB-VB-NNS-:-IN-DT-NNP-PRP-VBP-,-VBP-WDT-MD-IN-NNP-VB-RP-NNP-IN-PRP-,-VB-PRP$-NNP-NNPS-MD-RB-VB-IN-NN-.: 2 times
NNP-IN-DT-NNP-,-IN-DT-NNP-MD-RB-VB-TO-NNP-,-CC-JJ-NN-JJR-IN-PRP-.: 2 times
NNP-.-NNP-.: 3 times
NNP: 4 times
NNP-NNP-.: 6 times
NN-.: 28 times
NN-NNP-.: 5 times
POS-NNP-.: 3 times
CD-.: 2 times
NNP-NN-.: 3 times
EX-VBD-DT-NN-.: 2 times
PRP-VBD-IN-VBZ-:: 2 times
JJ-NNP-NN: 2 times
UH-.: 2 times
DT-VBZ-DT-.: 2 times
