# Character n-gram models

First, we need some training data.

In [1]:
import nltk
from nltk.corpus import gutenberg
from ngram import NGramModel
import numpy as np

nltk.download('gutenberg')

print("Available books:", gutenberg.fileids())

Available books: ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/fredrik/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [3]:
fileids = gutenberg.fileids()[:3]

def make_unigram_models(fileid):
    from nltk.corpus import gutenberg
    return NGramModel(gutenberg.words(fileid), 1)
models = list(map(make_unigram_models, fileids))

print("Created %i models" % len(models))

for fid, m in zip(fileids, models):
    print(gutenberg.words(fid), "\t", m)

Created 3 models
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...] 	 1-gram model with 7811 unique keys
['[', 'Persuasion', 'by', 'Jane', 'Austen', '1818', ...] 	 1-gram model with 6132 unique keys
['[', 'Sense', 'and', 'Sensibility', 'by', 'Jane', ...] 	 1-gram model with 6833 unique keys


The set of all english words in the corpus will also come in handy.

In [4]:
def word_isalpha(word):
    for c in word:
        if not c.isalpha():
            return False
    return True

english_words = set()
for m in models:
    words = [k[0].lower() for k in list(m.keys())]
    english_words.update(set([w for w in words if word_isalpha(w)]))
    print("We have %i english words so far" % len(english_words))

We have 7079 english words so far
We have 8824 english words so far
We have 10294 english words so far


In [5]:
from random import choices
print(choices(list(english_words), k=100))


['undressed', 'congratulatory', 'expedient', 'stilton', 'admire', 'ought', 'escorted', 'eight', 'serviceable', 'gallantry', 'favourites', 'knightley', 'untowardly', 'properer', 'swiftly', 'established', 'provide', 'acknowledge', 'premeditated', 'drove', 'proving', 'conundrums', 'disappearing', 'volumes', 'reliance', 'squire', 'song', 'substantial', 'certain', 'suspecting', 'delicately', 'tis', 'crosser', 'acts', 'minutes', 'compliments', 'wedging', 'implicit', 'defies', 'newspaper', 'blunders', 'counter', 'view', 'captivating', 'varied', 'dr', 'variance', 'lavish', 'livings', 'incurring', 'marking', 'enjoy', 'consulting', 'wronged', 'memorandum', 'stammered', 'commonplace', 'peculiar', 'apart', 'courtland', 'obligations', 'collar', 'marmion', 'unwillingness', 'relating', 'madam', 'sharing', 'afforded', 'vacancies', 'company', 'measure', 'incurring', 'nash', 'passed', 'behaved', 'headaches', 'mine', 'mistake', 'medals', 'hasty', 'warmly', 'tyrannic', 'regulate', 'sharer', 'returned', 't

Let's clean up some Austen books as training data for character models

In [6]:
austen_text = [gutenberg.raw(fid) for fid in ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt']]
print(austen_text[0][:600])

def generate_alphabet(alpha, omega):
    """Set of the english alphabet"""
    return set([chr(i) for i in range(ord(alpha), ord(omega)+1)]) 

def clean_text(text, allowed):
    ret = text.lower()
    strip = set(ret).difference(allowed)
    if " " in allowed:
        for s in strip:
            if s in ['\n', '\t']:
                ret = ret.replace(s, " ")
            else:
                ret = ret.replace(s, "")
    else:
        for s in strip:
            ret = ret.replace(s, "")
    return ret

alphabet = generate_alphabet('a', 'z') # Set of the english alphabet
allowed_characters = set([' '])
allowed_characters.update(alphabet)
for i in range(len(austen_text)):
    austen_text[i] = clean_text(austen_text[i], allowed_characters)

print("---")
print(austen_text[0][:600])
print("%i characters in training data" % np.sum([len(t) for t in austen_text]))

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died too long ago for her to have more than an indistinct
remembrance of her caresses; and her place had b
---
emma by jane austen   volume i  chapter i   emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her  she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sister

With the data cleaned, we are ready to create some character ngram models. 

In [12]:
#spark_texts = spark.sparkContext.parallelize(austen_text)
from multiprocessing import Pool

with Pool() as p:
unigram_model = list(map(lambda data: NGramModel(list(data), 1), austen_text)))

TypeError: reduce() arg 2 must support iteration

In [7]:
print(unigram_model)
bigram_model = spark_texts.map(lambda data: NGramModel(list(data), 2)).reduce(lambda a, b: a.union(b))
print(bigram_model)
trigram_model = spark_texts.map(lambda data: NGramModel(list(data), 3)).reduce(lambda a, b: a.union(b))
print(trigram_model)
quadgram_model = spark_texts.map(lambda data: NGramModel(list(data), 4)).reduce(lambda a, b: a.union(b))
print(quadgram_model)

NameError: name 'spark' is not defined

In [None]:
print("unigram:", "".join(unigram_model.predict_sequence(90)))
print()
print("bigram:", "".join(bigram_model.predict_sequence(90)))
print()
print("trigram:", "".join(trigram_model.predict_sequence(90)))
print()
print("quadgram:", "".join(trigram_model.predict_sequence(90)))

We can now find the most common characters in this english text and their probabilities (from relative frequencies).

In [None]:
from ngram import ordered_ngrams
for unigram, prob in list(ordered_ngrams(unigram_model)):
    print("%s - %.5f" % (unigram, prob))