In [2]:
import os, re, sys
from collections import Counter, defaultdict, OrderedDict
# Punctuation / special characters handled during normalization. The single
# capture group lets callers re-insert the matched character via \1.
# A raw string is used so the backslashes reach the regex engine unchanged
# instead of being (invalid) Python string escapes.
pat_re = re.compile(r"([.?!;`,—\-'’\"“])")

#### NOTE: 
The Raven is a classic poem by Edgar Allan Poe.
Download the file from 'https://www.dropbox.com/s/2behj1yr7zav7sf/the_raven.txt?dl=0' 
and save it as `corpus/the_raven.txt` (relative to this notebook), which is the path the loading cell reads.

[You can listen to Lou Reed reading the poem: https://open.spotify.com/track/3wJuirpgmeS9gD5azqENaS?si=6ecb5a54231f4846
or by James Earl Jones: https://www.youtube.com/watch?v=WcqPQXqQXzI]

In [8]:
# Read the whole poem into a single string, decoding the file as UTF-8.
fn = 'corpus/the_raven.txt'
with open(fn, mode='r', encoding='utf-8') as fin:
    text = fin.read()

#### Basic text normalization

This can be done in many different ways, achieving different results.
    

In [9]:
# Normalize the raw text: lowercase it, strip punctuation, and leave exactly
# one space between tokens.
text = text.lower()
# Pad special chars with spaces first, so that tokens joined only by
# punctuation (e.g. "word—word") do not fuse once the punctuation is removed.
text = pat_re.sub(r" \1 ", text)
# Remove punctuation and special chars.
text = pat_re.sub('', text)
# Collapse whitespace runs last: removing the padded punctuation above leaves
# double spaces behind, so this step has to come after it.
text = re.sub(r'\s+', ' ', text)

#### Simple Nested Data Structure for a Language Model

The following code block creates a very simple data structure to be used in a language model. Notice the nested structure, and think about what its benefits are.
(Also notice the use of the lambda function.)

In [10]:
def get_vocab(text):
    """Return every maximal run of word characters in ``text``, in order of appearance.

    Duplicates are kept; the result is a plain list of strings.
    """
    return [match.group(0) for match in re.finditer(r'\w+', text)]

# Unigram statistics: word -> frequency, plus the list of distinct words.
vocab_counter = Counter(get_vocab(text))
vocab = list(vocab_counter)

n = 3  # n-gram order
words = text.split()

# ngrams:     "w1 ... wn"    -> count of the full n-gram
# contexts:   "w1 ... wn-1"  -> count of the (n-1)-word prefix of an n-gram
# contexts_d: "w1 ... wn-1"  -> {wn -> count}, i.e. which words follow each context
ngrams = defaultdict(int)
contexts = defaultdict(int)
contexts_d = defaultdict(lambda: defaultdict(int))

# Slide a window of n words over the text and update all three tables.
for start in range(len(words) - n + 1):
    window = words[start:start + n]
    context = ' '.join(window[:-1])
    ngrams[' '.join(window)] += 1
    contexts[context] += 1
    contexts_d[context][window[-1]] += 1

In [11]:
# Counts for one specific context and one specific full n-gram.
for s, table, note in (
    ('quoth the', contexts, '#times s appears as a context'),
    ('quoth the raven', ngrams, '#times s appears as a full ngram'),
):
    print('s=%s: %d\t%s' % (s, table[s], note))

# Most frequent entry of each table, together with its frequency.
for s, table in (
    ('The most frequent ngram is: ', ngrams),
    ('The most frequent context is: ', contexts),
):
    top = max(table, key=table.get)
    print(s, '\'', top, '\'', ', freq:', table[top])


s=quoth the: 5	#times s appears as a context
s=quoth the raven: 5	#times s appears as a full ngram
The most frequent ngram is:  ' my chamber door ' , freq: 8
The most frequent context is:  ' chamber door ' , freq: 10


#### A peek  into the nested structure

In [12]:
# Inspect the inner mapping for a single context: following word -> count.
# NOTE: contexts_d is a defaultdict, so looking up a context that was never
# seen inserts an empty entry for it rather than raising KeyError.
contexts_d['chamber door']

defaultdict(int,
            {'tis': 1,
             'only': 1,
             'some': 1,
             'this': 1,
             'that': 1,
             'perched': 2,
             'bird': 1,
             'with': 1,
             'and': 1})

In [13]:
# Display the entire nested structure: context -> {following word -> count}.
# NOTE(review): this shows every context collected from the text; consider
# displaying only a slice if the output gets too long.
contexts_d

defaultdict(<function __main__.<lambda>()>,
            {'once upon': defaultdict(int, {'a': 1}),
             'upon a': defaultdict(int, {'midnight': 1, 'bust': 1}),
             'a midnight': defaultdict(int, {'dreary': 1}),
             'midnight dreary': defaultdict(int, {'while': 1}),
             'dreary while': defaultdict(int, {'i': 1}),
             'while i': defaultdict(int, {'pondered': 1, 'nodded': 1}),
             'i pondered': defaultdict(int, {'weak': 1}),
             'pondered weak': defaultdict(int, {'and': 1}),
             'weak and': defaultdict(int, {'weary': 1}),
             'and weary': defaultdict(int, {'over': 1}),
             'weary over': defaultdict(int, {'many': 1}),
             'over many': defaultdict(int, {'a': 1}),
             'many a': defaultdict(int, {'quaint': 1, 'flirt': 1}),
             'a quaint': defaultdict(int, {'and': 1}),
             'quaint and': defaultdict(int, {'curious': 1}),
             'and curious': defaultdict(int, {'volum