In [1]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [3]:
# Toy corpus: two already-tokenized "sentences" reused by the padding / n-gram demos below.
text = [
    ["a", "b", "c"],
    ["a", "c", "d", "c", "e", "f"],
]
# Unpack the bigram iterator into a list so the notebook displays the pairs.
[*bigrams(text[0])]

[('a', 'b'), ('b', 'c')]

In [4]:
# Slide a three-token window over the second toy sentence.
[*ngrams(text[1], 3)]

[('a', 'c', 'd'), ('c', 'd', 'c'), ('d', 'c', 'e'), ('c', 'e', 'f')]

In [6]:
from nltk.util import pad_sequence
# Pad both ends of the first sentence for a bigram model (n=2).
[
    *pad_sequence(
        text[0],
        n=2,
        pad_left=True,
        left_pad_symbol="<s>",
        pad_right=True,
        right_pad_symbol="</s>",
    )
]

['<s>', 'a', 'b', 'c', '</s>']

In [7]:
# Keep the padded sentence as a list so it can be inspected and re-used.
padded_sent = [
    *pad_sequence(
        text[0],
        n=2,
        pad_left=True,
        left_pad_symbol="<s>",
        pad_right=True,
        right_pad_symbol="</s>",
    )
]
# Bigrams over the padded sentence now include the sentence-boundary markers.
[*bigrams(padded_sent)]

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

In [8]:
# Same padding call as for bigrams, but with n=3 (trigram order).
[
    *pad_sequence(
        text[0],
        n=3,
        pad_left=True,
        left_pad_symbol="<s>",
        pad_right=True,
        right_pad_symbol="</s>",
    )
]

['<s>', '<s>', 'a', 'b', 'c', '</s>', '</s>']

In [9]:
# Re-pad the first sentence for trigram order and keep it as a list.
padded_sent = [
    *pad_sequence(
        text[0],
        n=3,
        pad_left=True,
        left_pad_symbol="<s>",
        pad_right=True,
        right_pad_symbol="</s>",
    )
]
# Trigrams over the padded sentence.
[*ngrams(padded_sent, 3)]

[('<s>', '<s>', 'a'),
 ('<s>', 'a', 'b'),
 ('a', 'b', 'c'),
 ('b', 'c', '</s>'),
 ('c', '</s>', '</s>')]

In [10]:
from nltk.lm.preprocessing import pad_both_ends
# Shorthand helper: pads both ends of the sentence in a single call.
[*pad_both_ends(text[0], n=2)]

['<s>', 'a', 'b', 'c', '</s>']

In [11]:
# Bigrams computed directly over the padded token stream (no intermediate list).
[*ngrams(pad_both_ends(text[0], n=2), 2)]

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

In [13]:
from nltk.util import everygrams
# Materialise the padded sentence first so it can be fed to everygrams.
padded_bigrams = [*pad_both_ends(text[0], n=2)]
# All n-grams up to length 2 (unigrams and bigrams) of the padded sentence.
[*everygrams(padded_bigrams, max_len=2)]

[('<s>',),
 ('<s>', 'a'),
 ('a',),
 ('a', 'b'),
 ('b',),
 ('b', 'c'),
 ('c',),
 ('c', '</s>'),
 ('</s>',)]

In [14]:
from nltk.lm.preprocessing import flatten
# Pad every sentence, then chain the padded sentences into one flat token list.
[
    *flatten(
        pad_both_ends(sentence, n=2)
        for sentence in text
    )
]

['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In [15]:
from nltk.lm.preprocessing import padded_everygram_pipeline
# One call that prepares both model inputs for an order-2 model over the toy corpus:
# `train` (per-sentence n-grams) and `vocab` (the padded token stream).
train, vocab = padded_everygram_pipeline(2, text)

In [17]:
# Build the pipeline again under descriptive names so both outputs can be inspected.
training_ngrams, padded_sentences = padded_everygram_pipeline(2, text)

# Each item of training_ngrams is itself an iterable of n-grams for one sentence;
# print one sentence per paragraph.
for sentence_ngrams in training_ngrams:
    print(list(sentence_ngrams), end="\n\n")

print("#######")
# Second output: the flat stream of padded tokens across all sentences.
[*padded_sentences]

[('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',), ('c', '</s>'), ('</s>',)]

[('<s>',), ('<s>', 'a'), ('a',), ('a', 'c'), ('c',), ('c', 'd'), ('d',), ('d', 'c'), ('c',), ('c', 'e'), ('e',), ('e', 'f'), ('f',), ('f', '</s>'), ('</s>',)]

#######


['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In [19]:
# Prefer NLTK's standard tokenizers; fall back to a regex sentence splitter plus
# ToktokTokenizer when they cannot be used in this environment.
try:
    from nltk import word_tokenize, sent_tokenize
    # Smoke test: run both tokenizers once so that a missing tokenizer resource
    # fails here, inside the try, rather than in a later cell.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except (ImportError, LookupError):
    # Only the expected failures are caught (names not importable, or NLTK's
    # LookupError for tokenizer data that is not installed). A bare `except:`
    # would also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    import re
    from nltk.tokenize import ToktokTokenizer

    def sent_tokenize(x):
        """Split `x` into sentences with a simple regex heuristic.

        Splits on runs of spaces that follow a '.' or '?' and precede an
        uppercase letter. Returns a list of sentence strings.
        """
        return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)

    toktok = ToktokTokenizer()
    word_tokenize = toktok.tokenize

In [20]:
import os
import requests
import io

# Local cache for the corpus, so the network is only needed on the first run.
corpus_path = 'language-never-random.txt'

# NOTE: from here on `text` is the raw corpus string; it replaces the toy
# list-of-token-lists that the earlier cells bound to the same name.
if os.path.isfile(corpus_path):
    with io.open(corpus_path, encoding='utf8') as fin:
        text = fin.read()
else:
    url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors; otherwise an error page would be decoded and
    # written to the cache file, silently poisoning every later run.
    response.raise_for_status()
    text = response.content.decode('utf8')
    with io.open(corpus_path, 'w', encoding='utf8') as fout:
        fout.write(text)

In [21]:
# Split the corpus into sentences, tokenize each one, and lowercase every token.
tokenized_text = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]

In [22]:
# Inspect the first tokenized sentence (lowercased tokens, punctuation kept as separate tokens).
tokenized_text[0]

['language',
 'is',
 'never',
 ',',
 'ever',
 ',',
 'ever',
 ',',
 'random',
 'adam',
 'kilgarriff',
 'abstract',
 'language',
 'users',
 'never',
 'choose',
 'words',
 'randomly',
 ',',
 'and',
 'language',
 'is',
 'essentially',
 'non-random',
 '.']

In [23]:
# Preview the first 500 characters of the raw corpus string.
print(text[:500])

                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish 


In [27]:
# Highest n-gram order used for both the preprocessing pipeline and the model below.
n=3
# Prepare the training n-grams and the padded token stream from the tokenized corpus.
# NOTE(review): both results look like one-shot iterators (the earlier demo had to wrap the
# same pipeline's outputs in list(...)); re-run this cell before fitting again — confirm.
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

In [28]:
from nltk.lm import MLE
# Create an unfitted MLE language model of order n.
model = MLE(n)

In [29]:
# Before fitting, the model's vocabulary is empty (the cell displays 0).
len(model.vocab)

0

In [31]:
# Fit the model on the prepared n-grams, using the padded token stream as vocabulary source.
# NOTE(review): presumably this consumes train_data / padded_sents; re-running this cell
# without re-running the pipeline cell would then fit on exhausted iterators — confirm.
model.fit(train_data, padded_sents)
# After fitting, the vocabulary summary reports its cutoff, unknown label and item count.
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 1391 items>


In [32]:
# Look up a sentence taken from the training data: every token comes back unchanged.
print(model.vocab.lookup(tokenized_text[0]))

('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')


In [33]:
# Look up a sentence containing a word not seen in training ("lah"): it is mapped to '<UNK>'.
print(model.vocab.lookup("language is never random lah .".split()))

('language', 'is', 'never', 'random', '<UNK>', '.')


In [34]:
# Summary of the fitted model's n-gram counter (number of orders and total n-grams).
print(model.counts)

<NgramCounter with 3 ngram orders and 19611 ngrams>


In [35]:
# Index the counter with a single word: presumably the unigram count of "language".
model.counts["language"]

25

In [36]:
# Index with a one-word context list, then a word: presumably how often "is" follows "language".
model.counts[["language"]]["is"]

11

In [37]:
# Two-word context, then a word: presumably how often "never" follows "language is".
model.counts[["language", "is"]]["never"]

7

In [38]:
# Score a word with no context; presumably the unigram relative frequency under MLE — confirm.
model.score("language")

0.003691671588895452