# Download library and sample corpus

In [1]:
import nltk
# Fetch the corpus and tokenizer data used by the cells below.
# nltk.download reports "already up-to-date" when the package is cached.
nltk.download('reuters')
nltk.download('punkt')
# Recent NLTK releases (>= 3.8.2) load the Punkt tokenizer from the
# 'punkt_tab' resource rather than the pickled 'punkt' package; without it
# nltk.word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from nltk.corpus import reuters, stopwords
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import string

# Process the corpus and obtain a simple model

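How do we process text? First strip the punctuation, then split the sentence into word tokens, and finally group neighbouring tokens into n-grams.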

In [3]:
# Raw example text
sentence = "This is a sample sentence to show what n-grams are."
print("Original sentence:", sentence)

# Build a deletion-only translation table (every punctuation character maps
# to None) and apply it to drop punctuation from the sentence
translator = str.maketrans(dict.fromkeys(string.punctuation))
sentence = str.translate(sentence, translator)
print("Processed sentence:", sentence)

# Split the cleaned sentence into word tokens
tokens = nltk.word_tokenize(sentence)
print("Tokenized sentence:", tokens)

Original sentence: This is a sample sentence to show what n-grams are.
Processed sentence: This is a sample sentence to show what ngrams are
Tokenized sentence: ['This', 'is', 'a', 'sample', 'sentence', 'to', 'show', 'what', 'ngrams', 'are']


In [4]:
# Print every bigram and trigram of `tokens`, one n-gram per line.
# The trigram header carries a leading newline so the two listings are
# separated by a blank line.
for header, grams in (("Bigrams:", bigrams(tokens)),
                      ("\nTrigrams:", trigrams(tokens))):
    print(header)
    for gram in grams:
        print("(", ", ".join(gram), ")")

Bigrams:
( This, is )
( is, a )
( a, sample )
( sample, sentence )
( sentence, to )
( to, show )
( show, what )
( what, ngrams )
( ngrams, are )

Trigrams:
( This, is, a )
( is, a, sample )
( a, sample, sentence )
( sample, sentence, to )
( sentence, to, show )
( to, show, what )
( show, what, ngrams )
( what, ngrams, are )


In [17]:
# Trigram model: maps a context pair (w1, w2) to a dict {w3: weight}.
# Weights start out as raw counts and are normalised to probabilities below.
model = defaultdict(lambda: defaultdict(int))

# Count co-occurrences. For each trigram the first two words form the context
# and the third word is the prediction target. Non-alphanumeric tokens are
# dropped, and padding on both sides adds None markers at sentence boundaries.
for corpus_sentence in reuters.sents():
    words = [token for token in corpus_sentence if token.isalnum()]
    for first, second, third in trigrams(words, pad_right=True, pad_left=True):
        model[(first, second)][third] += 1

# Turn the counts into probabilities: within each context, divide every count
# by the context's total so the weights sum to 1.
for followers in model.values():
    context_total = float(sum(followers.values()))
    for candidate in followers:
        followers[candidate] /= context_total

In [18]:
# Next-word probabilities for the context ("the", "home"), shown as a plain dict
dict(model[("the", "home")])

{'and': 0.07142857142857142,
 'healthcare': 0.07142857142857142,
 'market': 0.7142857142857143,
 'mortgage': 0.07142857142857142,
 'shopping': 0.07142857142857142}

In [24]:
# Next-word probabilities for the context ("The", "state"), shown as a plain dict
dict(model[("The", "state")])

{'government': 0.043478260869565216,
 'must': 0.08695652173913043,
 'of': 0.043478260869565216,
 'offers': 0.043478260869565216,
 'oil': 0.17391304347826086,
 'owned': 0.34782608695652173,
 'procurement': 0.043478260869565216,
 'run': 0.043478260869565216,
 's': 0.043478260869565216,
 'showing': 0.043478260869565216,
 'then': 0.043478260869565216,
 'treasury': 0.043478260869565216}

In [8]:
import random
def predict(text, prob=-1):
    """Continue ``text`` word by word using the global trigram ``model``.

    The last two words generated so far form the context; the next word is
    chosen by walking that context's candidates and accumulating their
    probabilities until the running sum reaches a threshold.

    Args:
        text: Prompt string. It is tokenized with ``nltk.word_tokenize`` and
            non-alphanumeric tokens are discarded.
        prob: Threshold in [0, 1] used for every step. The default ``-1``
            draws a fresh ``random.random()`` threshold at each step; any
            other value makes the generation deterministic.

    Returns:
        The prompt words followed by the generated words, joined by single
        spaces. Sentence-boundary ``None`` markers are left out.

    Notes:
        Generation stops when the model emits two consecutive ``None``
        end markers, or when the current context was never seen in the
        corpus (the original implementation looped forever in that case).
        NOTE(review): with a fixed ``prob`` the deterministic walk could
        presumably revisit a context and cycle; confirm before relying on it.
    """
    words = [word for word in nltk.word_tokenize(text) if word.isalnum()]

    while True:
        # The model was built with left padding, so a prompt shorter than two
        # words is completed with None markers to form a valid context.
        context = tuple(([None, None] + words)[-2:])

        # .get() avoids inserting an empty entry into the defaultdict-based
        # model whenever an unseen context is queried.
        candidates = model.get(context)
        if not candidates:
            # Unseen context: there is nothing to sample from, so stop.
            break

        threshold = random.random() if prob == -1 else prob
        accumulator = 0.0
        chosen = None
        for word, probability in candidates.items():
            accumulator += probability
            chosen = word
            if accumulator >= threshold:
                break
        # If rounding (or prob > 1) keeps the running sum below the threshold,
        # `chosen` is the last candidate, so every step still makes progress.
        words.append(chosen)

        # Two consecutive None markers mean the sentence is finished.
        if words[-2:] == [None, None]:
            break

    return ' '.join([t for t in words if t])

In [9]:
# Generate a continuation for a three-word prompt; with the default prob the
# threshold is random at every step, so the output varies between runs.
predict("The market has")

'The market has been reluctant to provide cash incentives so a change in the size of the micro mini and mainframe computer markets'