In [1]:
from nltk.corpus import reuters
from collections import Counter

In [2]:
counts = Counter(reuters.words())

In [3]:
total_count = len(reuters.words())
total_count

1720901

#### The most common 20 words are ...

In [4]:
print (counts.most_common(n=20))

[('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037), ('vs', 14120), ('-', 13705), ('for', 12785), ('dlrs', 11730), ("'", 11272), ('The', 10968), ('000', 10277), ('1', 9977), ('s', 9298), ('pct', 9093)]


In [5]:
# Compute the frequencies
for word in counts:
    counts[word] /= float(total_count)

In [6]:
# print a few dict items

i= 1
for k,v in counts.items():
    if i> 5:
        break
        
    print(k, v)
    i +=1

ASIAN 6.97309142129617e-06
EXPORTERS 2.673018378163532e-05
FEAR 1.1621819035493616e-06
DAMAGE 7.554182373070851e-06
FROM 0.00012086691796913361


In [7]:
# The frequencies should add up to 1
print (sum(counts.values()))  

1.0000000000006808


In [9]:
import random

In [10]:
# Generate 100 words of language
text = []
 
for _ in range(100):
    r = random.random()
    accumulator = .0
 
    for word, freq in counts.items():
        accumulator += freq
 
        if accumulator >= r:
            text.append(word)
            break

In [10]:
print (' '.join(text))

. . vs have official billion earned pct to rejected a Beryl at U CYCLOPS period imposing Association OECD in . to a NET bond , which S silver told , room to for falls lt Broadcasting s in dlrs WORKING target . earnings February tax , vs 10 under . . 147 said indicators who dwt 4TH they trimmed growth new affected shareholders said Securities 216 profit affiliated Year suspend & Net 1987 Bristol the . be vs s were from were Vertex . grounds vs Brasil & LEAD five mln compete rates KLM by risen at ; 6


it’s not the most expressive piece of content.

The produced text follows only the frequency rules of the language and nothing more.

we know the probability of all the words, we can compute the probability of a text. 

Because the words have been generated independently we just need to multiply all of the probabilities together:

In [11]:
# The probability of a text
from operator import mul
from functools import reduce

print (reduce(mul, [counts[w] for w in text]), 1.0) 
 

2.06677302923e-313 1.0


## Bigrams and Trigrams
make sure the new word goes well after the last word in the sequence (bigram model) or the last two words (trigram model).

“Bigram” is a fancy name for 2 consecutive words while trigram is a triplet of consecutive words. 

In [12]:
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

In [13]:
first_sentence = reuters.sents()[0]
print (first_sentence)

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']


In [14]:
# Get the bigrams
print (list(bigrams(first_sentence)))

[('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), ('that', 'the'), ('the', 'row'), ('row', 'could'), ('could', 'inflict'), ('inflict', 'far'), ('far', '-'), ('-', 'reaching'), ('reaching', 'economic'), ('economic', 'damage'), ('damage', ','), (',', 'businessmen'), ('businessmen', 'and'), ('and', 'officials'), ('officials', 'said'), ('said', '.')]


In [15]:
# Get the padded bigrams
print (list(bigrams(first_sentence, pad_left=True, pad_right=True)) )

[(None, 'ASIAN'), ('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), ('that', 'the'), ('the', 'row'), ('row', 'could'), ('could', 'inflict'), ('inflict', 'far'), ('far', '-'), ('-', 'reaching'), ('reaching', 'economic'), ('economic', 'damage'), ('damage', ','), (',', 'businessmen'), ('businessmen', 'and'), ('and', 'officials'), ('officials', 'said'), ('said', '.'), ('.', None)]


In [16]:
# Get the trigrams
print (list(trigrams(first_sentence)))

[('ASIAN', 'EXPORTERS', 'FEAR'), ('EXPORTERS', 'FEAR', 'DAMAGE'), ('FEAR', 'DAMAGE', 'FROM'), ('DAMAGE', 'FROM', 'U'), ('FROM', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.-'), ('S', '.-', 'JAPAN'), ('.-', 'JAPAN', 'RIFT'), ('JAPAN', 'RIFT', 'Mounting'), ('RIFT', 'Mounting', 'trade'), ('Mounting', 'trade', 'friction'), ('trade', 'friction', 'between'), ('friction', 'between', 'the'), ('between', 'the', 'U'), ('the', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.'), ('S', '.', 'And'), ('.', 'And', 'Japan'), ('And', 'Japan', 'has'), ('Japan', 'has', 'raised'), ('has', 'raised', 'fears'), ('raised', 'fears', 'among'), ('fears', 'among', 'many'), ('among', 'many', 'of'), ('many', 'of', 'Asia'), ('of', 'Asia', "'"), ('Asia', "'", 's'), ("'", 's', 'exporting'), ('s', 'exporting', 'nations'), ('exporting', 'nations', 'that'), ('nations', 'that', 'the'), ('that', 'the', 'row'), ('the', 'row', 'could'), ('row', 'could', 'inflict'), ('could', 'inflict', 'far'), ('inflict', 'far', '-'), ('far', '-', 'rea

In [17]:
# Get the padded trigrams
print (list(trigrams(first_sentence, pad_left=True, pad_right=True)))

[(None, None, 'ASIAN'), (None, 'ASIAN', 'EXPORTERS'), ('ASIAN', 'EXPORTERS', 'FEAR'), ('EXPORTERS', 'FEAR', 'DAMAGE'), ('FEAR', 'DAMAGE', 'FROM'), ('DAMAGE', 'FROM', 'U'), ('FROM', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.-'), ('S', '.-', 'JAPAN'), ('.-', 'JAPAN', 'RIFT'), ('JAPAN', 'RIFT', 'Mounting'), ('RIFT', 'Mounting', 'trade'), ('Mounting', 'trade', 'friction'), ('trade', 'friction', 'between'), ('friction', 'between', 'the'), ('between', 'the', 'U'), ('the', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.'), ('S', '.', 'And'), ('.', 'And', 'Japan'), ('And', 'Japan', 'has'), ('Japan', 'has', 'raised'), ('has', 'raised', 'fears'), ('raised', 'fears', 'among'), ('fears', 'among', 'many'), ('among', 'many', 'of'), ('many', 'of', 'Asia'), ('of', 'Asia', "'"), ('Asia', "'", 's'), ("'", 's', 'exporting'), ('s', 'exporting', 'nations'), ('exporting', 'nations', 'that'), ('nations', 'that', 'the'), ('that', 'the', 'row'), ('the', 'row', 'could'), ('row', 'could', 'inflict'), ('could', 'inflict

build a trigram model from the Reuters corpus.

In [18]:
model = defaultdict(lambda: defaultdict(lambda: 0))

In [19]:
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1

In [20]:
i = 1 
for k, v in model.items():
    if i> 2:
        break
    
    print(k, ':', v)
    print('\n')
    
    i+=1

(None, None) : defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001B2D472FEA0>, {'ASIAN': 4, 'They': 446, 'But': 1054, 'The': 8839, 'Unofficial': 1, '"': 3589, 'In': 1380, 'Threat': 2, 'Taiwan': 38, 'Retaliation': 3, 'A': 764, 'Last': 202, 'Much': 8, 'He': 1586, 'Meanwhile': 41, 'Japan': 111, 'Deputy': 8, 'CHINA': 50, 'It': 1768, 'JAPAN': 164, 'MITI': 12, 'Nuclear': 1, 'THAI': 19, 'Thailand': 13, 'Export': 12, 'Products': 4, 'INDONESIA': 21, 'Prices': 43, 'Harahap': 2, 'Indonesia': 34, 'Indonesian': 9, 'AUSTRALIAN': 37, 'Cargo': 1, 'INDONESIAN': 12, 'Trading': 24, 'Physical': 2, 'Rubber': 3, 'Robusta': 2, 'No': 92, 'Trade': 52, 'Nainggolan': 1, 'Officials': 58, 'Transactions': 2, 'Total': 111, 'SRI': 9, 'WESTERN': 8, 'Bundey': 1, 'Annual': 5, 'SUMITOMO': 4, 'Osaka': 1, 'Some': 177, 'Others': 11, 'Now': 21, 'Among': 57, 'Regulations': 1, 'We': 73, 'Komatsu': 1, 'Article': 2, 'That': 104, 'Until': 15, 'Like': 6, 'SUBROTO': 3, 'Asked': 127, 'BUNDESBANK': 24, 'Banks': 34, 'Dealers'

In [23]:
print (model["what", "the"]["economists"])      # "economists" follows "what the" 2 times
print (model["what", "the"]["nonexistingword"]) # 0 times
print (model[None, None]["The"])                # 8839 sentences start with "The"

2
0
8839


In [24]:
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

In [25]:
print (model["what", "the"]["economists"])      
print (model["what", "the"]["nonexistingword"]) 
print (model[None, None]["The"])                

0.043478260869565216
0.0
0.16154324146501936


Let’s generate some text:

In [26]:
import random
 
text = [None, None]
 
sentence_finished = False
 
while not sentence_finished:
    r = random.random()
    accumulator = .0
 
    for word in model[tuple(text[-2:])].keys():
        accumulator += model[tuple(text[-2:])][word]
 
        if accumulator >= r:
            text.append(word)
            break
 
    if text[-2:] == [None, None]:
        sentence_finished = True
 
print (' '.join([t for t in text if t]))

A jury trial has been among the weaker dollar .
