### Imports

In [48]:
import random
from collections import Counter, defaultdict

import nltk
import numpy as np
from nltk import bigrams, trigrams


### Load corpus

In [7]:
# Fetch the 'reuters' corpus package into the local NLTK data directory.
nltk.download('reuters')

[nltk_data] Downloading package reuters to /root/nltk_data...


True

In [52]:
# Fetch the 'punkt' tokenizer package (the download log shows tokenizers/punkt.zip).
# NOTE(review): presumably needed by reuters.sents() further down — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
from nltk.corpus import reuters

### Most common tokens

In [10]:
# Count raw occurrences of every token in the corpus and show the 20 most frequent.
counts = Counter(reuters.words())
print(counts.most_common(n=20))


[('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037), ('vs', 14120), ('-', 13705), ('for', 12785), ('dlrs', 11730), ("'", 11272), ('The', 10968), ('000', 10277), ('1', 9977), ('s', 9298), ('pct', 9093)]


In [12]:
# Token sequence of the whole corpus; this is what normalize() is later called with.
corpus = reuters.words()

### Functions

In [46]:
def normalize(corpus):
    """
    Turn a token sequence into relative frequencies.

    Each distinct token is mapped to (number of occurrences / len(corpus)).
    The result is a Counter with float values; an empty corpus gives an
    empty Counter.
    """
    n_tokens = float(len(corpus))
    occurrences = Counter(corpus)

    # Build the frequency table in one pass, keeping first-seen token order.
    return Counter({token: seen / n_tokens for token, seen in occurrences.items()})

def generate_text_frequency(counts,size=100):
    """
    Generate text with a unigram language model.

    Draws `size` tokens independently, each token chosen with probability
    proportional to its weight in `counts`.

    Parameters
    ----------
    counts : mapping of token -> non-negative weight
        Relative frequencies or raw counts. The weights do not have to sum
        to exactly 1, so float rounding in a normalised table cannot cause
        a draw to be dropped.
    size : int
        Number of tokens to generate.

    Returns
    -------
    list
        Exactly `size` tokens; an empty list when `counts` is empty or
        `size` is not positive.

    Raises
    ------
    ValueError
        If `counts` is non-empty but its weights do not sum to a positive value.
    """
    if not counts or size <= 0:
        return []

    vocabulary = list(counts)
    weights = [counts[word] for word in vocabulary]

    # A degenerate distribution cannot be sampled; fail loudly rather than
    # returning a shorter text than requested.
    if sum(weights) <= 0:
        raise ValueError("counts must contain at least one positive weight")

    # random.choices performs the cumulative-weight draw scaled by the total
    # weight, so every draw yields a token.
    return random.choices(vocabulary, weights=weights, k=size)

### Generate text with frequencies

In [13]:
# Rebinds `counts` (raw occurrence counts so far) to relative frequencies.
counts = normalize(corpus)

In [17]:
# Sanity check: the relative frequencies should add up to ~1 (up to float rounding).
print(sum(counts.values()))

1.0000000000006808


In [47]:
# Sample 100 tokens from the unigram frequency table.
generated_text = generate_text_frequency(counts,100)

In [45]:
# Probability of the generated text under the unigram model.
probs = [counts[word] for word in generated_text]

# The product of this many small probabilities sits at the edge of float64
# range and underflows to 0.0 for longer texts, so also report the
# log-probability, which stays representable.
log_prob = float(np.sum(np.log(probs)))
print("log-probability:", log_prob)

np.prod(probs)

1.6421739111297019e-307

### Bigrams and Trigrams  

In [53]:
# First sentence of the corpus, as a list of tokens.
first_sentence = reuters.sents()[0]
print(first_sentence)

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']


In [54]:
# All pairs of adjacent tokens in the sentence.
print(list(bigrams(first_sentence)))

[('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), ('that', 'the'), ('the', 'row'), ('row', 'could'), ('could', 'inflict'), ('inflict', 'far'), ('far', '-'), ('-', 'reaching'), ('reaching', 'economic'), ('economic', 'damage'), ('damage', ','), (',', 'businessmen'), ('businessmen', 'and'), ('and', 'officials'), ('officials', 'said'), ('said', '.')]


In [55]:
# Same pairs with None padding, so the first and last token also get a
# bigram marking the sentence boundary: (None, first) and (last, None).
print(list(bigrams(first_sentence, pad_left=True, pad_right=True)))

[(None, 'ASIAN'), ('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), ('that', 'the'), ('the', 'row'), ('row', 'could'), ('could', 'inflict'), ('inflict', 'far'), ('far', '-'), ('-', 'reaching'), ('reaching', 'economic'), ('economic', 'damage'), ('damage', ','), (',', 'businessmen'), ('businessmen', 'and'), ('and', 'officials'), ('officials', 'said'), ('said', '.'), ('.', None)]


In [56]:
# All triples of consecutive tokens in the sentence.
print(list(trigrams(first_sentence)))

[('ASIAN', 'EXPORTERS', 'FEAR'), ('EXPORTERS', 'FEAR', 'DAMAGE'), ('FEAR', 'DAMAGE', 'FROM'), ('DAMAGE', 'FROM', 'U'), ('FROM', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.-'), ('S', '.-', 'JAPAN'), ('.-', 'JAPAN', 'RIFT'), ('JAPAN', 'RIFT', 'Mounting'), ('RIFT', 'Mounting', 'trade'), ('Mounting', 'trade', 'friction'), ('trade', 'friction', 'between'), ('friction', 'between', 'the'), ('between', 'the', 'U'), ('the', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.'), ('S', '.', 'And'), ('.', 'And', 'Japan'), ('And', 'Japan', 'has'), ('Japan', 'has', 'raised'), ('has', 'raised', 'fears'), ('raised', 'fears', 'among'), ('fears', 'among', 'many'), ('among', 'many', 'of'), ('many', 'of', 'Asia'), ('of', 'Asia', "'"), ('Asia', "'", 's'), ("'", 's', 'exporting'), ('s', 'exporting', 'nations'), ('exporting', 'nations', 'that'), ('nations', 'that', 'the'), ('that', 'the', 'row'), ('the', 'row', 'could'), ('row', 'could', 'inflict'), ('could', 'inflict', 'far'), ('inflict', 'far', '-'), ('far', '-', 'rea

In [57]:
# Padded trigrams: two None slots precede the first token, so the sentence
# start is represented as (None, None, first).
print(list(trigrams(first_sentence, pad_left=True, pad_right=True)))

[(None, None, 'ASIAN'), (None, 'ASIAN', 'EXPORTERS'), ('ASIAN', 'EXPORTERS', 'FEAR'), ('EXPORTERS', 'FEAR', 'DAMAGE'), ('FEAR', 'DAMAGE', 'FROM'), ('DAMAGE', 'FROM', 'U'), ('FROM', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.-'), ('S', '.-', 'JAPAN'), ('.-', 'JAPAN', 'RIFT'), ('JAPAN', 'RIFT', 'Mounting'), ('RIFT', 'Mounting', 'trade'), ('Mounting', 'trade', 'friction'), ('trade', 'friction', 'between'), ('friction', 'between', 'the'), ('between', 'the', 'U'), ('the', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.'), ('S', '.', 'And'), ('.', 'And', 'Japan'), ('And', 'Japan', 'has'), ('Japan', 'has', 'raised'), ('has', 'raised', 'fears'), ('raised', 'fears', 'among'), ('fears', 'among', 'many'), ('among', 'many', 'of'), ('many', 'of', 'Asia'), ('of', 'Asia', "'"), ('Asia', "'", 's'), ("'", 's', 'exporting'), ('s', 'exporting', 'nations'), ('exporting', 'nations', 'that'), ('nations', 'that', 'the'), ('that', 'the', 'row'), ('the', 'row', 'could'), ('row', 'could', 'inflict'), ('could', 'inflict

In [58]:
# Trigram counts: model[(w1, w2)][w3] is the number of times w3 was seen
# right after the pair (w1, w2). Unseen entries default to 0.
model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence in reuters.sents():
    # Padding on both sides adds None markers for sentence start and end.
    padded_trigrams = trigrams(sentence, pad_left=True, pad_right=True)
    for w1, w2, w3 in padded_trigrams:
        followers = model[(w1, w2)]
        followers[w3] = followers[w3] + 1

In [65]:
# Follower table for the two-token context ("what", "the").
model["what","the"]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'communique': 1,
             'chance': 1,
             'manager': 1,
             'central': 2,
             'announcement': 1,
             'minimum': 1,
             'administration': 1,
             'United': 1,
             'U': 3,
             'decision': 3,
             'Japanese': 1,
             'MPs': 1,
             'differential': 1,
             'British': 2,
             'Paris': 1,
             'state': 1,
             'Buffer': 1,
             'markets': 1,
             'Fed': 2,
             'company': 1,
             'offer': 1,
             'result': 1,
             'pressure': 1,
             'outcome': 1,
             'union': 1,
             'financial': 1,
             'government': 2,
             'parameters': 1,
             'economists': 2,
             'trade': 1,
             'chief': 1,
             'Administration': 1,
             'group': 1,
             'equity': 1,
            

In [61]:
print(model["what", "the"]["economists"]) # how often "economists" follows "what the"
print(model["what", "the"]["fact"]) # unseen continuation: the nested defaultdict returns 0 (and stores the key)
print(model[None, None]["The"]) # how many sentences start with "The"

2
0
8839


In [66]:
# Convert the follower counts of every context into conditional
# probabilities by dividing by that context's total count (in place).
for w1_w2, followers in model.items():
    total_count = float(sum(followers.values()))
    for w3 in followers:
        followers[w3] = followers[w3] / total_count

In [67]:
# Follower table for ("what", "the") after the division by the context total.
model["what", "the"]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'communique': 0.021739130434782608,
             'chance': 0.021739130434782608,
             'manager': 0.021739130434782608,
             'central': 0.043478260869565216,
             'announcement': 0.021739130434782608,
             'minimum': 0.021739130434782608,
             'administration': 0.021739130434782608,
             'United': 0.021739130434782608,
             'U': 0.06521739130434782,
             'decision': 0.06521739130434782,
             'Japanese': 0.021739130434782608,
             'MPs': 0.021739130434782608,
             'differential': 0.021739130434782608,
             'British': 0.043478260869565216,
             'Paris': 0.021739130434782608,
             'state': 0.021739130434782608,
             'Buffer': 0.021739130434782608,
             'markets': 0.021739130434782608,
             'Fed': 0.043478260869565216,
             'company': 0.021739130434782608,
             'offer

In [69]:
print(model["what", "the"]["economists"]) # share of "what the" occurrences followed by "economists"
print(model["what", "the"]["fact"]) # continuation never seen after "what the"
print(model[None, None]["The"]) # share of sentences that start with "The"

0.043478260869565216
0.0
0.16154324146501936


In [72]:
# Generate one sentence from the trigram model, starting from the padded
# start-of-sentence context (None, None).
text = [None, None]

sentence_finished = False

while not sentence_finished:
    # Probabilities of the next token given the last two generated tokens.
    followers = model[tuple(text[-2:])]
    candidates = list(followers)

    # Weighted draw over the candidates. random.choices scales by the total
    # weight, so float rounding never wastes an iteration, and a context with
    # no successors raises instead of looping forever.
    next_token = random.choices(
        candidates, weights=[followers[word] for word in candidates]
    )[0]
    text.append(next_token)

    # Two consecutive None tokens are the end-of-sentence padding.
    sentence_finished = text[-2:] == [None, None]


# Drop the None padding markers explicitly instead of relying on truthiness.
print(' '.join([t for t in text if t is not None]))
 

A further 500 mln dlrs of bids on a number of shares of common stock directly from Cyclops or buy Cyclops Corp to 1 . 66 Richardson Electronics shr 59 cts Net 2 , 194 , 000 vs loss seven cts shr from sale of the U . S .
