### Bag of Words (BoW)

In [1]:
from nltk.corpus import reuters
from collections import Counter

# Request the corpus word view once and use it for both the per-word counts
# and the total number of tokens.
words = reuters.words()
counts = Counter(words)
total_count = len(words)

# The 20 most common words are:
print(counts.most_common(n=20))

[('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037), ('vs', 14120), ('-', 13705), ('for', 12785), ('dlrs', 11730), ("'", 11272), ('The', 10968), ('000', 10277), ('1', 9977), ('s', 9298), ('pct', 9093)]


In [2]:
# most_common(1) returns a one-element list holding the (token, count) pair
# of the most frequent token; keep its count.
a = counts.most_common(1)
top_token, biggest_frequency = a[0]

In [3]:
# Compute the relative frequency of every word: its count divided by the
# total number of tokens in the corpus.
denominator = float(total_count)
freq = {token: occurrences / denominator for token, occurrences in counts.items()}

# The first two numbers must match (both are the corpus size) ...
print(total_count)
print(sum(counts.values()))
# ... and the relative frequencies must add up to 1 (up to float rounding).
print(sum(freq.values()))

1720901
1720901
1.0000000000006808


In [4]:
import random

# Generate 100 words of language by sampling words independently, each with
# probability proportional to its count in the corpus (unigram model).
text = []

# Corpus size, derived from `counts` itself so this cell keeps working even if
# the `total_count` global has been reassigned by another cell.
n_tokens = sum(counts.values())

for _ in range(100):
    # randint includes both bounds, so draw a token position in 1..n_tokens:
    # every word then owns exactly `count` of the n_tokens positions.
    # (Starting at 0 would give the first word one extra position.)
    r = random.randint(1, n_tokens)
    accumulator = 0

    # Walk the cumulative counts until the drawn position is covered.
    # The loop variable is named `count` (not `freq`) so the `freq`
    # dictionary of relative frequencies is not overwritten.
    for word, count in counts.items():
        accumulator += count

        if accumulator >= r:
            text.append(word)
            break

print(' '.join(text))

around ' ball some > ; has lt / exports markets eight 908 bond 50 with TRADE and cts U of working . of Anheuser roughly no Witter . 9 31 . de the committee mln cents Secretary or now it the residual 1986 6 and it preparing will 542 NOTE , national reinstate he of COMMERCIAL rights as problem to > big 3RD 86 72 imbalance world . consistent operations . think A interim . 2 have / matter keep lend by rise QTR Enhancement Shr closing and 88 50 Union of Grumman INTERCEP traders 06 . , long


### n-grams

In [5]:
import nltk
# Fetch the 'punkt' package into the local nltk_data directory; the call
# reports when the package is already up to date and its return value is
# displayed as the cell result.
# NOTE(review): presumably required by reuters.sents() used further down — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/gustavomoura/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
def print_phrase(obj):
    """Print the phrase formed by the first element of every n-gram.

    Parameters
    ----------
    obj : iterable of tuples
        N-grams, e.g. a list of bigrams or trigrams. Only the first item of
        each tuple is used.

    Notes
    -----
    ``None`` entries (the padding symbols produced by ``pad_left=True`` /
    ``pad_right=True``) are skipped instead of breaking ``str.join``, and an
    empty input prints an empty line instead of raising ``IndexError``.
    """
    print(' '.join(gram[0] for gram in obj if gram[0] is not None))

In [7]:
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
 
# First sentence of the corpus, as a list of tokens
first_sentence = reuters.sents()[0]

# Plain and padded bigrams / trigrams of that sentence.
# Padding adds None markers before the first and after the last token.
bi = list(bigrams(first_sentence))
pad_bi = list(bigrams(first_sentence, pad_left=True, pad_right=True))
tri = list(trigrams(first_sentence))
pad_tri = list(trigrams(first_sentence, pad_left=True, pad_right=True))

# Show the sentence, then a short preview of each n-gram list; every preview
# is preceded by the same blank-line separator.
print(' '.join(first_sentence))
for preview in (bi[:5], pad_bi[:5], tri[:3], pad_tri[:3]):
    print('\n'*2)
    print(preview)

ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPAN RIFT Mounting trade friction between the U . S . And Japan has raised fears among many of Asia ' s exporting nations that the row could inflict far - reaching economic damage , businessmen and officials said .



[('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U')]



[(None, 'ASIAN'), ('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM')]



[('ASIAN', 'EXPORTERS', 'FEAR'), ('EXPORTERS', 'FEAR', 'DAMAGE'), ('FEAR', 'DAMAGE', 'FROM')]



[(None, None, 'ASIAN'), (None, 'ASIAN', 'EXPORTERS'), ('ASIAN', 'EXPORTERS', 'FEAR')]


In [8]:
# Trigram model: model[(w1, w2)][w3] first holds the number of times w3
# follows the pair "w1 w2", and is then normalised into P(w3 | w1, w2).
model = defaultdict(lambda: defaultdict(lambda: 0))

# Sentences are padded with None on both sides, so (None, None) is the
# context of a sentence's first word and None marks the end of a sentence.
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1


# The unseen word is queried with .get() so the lookup does not insert a
# phantom zero-count entry into the nested defaultdict.
print(model["what", "the"]["economists"]) # "economists" follows "what the" 2 times
print(model["what", "the"].get("nonexistingword", 0)) # 0 times
print(model[None, None]["The"]) # 8839 sentences start with "The"

# Let's transform the counts to probabilities.
# The per-context total gets its own name so the corpus-wide `total_count`
# global used by the bag-of-words cells is not overwritten.
for continuations in model.values():
    context_total = float(sum(continuations.values()))
    for w3 in continuations:
        continuations[w3] /= context_total

print(model["what", "the"]["economists"]) # 0.0434782608696
print(model["what", "the"].get("nonexistingword", 0.0)) # 0.0
print(model[None, None]["The"]) # 0.161543241465

2
0
8839
0.043478260869565216
0.0
0.16154324146501936


In [9]:
import random


# Start from the left padding: (None, None) is the context of a sentence's
# first word.
text = [None, None]
sentence_finished = False

while not sentence_finished:
    # Draw a threshold in [0, 1) and walk the cumulative distribution of the
    # words that can follow the last two tokens until it is reached.
    threshold = random.random()
    context = tuple(text[-2:])
    running_total = .0

    for candidate, probability in model[context].items():
        running_total += probability

        if running_total >= threshold:
            text.append(candidate)
            break

    # Two trailing None tokens are the right padding, i.e. the sentence ended.
    sentence_finished = text[-2:] == [None, None]

# Drop the padding markers before printing.
print(' '.join(filter(None, text)))
 

Goodrich said Moore is still waiting for responses from creditors foreclosing on the company under a 1985 farm bill is to wean it ."


In [10]:
def generate_sentence(model):
    """Sample one sentence from a trigram model and compute its probability.

    Parameters
    ----------
    model : mapping
        Maps a context ``(w1, w2)`` to a mapping ``{w3: P(w3 | w1, w2)}``.
        ``None`` is the padding symbol: ``(None, None)`` is the context of the
        first word and a ``None`` continuation marks the end of the sentence.

    Returns
    -------
    tokens : list
        The sampled tokens, including the ``None`` padding markers.
    probability : float
        Product of the conditional probabilities of every sampled token.
    """
    tokens = [None, None]
    probability = 1.0  # <- Init probability

    while True:
        # .get() avoids inserting an empty entry for an unknown context when
        # the model is a defaultdict.
        continuations = model.get(tuple(tokens[-2:]))
        if not continuations:
            # No known continuation for this context: stop instead of
            # spinning forever.
            break

        r = random.random()
        accumulator = .0

        # Walk the cumulative distribution until the drawn threshold is reached.
        for word, p in continuations.items():
            accumulator += p

            if accumulator >= r:
                probability *= p  # <- Update the probability with the conditional probability of the new word
                tokens.append(word)
                break

        # Two trailing None tokens are the right padding: the sentence ended.
        if tokens[-2:] == [None, None]:
            break

    return tokens, probability


text, prob = generate_sentence(model)

print("Probability of text=", prob)  # <- Print the probability of the text
print(' '.join([t for t in text if t]))

Probability of text= 1.9880698209532491e-60
MEXICAN CATTLE IMPORTS TO BE RENEGOTIATED The International Cocoa Organization ( ICCO ) buffer stock of American Realty Trust said that the Bank injects liquidity into the auction at the Commerce Department and the strong yen has hit an air pocket of weakness are West Germany - had intervened aggressively since the Government of Alberta ' s Engine Parts Ltd , is flavor of the company ' s membership of commodity operations .
