In [2]:
% reset -f

In [10]:
from __future__ import print_function

import itertools

# download punkt model for sentence tokenizer
# using nltk.download()
from nltk.tokenize import sent_tokenize, word_tokenize

from my_nltk.model.counter import build_vocabulary
from entropy_functions import clean_text

# Read the whole English corpus into memory.
with open('../data/en.txt') as corpus_file:
    input_text = corpus_file.read()

# Split the text into sentences, then tokenize each sentence into words.
# Default experiment: every sentence goes through clean_text() first.
raw_sentences = sent_tokenize(input_text)
sents = [word_tokenize(clean_text(sentence)) for sentence in raw_sentences]
# To run the experiment without cleaning the data, use this line instead:
#sents = [word_tokenize(sent) for sent in sent_tokenize(input_text)]

# Build the vocabulary over the flattened stream of tokens (it could be used
# for filtering infrequent words).
# NOTE(review): the first argument (1) is presumably a minimum-count cutoff --
# confirm against my_nltk.model.counter.build_vocabulary.
vocabulary = build_vocabulary(1, itertools.chain.from_iterable(sents))

In [11]:
from my_nltk.model.counter import count_ngrams
from my_nltk.model.ngram import NgramModel

# Fit one model per n-gram order on the English corpus. Each model is kept in
# n_gram (keyed by its order) so it can be reused for testing afterwards.
n_gram = {}
for n in (1, 2, 3):
    model = NgramModel(count_ngrams(n, vocabulary, sents))
    n_gram[n] = model
    # Report the entropy and perplexity computed by the model itself.
    print("{} corpus using {}-gram. H: {:.3f} perplexity: {:.3f}".format(
        'en.txt', n, model.entropy(), model.perplexity()))

en.txt corpus using 1-gram. H: 10.560 perplexity: 1509.917
en.txt corpus using 2-gram. H: 6.064 perplexity: 66.881
en.txt corpus using 3-gram. H: 2.106 perplexity: 4.306


In [None]:
#results with method 2
#en.txt corpus using 1-gram. H: 10.560 perplexity: 1509.917
#en.txt corpus using 2-gram. H: 5.884 perplexity: 59.038
#en.txt corpus using 3-gram. H: 2.182 perplexity: 4.538
% reset -f

In [1]:
from __future__ import print_function

from entropy_functions import get_tagged_words_from_file, get_words_from_tagged_sents
from entropy_functions import get_tagged_sents, clean_tagged_sent
from my_nltk.model.counter import build_vocabulary

# Alternative input: build one single sentence out of the whole tagged file.
#tagged_words = get_tagged_words_from_file('../data/taggedBrown.txt')
#tagged_sents = [clean_tagged_sent(tagged_words)]

# Default experiment: sentences from the news category of the Brown corpus,
# each one passed through clean_tagged_sent().
tagged_sents = list(map(clean_tagged_sent, get_tagged_sents()))

# To run the experiment without cleaning the data, use this line instead:
#tagged_sents = [tagged_sent for tagged_sent in get_tagged_sents()]

# Derive the word list from whichever sentences were selected above, so it
# always matches tagged_sents.
tagged_words = get_words_from_tagged_sents(tagged_sents)

# Build the vocabulary (it could be used for filtering infrequent words).
# NOTE(review): the first argument (1) is presumably a minimum-count cutoff --
# confirm against my_nltk.model.counter.build_vocabulary.
vocabulary = build_vocabulary(1, tagged_words)

In [2]:
from my_nltk.model.counter import smooth_count_ngrams

# Quick look at what smoothed n-grams look like.

# n      -- order of the n-grams to build
# smooth -- number of components to be smoothed; it can take values in [0, n]
n, smooth = 3, 1

# The counter is built from an empty list of sentences: here it is only
# needed for its to_ngrams() method.
cnt = smooth_count_ngrams(n, vocabulary, smooth, [])

# Show the 3-grams of the first sentence. In the recorded output the first
# component of each trigram appears as a tag such as <DET> or <NOUN>.
cnt.to_ngrams(tagged_sents[0])

[('<s>', '<s>', 'the'),
 ('<s>', 'the', 'fulton'),
 ('<DET>', 'fulton', 'county'),
 ('<NOUN>', 'county', 'grand'),
 ('<NOUN>', 'grand', 'jury'),
 ('<ADJ>', 'jury', 'said'),
 ('<NOUN>', 'said', 'friday'),
 ('<VERB>', 'friday', 'an'),
 ('<NOUN>', 'an', 'investigation'),
 ('<DET>', 'investigation', 'of'),
 ('<NOUN>', 'of', 'atlantas'),
 ('<ADP>', 'atlantas', 'recent'),
 ('<NOUN>', 'recent', 'primary'),
 ('<ADJ>', 'primary', 'election'),
 ('<NOUN>', 'election', 'produced'),
 ('<NOUN>', 'produced', 'no'),
 ('<VERB>', 'no', 'evidence'),
 ('<DET>', 'evidence', 'that'),
 ('<NOUN>', 'that', 'any'),
 ('<ADP>', 'any', 'irregularities'),
 ('<DET>', 'irregularities', 'took'),
 ('<NOUN>', 'took', 'place'),
 ('<VERB>', 'place', '</s>'),
 ('<NOUN>', '</s>', '</s>')]

In [3]:
import nltk
from nltk.util import ngrams

from my_nltk.model.counter import smooth_count_ngrams
from my_nltk.model.ngram import NgramModel

import itertools

# Report line printed for every (corpus size, smoothing level) combination.
msg = "{corpus} corpus using {n}-gram with {smooth} smoothed components. H: {entropy:.6f} Perplexity: {perplexity:.6f}"

# Trigram models, keyed by (corpus_size, smooth), stored for testing.
n = 3
brown_trigram = {}
for corpus_size in (1, 2, 4):

    # Fraction of the corpus to keep: all of it, one half, one quarter.
    sents_ratio = 1. / corpus_size

    if len(tagged_sents) != 1:
        # Usual case: keep the leading share of the sentences.
        selected_tagged_sents = tagged_sents[:int(len(tagged_sents) * sents_ratio)]
    else:
        # The corpus is one single sentence: truncate it by word count so the
        # same proportion of words is kept.
        selected_tagged_sents = [tagged_sents[0][:int(len(tagged_words) * sents_ratio)]]

    # Share of the corpus words that ended up in the selection (counted
    # lazily, without building an intermediate list).
    selected_word_count = sum(1 for _ in itertools.chain.from_iterable(selected_tagged_sents))
    words_ratio = 1. * selected_word_count / len(tagged_words)
    print("#Sentences ratio: {:.3f} #Words ratio: {:.3f}".format(sents_ratio, words_ratio))

    for smooth in (0, 1, 2):
        # Fit the model once, store it, and reuse the same object for the report.
        model = NgramModel(smooth_count_ngrams(n, vocabulary, smooth, selected_tagged_sents))
        brown_trigram[(corpus_size, smooth)] = model

        print(msg.format(corpus='Brown', n=n, smooth=smooth,
                         entropy=model.entropy(),
                         perplexity=model.perplexity()))

    print()  # blank line between corpus sizes

#Sentences ratio: 1.000 #Words ratio: 1.000
Brown corpus using 3-gram with 0 smoothed components. H: 1.114707 Perplexity: 2.165510
Brown corpus using 3-gram with 1 smoothed components. H: 3.416396 Perplexity: 10.676717
Brown corpus using 3-gram with 2 smoothed components. H: 7.542192 Perplexity: 186.391521

#Sentences ratio: 0.500 #Words ratio: 0.497
Brown corpus using 3-gram with 0 smoothed components. H: 0.903587 Perplexity: 1.870712
Brown corpus using 3-gram with 1 smoothed components. H: 2.981320 Perplexity: 7.897085
Brown corpus using 3-gram with 2 smoothed components. H: 7.140396 Perplexity: 141.082619

#Sentences ratio: 0.250 #Words ratio: 0.258
Brown corpus using 3-gram with 0 smoothed components. H: 0.792687 Perplexity: 1.732297
Brown corpus using 3-gram with 1 smoothed components. H: 2.655754 Perplexity: 6.301754
Brown corpus using 3-gram with 2 smoothed components. H: 6.632691 Perplexity: 99.229083

