In [2]:
import nltk
# movie review sentences
from nltk.corpus import sentence_polarity
import random

In [3]:
## repeat the setup of the movie review sentences for classification
# for each sentence(document), get its words and category (positive/negative)
documents = [(sent, cat) for cat in sentence_polarity.categories() 
    for sent in sentence_polarity.sents(categories=cat)]
random.shuffle(documents)

In [4]:
# get all words from all movie_reviews and put into a frequency distribution
#   note lowercase, but no stemming or stopwords
all_words_list = [word for (sent,cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))

21401


In [5]:
# get the 1500 most frequently appearing keywords in the corpus
word_items = all_words.most_common(1500)
word_features = [word for (word,count) in word_items]

In [6]:
# define features (keywords) of a document for a BOW/unigram baseline
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [7]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]

# training using naive Baysian classifier, training set is 90% of data
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
nltk.classify.accuracy(classifier, test_set)

0.754

In [10]:
print(featuresets[0])

({'contains(.)': True, 'contains(the)': True, 'contains(,)': True, 'contains(a)': False, 'contains(and)': True, 'contains(of)': True, 'contains(to)': False, 'contains(is)': False, 'contains(in)': False, 'contains(that)': False, 'contains(it)': False, 'contains(as)': False, 'contains(but)': False, 'contains(with)': False, 'contains(film)': False, 'contains(this)': False, 'contains(for)': False, 'contains(its)': False, 'contains(an)': True, 'contains(movie)': False, "contains(it's)": False, 'contains(be)': False, 'contains(on)': False, 'contains(you)': False, 'contains(not)': False, 'contains(by)': False, 'contains(about)': False, 'contains(more)': False, 'contains(one)': False, 'contains(like)': False, 'contains(has)': False, 'contains(are)': False, 'contains(at)': False, 'contains(from)': False, 'contains(than)': False, 'contains(")': False, 'contains(all)': False, 'contains(--)': False, 'contains(his)': False, 'contains(have)': False, 'contains(so)': False, 'contains(if)': False, 'con

In [11]:
####   adding Bigram features   ####
# set up for using bigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [12]:
# create the bigram finder on all the words in sequence
print(all_words_list[:50])
finder = BigramCollocationFinder.from_words(all_words_list)

['an', 'amalgam', 'of', 'the', 'fugitive', ',', 'blade', 'runner', ',', 'and', 'total', 'recall', ',', 'only', 'without', 'much', 'energy', 'or', 'tension', '.', 'if', 'the', 'idea', 'of', 'the', 'white', 'man', 'arriving', 'on', 'foreign', 'shores', 'to', 'show', 'wary', 'natives', 'the', 'true', 'light', 'is', 'abhorrent', 'to', 'you', ',', 'the', 'simplistic', 'heaven', 'will', 'quite', 'likely', 'be']


In [13]:
# define the top 500 bigrams using the chi squared measure
bigram_features = finder.nbest(bigram_measures.raw_freq, 500)
print(bigram_features[:50])

[('.', '.'), ('.', 'the'), ('.', 'a'), ('of', 'the'), (',', 'but'), (',', 'and'), ('in', 'the'), ('the', 'film'), ('is', 'a'), ('.', "it's"), (',', 'the'), ('of', 'a'), ('to', 'the'), ('and', 'the'), ('to', 'be'), ('the', 'movie'), ('.', 'this'), ('.', 'it'), ('for', 'the'), ('it', 'is'), ('.', 'an'), ('with', 'a'), ('as', 'a'), ('in', 'a'), ('on', 'the'), ('one', 'of'), ('and', 'a'), ('this', 'is'), ('a', 'movie'), ("it's", 'a'), (',', 'it'), ('.', 'if'), ('with', 'the'), ('film', 'is'), ('like', 'a'), (',', 'a'), ('for', 'a'), ('it', '.'), ('the', 'most'), ('.', 'but'), ('film', '.'), ('but', 'it'), ('of', 'its'), ('movie', '.'), (',', "it's"), (',', 'this'), ('a', 'film'), ('as', 'the'), ('.', 'i'), ('from', 'the')]


In [14]:
# examples to demonstrate the bigram feature function definition
sent = ['Arthur','carefully','rode','the','brown','horse','around','the','castle']
sentbigrams = list(nltk.bigrams(sent))
print(sentbigrams)

[('Arthur', 'carefully'), ('carefully', 'rode'), ('rode', 'the'), ('the', 'brown'), ('brown', 'horse'), ('horse', 'around'), ('around', 'the'), ('the', 'castle')]


In [13]:
# for a single bigram, test if it's in the sentence bigrams and format the feature name
bigram = ('brown','horse')
print(bigram in sentbigrams)
print('bigram({} {})'.format(bigram[0], bigram[1]))

True
bigram(brown horse)


In [17]:
# define features that include words as before 
# add the most frequent significant bigrams
# this function takes the list of words in a document as an argument and returns a feature dictionary
# it depends on the variables word_features and bigram_features
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['bigram({} {})'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [18]:
# use this function to create feature sets for all sentences
bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in documents]

# number of features for document 0
print(len(bigram_featuresets[0][0].keys()))

# features in document 0
print(bigram_featuresets[0][0])

2000
{'contains(.)': True, 'contains(the)': True, 'contains(,)': True, 'contains(a)': False, 'contains(and)': True, 'contains(of)': True, 'contains(to)': False, 'contains(is)': False, 'contains(in)': False, 'contains(that)': False, 'contains(it)': False, 'contains(as)': False, 'contains(but)': False, 'contains(with)': False, 'contains(film)': False, 'contains(this)': False, 'contains(for)': False, 'contains(its)': False, 'contains(an)': True, 'contains(movie)': False, "contains(it's)": False, 'contains(be)': False, 'contains(on)': False, 'contains(you)': False, 'contains(not)': False, 'contains(by)': False, 'contains(about)': False, 'contains(more)': False, 'contains(one)': False, 'contains(like)': False, 'contains(has)': False, 'contains(are)': False, 'contains(at)': False, 'contains(from)': False, 'contains(than)': False, 'contains(")': False, 'contains(all)': False, 'contains(--)': False, 'contains(his)': False, 'contains(have)': False, 'contains(so)': False, 'contains(if)': False, 

In [19]:
# train a classifier and report accuracy
train_set, test_set = bigram_featuresets[1000:], bigram_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.754

In [20]:
###  POS (part-of-speech) tag counts
# using the default pos tagger in NLTK (the Stanford tagger)
print(sent)
print(nltk.pos_tag(sent))

['Arthur', 'carefully', 'rode', 'the', 'brown', 'horse', 'around', 'the', 'castle']
[('Arthur', 'NNP'), ('carefully', 'RB'), ('rode', 'VBD'), ('the', 'DT'), ('brown', 'JJ'), ('horse', 'NN'), ('around', 'IN'), ('the', 'DT'), ('castle', 'NN')]


In [21]:
# this function takes a document list of words and returns a feature dictionary
# it runs the default pos tagger (the Stanford tagger) on the document
#   and counts 4 types of pos tags to use as features
def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [22]:
# define feature sets using this function
POS_featuresets = [(POS_features(d, word_features), c) for (d, c) in documents]
# number of features for document 0
print(len(POS_featuresets[0][0].keys()))

1504


In [23]:
# the first sentence
print(documents[0])
# the pos tag features for this sentence
print('num nouns', POS_featuresets[0][0]['nouns'])
print('num verbs', POS_featuresets[0][0]['verbs'])
print('num adjectives', POS_featuresets[0][0]['adjectives'])
print('num adverbs', POS_featuresets[0][0]['adverbs'])

(['an', 'amalgam', 'of', 'the', 'fugitive', ',', 'blade', 'runner', ',', 'and', 'total', 'recall', ',', 'only', 'without', 'much', 'energy', 'or', 'tension', '.'], 'neg')
num nouns 5
num verbs 0
num adjectives 4
num adverbs 1


In [24]:
# train and test the classifier
train_set, test_set = POS_featuresets[1000:], POS_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.748

In [15]:
## cross-validation ##
# this function takes the number of folds, the feature sets
# it iterates over the folds, using different sections for training and testing in turn
#   it prints the accuracy for each fold and the average accuracy at the end
def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [29]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 5
cross_validation_accuracy(num_folds, featuresets)

0 0.7401500938086304
1 0.7382739212007504
2 0.7335834896810507
3 0.7237335834896811
4 0.7387429643527205
mean accuracy 0.7348968105065666


## Exercise  
In lab, we ran the cross_validation function on the variable feature_sets, which gave
the average accuracy for the word feature sets. For your exercise, run the crossvalidation
function for the bigram feature sets and for the POS feature sets.



If you have time, run at least one of them for 10 folds instead of 5.


For each type of feature function definition, [ost into the discussion forum the original
accuracy that we got with just one train/test split and compare it with cross-validation
accuracy that you get for 5 folds. If you have time to run 10 fold cross-validation(s),
post that as well.

In [27]:
#run the cross-validation function for the bigram feature sets 
num_folds=5
print(cross_validation_accuracy(num_folds, bigram_featuresets))



0 0.7396810506566605
1 0.7387429643527205
2 0.7340525328330206
3 0.723264540337711
4 0.7387429643527205
mean accuracy 0.7348968105065665
None


In [28]:
#run the cross-validation function for for the POS feature sets.
print(cross_validation_accuracy(num_folds, POS_featuresets))


0 0.7340525328330206
1 0.7373358348968105
2 0.7387429643527205
3 0.725609756097561
4 0.7359287054409006
mean accuracy 0.7343339587242027
None


In [30]:
#run 10 folds CV on both
num_folds=10
#run the cross-validation function for the bigram feature sets 
print(cross_validation_accuracy(num_folds, bigram_featuresets))


0 0.7495309568480301
1 0.7317073170731707
2 0.7345215759849906
3 0.7467166979362101
4 0.7392120075046904
5 0.7382739212007504
6 0.7467166979362101
7 0.7045028142589118
8 0.7448405253283302
9 0.7354596622889306
mean accuracy 0.7371482176360226
None


In [None]:
#run the cross-validation function for for the POS feature sets.
print(cross_validation_accuracy(num_folds, POS_featuresets))

