In [1]:
# Lab Week 7 - Classification tasks with different types of features
# This file has small examples that are meant to be run individually
#   in the Python shell

import nltk

In [2]:
# define a feature extraction function for each name
def gender_features(word):
    """Build the feature dictionary for a name: its final character only.

    Returns a dict with the single key 'last_letter'.
    """
    final_char = word[-1]
    return {'last_letter': final_char}

In [3]:
print(gender_features('Shrek'))

{'last_letter': 'k'}


In [4]:
# resource for male and female first names
from nltk.corpus import names
print(names.words('male.txt')[:20])
print(names.words('female.txt')[:20])

['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim', 'Abdullah', 'Abe', 'Abel', 'Abelard', 'Abner', 'Abraham', 'Abram', 'Ace', 'Adair', 'Adam']
['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale', 'Abra', 'Acacia', 'Ada', 'Adah', 'Adaline', 'Adara', 'Addie', 'Addis', 'Adel', 'Adela']


In [5]:
# make list of male and female names paired with gender
namesgender = ([(name, 'male') for name in names.words('male.txt')] +
          [(name, 'female') for name in names.words('female.txt')])
print(len(namesgender))
# show the first and the last 20 entries; a negative slice is used for the tail
#   so the cell does not depend on the exact number of names in the corpus
print(namesgender[:20])
print(namesgender[-20:])

7944
[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male'), ('Abdullah', 'male'), ('Abe', 'male'), ('Abel', 'male'), ('Abelard', 'male'), ('Abner', 'male'), ('Abraham', 'male'), ('Abram', 'male'), ('Ace', 'male'), ('Adair', 'male'), ('Adam', 'male')]
[('Zena', 'female'), ('Zenia', 'female'), ('Zia', 'female'), ('Zilvia', 'female'), ('Zita', 'female'), ('Zitella', 'female'), ('Zoe', 'female'), ('Zola', 'female'), ('Zonda', 'female'), ('Zondra', 'female'), ('Zonnya', 'female'), ('Zora', 'female'), ('Zorah', 'female'), ('Zorana', 'female'), ('Zorina', 'female'), ('Zorine', 'female'), ('Zsa Zsa', 'female'), ('Zsazsa', 'female'), ('Zulema', 'female'), ('Zuzana', 'female')]


In [6]:
# put the list into random order
import random
random.shuffle(namesgender)
print(namesgender[:20])

[('Aleda', 'female'), ('Sybyl', 'female'), ('Osborne', 'male'), ('Stephie', 'female'), ('Mehetabel', 'female'), ('Erda', 'female'), ('Sean', 'female'), ('Nathanil', 'male'), ('Analiese', 'female'), ('Stig', 'male'), ('Aurore', 'female'), ('Fancie', 'female'), ('Leonidas', 'male'), ('Jillayne', 'female'), ('Delphinia', 'female'), ('Ashish', 'male'), ('Cathrin', 'female'), ('Merna', 'female'), ('Kat', 'female'), ('Yule', 'male')]


In [7]:
# separate the names into training and test
train_names = namesgender[500:]
test_names = namesgender[:500]

In [8]:
# use our features to train a classifier and test on the development test set
train_set = [(gender_features(n), g) for (n, g) in train_names]
test_set = [(gender_features(n), g) for (n, g) in test_names]
print(train_set[:20])

[({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'i'}, 'female'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'n'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'n'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 's'}, 'male'), ({'last_letter': 'l'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'l'}, 'female'), ({'last_letter': 'y'}, 'female')]


In [9]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [10]:
# classify new instances
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))

male
female


In [11]:
# classify accuracy function runs the classifier on the test set and reports
#   comparisons between predicted labels and actual/gold labels
print(nltk.classify.accuracy(classifier, test_set))

0.732


In [12]:
# this function is available for naive Bayes classifiers
# it prints the table itself and returns None, so it is called directly
#   (wrapping it in print() would add a stray "None" line to the output)
classifier.show_most_informative_features(20)

Most Informative Features
             last_letter = 'a'            female : male   =     34.4 : 1.0
             last_letter = 'k'              male : female =     31.9 : 1.0
             last_letter = 'f'              male : female =     16.7 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
             last_letter = 'd'              male : female =      9.4 : 1.0
             last_letter = 'o'              male : female =      9.0 : 1.0
             last_letter = 'm'              male : female =      8.8 : 1.0
             last_letter = 'r'              male : female =      7.0 : 1.0
             last_letter = 'g'              male : female =      5.1 : 1.0
             last_letter = 'w'              male : female =      5.1 : 1.0
             last_letter = 'u'              male : female =      4.7 : 1.0
             last_letter = 's'              male : female =      4.3 : 1.0

In [13]:
# define a function that will compare the classifier labels with the gold standard labels
def geterrors(test):
    """Collect the items of `test` that the global `classifier` labels wrongly.

    `test` is an iterable of (name, gold_label) pairs. Each name is converted
    with gender_features and classified; whenever the predicted label differs
    from the gold label a (gold_label, predicted_label, name) tuple is kept.
    Returns the list of those tuples, in input order.
    """
    predictions = (
        (gold, classifier.classify(gender_features(name)), name)
        for (name, gold) in test
    )
    # entry[0] is the gold label, entry[1] the classifier's guess
    return [entry for entry in predictions if entry[1] != entry[0]]

In [14]:
errors = geterrors(test_names)
print(len(errors))

134


In [15]:
# define a function to print the errors
def printerrors(errors):
    """Print one aligned line per (gold_label, guess, name) tuple, in sorted order."""
    row_template = 'correct={:<8s} guess={:<8s} name={:<30s}'
    ordered = sorted(errors)
    for gold, predicted, name in ordered:
        print(row_template.format(gold, predicted, name))

In [16]:
printerrors(errors)

correct=female   guess=male     name=Adel                          
correct=female   guess=male     name=Alleen                        
correct=female   guess=male     name=Alys                          
correct=female   guess=male     name=Annabal                       
correct=female   guess=male     name=Anne-Mar                      
correct=female   guess=male     name=Ariel                         
correct=female   guess=male     name=Aurel                         
correct=female   guess=male     name=Beau                          
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Brenn                         
correct=female   guess=male     name=Bryn                          
correct=female   guess=male     name=Brynn                         
correct=female   guess=male     name=Cathleen                      
correct=female   guess=male     name=Cathrin                       
correct=female   guess=male     name=Christal   

In [17]:
## classify part of speech based on sentence context
from nltk.corpus import brown

In [18]:
# define features for the "i"th word in the sentence, including three types of suffix 
#     and one pre-word
# the pos features function takes the sentence of untagged words and the index of a word i
#   it creates features for word i, including the previous word i-1
def pos_features(sentence, i):
    """Build the feature dictionary for the word at index `i` of `sentence`.

    `sentence` is a sequence of untagged words. The features are the last
    one, two and three characters of the word, plus the preceding word
    (or the marker "<START>" when the word is the first in the sentence).
    """
    token = sentence[i]
    previous = "<START>" if i == 0 else sentence[i - 1]
    return {
        "suffix(1)": token[-1:],
        "suffix(2)": token[-2:],
        "suffix(3)": token[-3:],
        "prev-word": previous,
    }

In [19]:
# look at features of a specific word in a specific sentence
# first sentence of brown corpus
sentence0 = brown.sents()[0]
print(sentence0)
# word 8 of sentence 0
print(sentence0[8])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
investigation


In [20]:
# pos features of the word 8 
print(pos_features(sentence0, 8))

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}


In [21]:
# get the POS tagged sentences with categories of news
tagged_sents = brown.tagged_sents(categories='news')
tag_sent0 = tagged_sents[0]
tag_sent0

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

In [22]:
# the function nltk.tag.untag will take the tags off
nltk.tag.untag(tag_sent0)

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [23]:
# the python enumerate function generates an index number for each item in a list
for i,(word,tag) in enumerate(tag_sent0):
    print (i, word, tag)

0 The AT
1 Fulton NP-TL
2 County NN-TL
3 Grand JJ-TL
4 Jury NN-TL
5 said VBD
6 Friday NR
7 an AT
8 investigation NN
9 of IN
10 Atlanta's NP$
11 recent JJ
12 primary NN
13 election NN
14 produced VBD
15 `` ``
16 no AT
17 evidence NN
18 '' ''
19 that CS
20 any DTI
21 irregularities NNS
22 took VBD
23 place NN
24 . .


In [24]:
# build one (features, tag) pair for every word of every tagged sentence:
#   the features are computed from the untagged version of the sentence,
#   the tag is taken from the tagged version
# enumerate supplies the word index that the pos features function needs
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    featuresets.extend(
        (pos_features(untagged_sent, idx), gold_tag)
        for idx, (_, gold_tag) in enumerate(tagged_sent)
    )

In [25]:
# look at the feature sets of the first 10 words
for f in featuresets[:10]:
	print (f)

({'suffix(1)': 'e', 'suffix(2)': 'he', 'suffix(3)': 'The', 'prev-word': '<START>'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}, 'NP-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ty', 'suffix(3)': 'nty', 'prev-word': 'Fulton'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'County'}, 'JJ-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ry', 'suffix(3)': 'ury', 'prev-word': 'Grand'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'id', 'suffix(3)': 'aid', 'prev-word': 'Jury'}, 'VBD')
({'suffix(1)': 'y', 'suffix(2)': 'ay', 'suffix(3)': 'day', 'prev-word': 'said'}, 'NR')
({'suffix(1)': 'n', 'suffix(2)': 'an', 'suffix(3)': 'an', 'prev-word': 'Friday'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}, 'NN')
({'suffix(1)': 'f', 'suffix(2)': 'of', 'suffix(3)': 'of', 'prev-word': 'investigation'}, 'IN')


In [26]:
	
# using naive Bayesian as classifier
# split data into a training set and a test set, using a 90%/10% split
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
print(len(train_set))
print(len(test_set))

90499
10055


In [27]:
# train classifier on the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [28]:
# evaluate the accuracy (this will take a little while)
print(nltk.classify.accuracy(classifier, test_set))
# the result is reasonable for features without the previous tag

0.7891596220785678


In [29]:
### classify documents based on keywords
from nltk.corpus import movie_reviews
import random

In [30]:
# movie reviews are labeled either positive or negative (by human annotators)
print(movie_reviews.categories())

['neg', 'pos']


In [31]:
# for each document in movie_reviews, get its words and category (positive/negative)
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
print(len(documents))

2000


In [32]:
random.shuffle(documents)
# look at the first document - consists of a list of all the words in the review
# followed by the category
print(documents[0])

(['as', 'you', 'should', 'know', ',', 'this', 'summer', 'has', 'been', 'less', 'than', 'memorable', '.', 'with', 'a', 'total', 'of', '4', 'decent', 'films', ',', 'it', "'", 's', 'not', 'a', 'surprise', 'that', 'these', 'big', 'budget', 'failures', 'keep', 'appearing', '.', 'with', 'that', 'said', ',', 'you', 'can', 'pretty', 'much', 'predict', 'what', 'my', 'opinion', 'on', '"', 'the', '13th', 'warrior', '"', 'will', 'be', '.', 'the', 'film', 'is', 'based', 'on', 'the', 'michael', 'crichton', '"', 'eaters', 'of', 'the', 'dead', '"', ',', 'in', 'which', 'ahmed', 'ibn', 'fahdlan', 'is', 'banished', 'from', 'his', 'country', 'for', 'looking', 'at', 'a', 'wife', 'of', 'a', 'king', '.', 'after', 'tarveling', 'for', 'many', 'months', ',', 'he', 'comes', 'across', 'a', 'gang', 'of', 'norsemen', ',', 'who', 'are', 'forced', 'to', 'pick', '13', 'men', 'to', 'protect', 'a', 'town', 'from', 'mythical', 'monsters', 'who', 'travel', 'in', 'the', 'fog', '.', 'so', ',', 'they', 'start', 'picking', 'm

In [33]:
## use words from all documents to define the word vector for features
# get all words from all movie_reviews and put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
print(len(all_words))

39768


In [34]:
# get the 2000 most frequently appearing keywords in the corpus
word_items = all_words.most_common(2000)
word_features = [word for (word, freq) in word_items]   # just the words

In [35]:
# look at the first 100 words
print(word_features[:100])

[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life']


In [36]:
# define features (keywords) of a document
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    """Map each keyword to whether it occurs in the document.

    `document` is an iterable of words and `word_features` an iterable of
    keywords. The result has one boolean entry per keyword, stored under
    the key 'V_<keyword>'.
    """
    present = set(document)   # set membership is checked once per keyword
    return {'V_%s' % keyword: (keyword in present) for keyword in word_features}

In [37]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]

In [38]:
# the feature sets are 2000 words long - so this is optional
print(featuresets[0])

({'V_,': True, 'V_the': True, 'V_.': True, 'V_a': True, 'V_and': True, 'V_of': True, 'V_to': True, "V_'": True, 'V_is': True, 'V_in': True, 'V_s': True, 'V_"': True, 'V_it': True, 'V_that': True, 'V_-': False, 'V_)': False, 'V_(': False, 'V_as': True, 'V_with': True, 'V_for': True, 'V_his': True, 'V_this': True, 'V_film': True, 'V_i': True, 'V_he': True, 'V_but': True, 'V_on': True, 'V_are': True, 'V_t': True, 'V_by': False, 'V_be': True, 'V_one': False, 'V_movie': False, 'V_an': True, 'V_who': True, 'V_not': True, 'V_you': True, 'V_from': True, 'V_at': True, 'V_was': False, 'V_have': True, 'V_they': True, 'V_has': True, 'V_her': False, 'V_all': True, 'V_?': True, 'V_there': True, 'V_like': True, 'V_so': True, 'V_out': True, 'V_about': True, 'V_up': False, 'V_more': False, 'V_what': True, 'V_when': True, 'V_which': True, 'V_or': True, 'V_she': False, 'V_their': False, 'V_:': False, 'V_some': True, 'V_just': False, 'V_can': True, 'V_if': True, 'V_we': False, 'V_him': False, 'V_into': Tr

In [39]:
# training using naive Bayesian classifier with a 95/5 split
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [40]:
# evaluate the accuracy of the classifier
print (nltk.classify.accuracy(classifier, test_set))
# the accuracy result may vary since we randomized the documents

0.84


In [41]:
# show which features of classifier are most informative
# the method prints the table itself and returns None, so it is not wrapped in print()
classifier.show_most_informative_features(30)


Most Informative Features
                V_seagal = True              neg : pos    =     13.9 : 1.0
           V_outstanding = True              pos : neg    =     11.2 : 1.0
           V_wonderfully = True              pos : neg    =      8.6 : 1.0
                 V_awful = True              neg : pos    =      5.8 : 1.0
                 V_damon = True              pos : neg    =      5.8 : 1.0
                  V_lame = True              neg : pos    =      5.7 : 1.0
                V_wasted = True              neg : pos    =      5.6 : 1.0
                 V_flynt = True              pos : neg    =      5.6 : 1.0
            V_ridiculous = True              neg : pos    =      5.4 : 1.0
                 V_waste = True              neg : pos    =      5.2 : 1.0
                  V_jedi = True              pos : neg    =      5.2 : 1.0
                V_poorly = True              neg : pos    =      5.0 : 1.0
                V_superb = True              pos : neg    =      4.6 : 1.0

# HW7 From here #

## 1) Gender ## 

### a) 2 features for gender

In [42]:
def gender_features2(word):
    """Return the last and second-to-last letters of a name as features.

    Slices are used instead of plain indexing so that a one-letter name
    yields an empty string for the missing position rather than raising
    IndexError. For names of two or more letters the result is the same
    as indexing with word[-1] and word[-2].
    """
    return {'last_letter': word[-1:], 'secondlast_letter': word[-2:-1]}


In [43]:
print(gender_features2('Shrek'))

{'last_letter': 'k', 'secondlast_letter': 'e'}


In [44]:
from nltk.corpus import names
namesgender = ([(name, 'male') for name in names.words('male.txt')] +
          [(name, 'female') for name in names.words('female.txt')])
# put the list into random order
import random
random.shuffle(namesgender)

In [45]:
# separate the names into training and test
train_names = namesgender[500:]
test_names = namesgender[:500]

In [46]:
# use 2 features to train a classifier and test on the development test set
train_set2 = [(gender_features2(n), g) for (n, g) in train_names]
test_set2 = [(gender_features2(n), g) for (n, g) in test_names]

In [47]:
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

In [48]:
# classify new instances
# classifier2 was trained on gender_features2 output, so new names must be
#   converted with the same extractor (not the one-feature gender_features)
print(classifier2.classify(gender_features2('Neo')))
print(classifier2.classify(gender_features2('Trinity')))

male
female


In [49]:
# classify accuracy function runs the classifier on the test set and reports
#   comparisons between predicted labels and actual/gold labels
print("Accuracy for 2 features classifier = ", nltk.classify.accuracy(classifier2, test_set2))

Accuracy for 2 features classifier =  0.802


### b) 3 Features for gender


In [50]:
# for 3 features
# The following 2 lines raise an IndexError because some names are only 2 letters long, so a third-from-last letter cannot be taken from them.

#train_set3 = [(gender_features3(n), g) for (n, g) in train_names]
#test_set3 = [(gender_features3(n), g) for (n, g) in test_names]

In [51]:
def gender_features3(word):
    """Return the last three letters of a name as three separate features.

    Slices are used instead of plain indexing so that names shorter than
    three letters (such as the two-letter names in the corpus, e.g. 'Jo')
    yield an empty string for each missing position rather than raising
    IndexError. For names of three or more letters the result is the same
    as indexing with word[-1], word[-2] and word[-3].
    """
    return {'last_letter': word[-1:],
            'secondlast_letter': word[-2:-1],
            'thirdlast_letter': word[-3:-2]}

print(gender_features3('Shrek'))

{'last_letter': 'k', 'secondlast_letter': 'e', 'thirdlast_letter': 'r'}


In [52]:
# length of every name, in ascending order
lengths = sorted(len(pair[0]) for pair in namesgender)

# count how many names there are of each length
f = {}
for i in lengths:
    f[i] = f.get(i, 0) + 1

print(f)

{2: 19, 3: 272, 4: 926, 5: 1878, 6: 2049, 7: 1447, 8: 846, 9: 351, 10: 116, 11: 24, 12: 10, 13: 3, 14: 1, 15: 2}


In [53]:
#List of names with length 2
names_len2 = [a for a in namesgender if len(a[0])==2]
names_len2

[('Hy', 'male'),
 ('Jo', 'female'),
 ('Jo', 'male'),
 ('Em', 'female'),
 ('Bo', 'male'),
 ('Di', 'female'),
 ('Cy', 'male'),
 ('Er', 'male'),
 ('Ev', 'male'),
 ('Ez', 'male'),
 ('Si', 'male'),
 ('Bo', 'female'),
 ('La', 'female'),
 ('Ki', 'female'),
 ('Ag', 'female'),
 ('Ed', 'male'),
 ('Al', 'male'),
 ('Vi', 'female'),
 ('Ty', 'male')]

In [54]:
# Dropping the two-letter names lets the three-letter feature extraction run on every remaining name

# keep only the names longer than two letters
namesgender3 = [pair for pair in namesgender if len(pair[0]) > 2]
# length of every remaining name, in ascending order
lengths3 = sorted(len(pair[0]) for pair in namesgender3)

# count how many names there are of each length
f3 = {}
for i3 in lengths3:
    f3[i3] = f3.get(i3, 0) + 1

print(f3)

# build the training and test sets from the filtered names:
#   the first 500 names are held out for testing, the rest are used for training
train_names3 = namesgender3[500:]
test_names3 = namesgender3[:500]
train_set3 = [(gender_features3(name), gender) for (name, gender) in train_names3]
test_set3 = [(gender_features3(name), gender) for (name, gender) in test_names3]


{3: 272, 4: 926, 5: 1878, 6: 2049, 7: 1447, 8: 846, 9: 351, 10: 116, 11: 24, 12: 10, 13: 3, 14: 1, 15: 2}


In [55]:
# train on names with at least 3 letters
classifier3 = nltk.NaiveBayesClassifier.train(train_set3)

In [56]:
# classify new instances with 3 feature classifier
# classifier3 was trained on gender_features3 output, so new names must be
#   converted with the same extractor (not the one-feature gender_features)
print(classifier3.classify(gender_features3('Neo')))
print(classifier3.classify(gender_features3('Trinity')))

male
female


In [57]:
print("Accuracy for 3 features classifier = ", nltk.classify.accuracy(classifier3, test_set3))

Accuracy for 3 features classifier =  0.78


## 2) Movies ## 

In [77]:
### classify documents based on keywords
from nltk.corpus import movie_reviews
import random

In [78]:
# movie reviews are labeled either positive or negative (by human annotators)
print(movie_reviews.categories())

['neg', 'pos']


In [79]:
# for each document in movie_reviews, get its words and category (positive/negative)
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
print(len(documents))

2000


In [80]:
random.shuffle(documents)


In [81]:
## use words from all documents to define the word vector for features
# get all words from all movie_reviews and put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
print(len(all_words))

39768


In [82]:
# build two vocabularies from the most frequent words in the corpus:
#   one smaller (1350 words) and one larger (5050 words) than the
#   2000-word vocabulary used earlier
word_items_less2k = all_words.most_common(1350)
word_features_less2k = [item[0] for item in word_items_less2k]   # just the words

word_items_more2k = all_words.most_common(5050)
word_features_more2k = [item[0] for item in word_items_more2k]   # just the words

In [83]:
# look at the first 100 words
print("less than 2K : ",word_features_less2k[:100],"\n")
print("More than 2K : ",word_features_more2k[:100])

less than 2K :  [',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life'] 

More than 2K :  [',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'w

In [84]:
# define features (keywords) of a document
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    """Map each keyword to whether it occurs in the document.

    `document` is an iterable of words and `word_features` an iterable of
    keywords. The result has one boolean entry per keyword, stored under
    the key 'V_<keyword>'.

    NOTE: this re-defines the function of the same name from the earlier
    keyword-classification section of the notebook.
    """
    seen = set(document)   # set membership is checked once per keyword
    return {'V_%s' % keyword: (keyword in seen) for keyword in word_features}

In [85]:
# get features sets for a document, including keyword features and category feature
featuresets_less2k = [(document_features(d, word_features_less2k), c) for (d,c) in documents]


In [89]:
# training using naive Bayesian classifier with a 95/5 split
train_set_less2k, test_set_less2k = featuresets_less2k[100:], featuresets_less2k[:100]
classifier_less2k = nltk.NaiveBayesClassifier.train(train_set_less2k)

# evaluate the accuracy of the classifier
print (nltk.classify.accuracy(classifier_less2k, test_set_less2k))
# the accuracy result may vary since we randomized the documents

0.73


In [87]:
# get features sets for a document, including keyword features and category feature
featuresets_more2k = [(document_features(d,word_features_more2k), c) for (d,c) in documents]


In [90]:
# training using naive Bayesian classifier with a 95/5 split
train_set_more2k, test_set_more2k = featuresets_more2k[100:], featuresets_more2k[:100]
classifier_more2k = nltk.NaiveBayesClassifier.train(train_set_more2k)

# evaluate the accuracy of the classifier
print (nltk.classify.accuracy(classifier_more2k, test_set_more2k))
# the accuracy result may vary since we randomized the documents

0.79
