# Simple Sentiment Classification 


In [38]:
## Sentiment Classification on Sentences from the Movie Reviews
## import some packages
import nltk
from nltk.corpus import sentence_polarity
import random
import numpy as np
import liwc

In [39]:
# get the sentence corpus and look at some sentences, sentences are already tokenized
sentences = sentence_polarity.sents()

In [40]:
# look at the sentences by category to see how many positive and negative
pos_sents = sentence_polarity.sents(categories='pos')
print(len(pos_sents))
neg_sents = sentence_polarity.sents(categories='neg')
print(len(neg_sents))

5331
5331


In [41]:
## setup the movie reviews sentences for classification
# create a list of documents, each document is one sentence as a list of words paired with category
documents = [(sent, cat) for cat in sentence_polarity.categories() 
	for sent in sentence_polarity.sents(categories=cat)]

#each item of documents is a tuple (list of tokens, neg/pos)

In [42]:
# get all words from all movie_reviews and put into a frequency distribution
#  note lowercase, but no stemming or stopwords
all_words_list = [word for (sent,cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
# get the 2000 most frequently appearing keywords in the corpus
word_items = all_words.most_common(2000)
word_features = [word for (word,count) in word_items]
#word_features=[word for (word,count) in all_words]
print(word_features[0:50])
#print(word_items[0:50])

['.', 'the', ',', 'a', 'and', 'of', 'to', 'is', 'in', 'that', 'it', 'as', 'but', 'with', 'film', 'this', 'for', 'its', 'an', 'movie', "it's", 'be', 'on', 'you', 'not', 'by', 'about', 'more', 'one', 'like', 'has', 'are', 'at', 'from', 'than', '"', 'all', '--', 'his', 'have', 'so', 'if', 'or', 'story', 'i', 'too', 'just', 'who', 'into', 'what']


## Unigram Featureset (baseline)
Now we can define the features for each document, using just the words, sometimes
called the BOW or unigram features. The feature label will be ‘contains(keyword)’
for each keyword (aka word) in the word_features set, and the value of the feature
will be Boolean, according to whether the word is contained in that document.


In [43]:
# define features (keywords) of a document for a BOW/unigram baseline
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [44]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]
# the feature sets are 2000 words long so you may not want to look at one
#featuresets[0]


### Cross validation

In [12]:
## cross-validation
# this function takes the number of folds, the feature sets
# it iterates over the folds, using different sections for training and testing in turn
#   it prints the accuracy for each fold and the average accuracy at the end
def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [53]:
# training using naive Baysian classifier, training set is approximately 90% of data
random.shuffle(featuresets)
train_set, test_set = featuresets[1000:], featuresets[:1000]
cross_validation_accuracy(5,featuresets)

0 0.7514071294559099
1 0.75
2 0.7420262664165104
3 0.7434333958724203
4 0.7424953095684803
mean accuracy 0.7458724202626641


In [45]:
random.shuffle(featuresets)
train_set, test_set = featuresets[1000:], featuresets[:1000]
cross_validation_accuracy(5,featuresets)

0 0.75093808630394
1 0.7415572232645403
2 0.75
3 0.7471857410881801
4 0.7415572232645403
mean accuracy 0.7462476547842402


In [46]:
random.shuffle(featuresets)
train_set, test_set = featuresets[1000:], featuresets[:1000]
cross_validation_accuracy(5,featuresets)

0 0.7307692307692307
1 0.7406191369606003
2 0.7645403377110694
3 0.7528142589118199
4 0.7495309568480301
mean accuracy 0.7476547842401502


In [56]:
(0.7458724202626641+0.7462476547842402+0.7476547842401502)/3

0.7465916197623516

## Bigrams + unigram Features

In [8]:
#adding Bigram features
#set up for using bigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
# create the bigram finder on all the words in sequence
print(all_words_list[:50])
finder = BigramCollocationFinder.from_words(all_words_list)
# define the top 500 bigrams using the chi squared measure
bigram_features = finder.nbest(bigram_measures.raw_freq, 500)
print(bigram_features[:50])
# examples to demonstrate the bigram feature function definition
#sent = ['Arthur','carefully','rode','the','brown','horse','around','the','castle']
#sentbigrams = list(nltk.bigrams(sent))
#print(sentbigrams)
# for a single bigram, test if it's in the sentence bigrams and format the feature name
#bigram = ('brown','horse')
#print(bigram in sentbigrams)
#print('bigram({} {})'.format(bigram[0], bigram[1]))

['simplistic', ',', 'silly', 'and', 'tedious', '.', "it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.', 'exploitative', 'and', 'largely', 'devoid', 'of', 'the', 'depth', 'or', 'sophistication', 'that', 'would', 'make', 'watching', 'such', 'a', 'graphic', 'treatment', 'of', 'the', 'crimes', 'bearable', '.', '[garbus]', 'discards', 'the', 'potential', 'for', 'pathological', 'study']
[('.', '.'), ('.', 'the'), ('.', 'a'), ('of', 'the'), (',', 'but'), (',', 'and'), ('in', 'the'), ('the', 'film'), ('is', 'a'), ('.', "it's"), (',', 'the'), ('of', 'a'), ('to', 'the'), ('and', 'the'), ('to', 'be'), ('the', 'movie'), ('.', 'this'), ('.', 'it'), ('for', 'the'), ('it', 'is'), ('with', 'a'), ('.', 'an'), ('as', 'a'), ('in', 'a'), ('on', 'the'), ('one', 'of'), ('and', 'a'), ('this', 'is'), ('a', 'movie'), ("it's", 'a'), (',', 'it'), ('.', 'if'), ('with', 'the'), ('film', 'is'), ('like', 'a'), ('it', '.'), (',', 'a'), ('for', '

In [10]:
# define features that include words as before 
# add the most frequent significant bigrams
# this function takes the list of words in a document as an argument and returns a feature dictionary
# it depends on the variables word_features and bigram_features
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['bigram({} {})'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

# use this function to create feature sets for all sentences
bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in documents]
#There should be 2500 features: 2000 word features and 500 bigram features
len(bigram_featuresets[0][0].keys())

2500

In [32]:
len(bigram_featuresets)

10662

### Cross-Validation

In [13]:
# train a classifier and report accuracy
random.shuffle(bigram_featuresets)
train_set, test_set = bigram_featuresets[1000:], bigram_featuresets[:1000]
# perform the cross-validation on the featuresets with word features and generate accuracy
cross_validation_accuracy(5, bigram_featuresets)

0 0.7467166979362101
1 0.74906191369606
2 0.7401500938086304
3 0.74906191369606
4 0.7560975609756098
mean accuracy 0.7482176360225141


In [14]:
random.shuffle(bigram_featuresets)
train_set, test_set = bigram_featuresets[1000:], bigram_featuresets[:1000]
cross_validation_accuracy(5, bigram_featuresets)

0 0.7462476547842402
1 0.7406191369606003
2 0.7363977485928705
3 0.7410881801125704
4 0.7485928705440901
mean accuracy 0.7425891181988743


In [15]:
random.shuffle(bigram_featuresets)
train_set, test_set = bigram_featuresets[1000:], bigram_featuresets[:1000]
cross_validation_accuracy(5, bigram_featuresets)

0 0.7293621013133208
1 0.7457786116322702
2 0.7626641651031895
3 0.7542213883677298
4 0.74812382739212
mean accuracy 0.748030018761726


In [50]:
(0.7482176360225141+0.7425891181988743+0.748030018761726)/3

0.7462789243277048

##  POS features+ Bigram features + unigram features

In [16]:
###  POS (part-of-speech) tag counts
# this function takes a document list of words and returns a feature dictionary
# it runs the default pos tagger (the Stanford tagger) on the document
#   and counts 4 types of pos tags to use as features
def POS_features(document, word_features, bigram_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['bigram({} {})'.format(bigram[0], bigram[1])] = (bigram in document_bigrams) 
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

# define feature sets using this function
POS_featuresets = [(POS_features(d, word_features,bigram_features), c) for (d, c) in documents]

# number of features for document 0
len(POS_featuresets[0][0].keys())

2504

In [21]:
print(documents[0])

(['simplistic', ',', 'silly', 'and', 'tedious', '.'], 'neg')


In [23]:
print('num nouns', POS_featuresets[0][0]['nouns'])
print('num verbs', POS_featuresets[0][0]['verbs'])
print('num adjectives', POS_featuresets[0][0]['adjectives'])
print('num adverbs', POS_featuresets[0][0]['adverbs'])

num nouns 0
num verbs 0
num adjectives 2
num adverbs 1


In [34]:
len(POS_featuresets)

10662

### Cross-Validation

In [24]:
# perform the cross-validation on the featuresets with word features and generate accuracy
random.shuffle(POS_featuresets)
train_set, test_set = POS_featuresets[1000:], POS_featuresets[:1000]
cross_validation_accuracy(5, POS_featuresets)

0 0.7476547842401501
1 0.7420262664165104
2 0.7424953095684803
3 0.7471857410881801
4 0.724202626641651
mean accuracy 0.7407129455909944


In [105]:
random.shuffle(POS_featuresets)
train_set, test_set = POS_featuresets[1000:], POS_featuresets[:1000]
cross_validation_accuracy(5, POS_featuresets)

0 0.7467166979362101
1 0.7476547842401501
2 0.7303001876172608
3 0.7424953095684803
4 0.74812382739212
mean accuracy 0.7430581613508442


In [106]:
random.shuffle(POS_featuresets)
train_set, test_set = POS_featuresets[1000:], POS_featuresets[:1000]
cross_validation_accuracy(5, POS_featuresets)

0 0.7359287054409006
1 0.7514071294559099
2 0.7565666041275797
3 0.7387429643527205
4 0.7326454033771107
mean accuracy 0.7430581613508443


In [51]:
(0.7407129455909944+0.7430581613508442+0.7430581613508443)/3

0.7422764227642277

## liwc 


In [25]:
def read_words():
  poslist = []
  neglist = []
  flexicon = open('SentimentLexicons/liwcdic2007.dic', encoding='latin1')
  # read all LIWC words from file
  wordlines = [line.strip() for line in flexicon]
  # each line has a word or a stem followed by * and numbers of the word classes it is in
  # word class 126 is positive emotion and 127 is negative emotion
  for line in wordlines:
    if not line == '':
      items = line.split()
      word = items[0]
      classes = items[1:]
      for c in classes:
        if c == '126':
          poslist.append( word )
        if c == '127':
          neglist.append( word )
  return (poslist, neglist)

poslist, neglist = read_words()

def liwc_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    # count variables for the 2 classes of subjectivity
    Pos = 0
    Neg = 0
    for word in document_words:
        if word in poslist:
                Pos += 1
        if word in neglist:
                Neg += 1
        features['poscount'] = Pos
        features['negcount'] = Neg    
    return features

liwc_featureset= [(liwc_features(d,word_features), c) for (d, c) in documents]
print(len(liwc_featureset))
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
print(liwc_featureset[1][0])

10662
{'contains(.)': True, 'contains(the)': False, 'contains(,)': True, 'contains(a)': False, 'contains(and)': True, 'contains(of)': False, 'contains(to)': False, 'contains(is)': False, 'contains(in)': False, 'contains(that)': False, 'contains(it)': True, 'contains(as)': False, 'contains(but)': False, 'contains(with)': False, 'contains(film)': False, 'contains(this)': False, 'contains(for)': False, 'contains(its)': False, 'contains(an)': False, 'contains(movie)': False, "contains(it's)": True, 'contains(be)': False, 'contains(on)': False, 'contains(you)': False, 'contains(not)': False, 'contains(by)': False, 'contains(about)': False, 'contains(more)': False, 'contains(one)': False, 'contains(like)': False, 'contains(has)': False, 'contains(are)': False, 'contains(at)': False, 'contains(from)': False, 'contains(than)': False, 'contains(")': False, 'contains(all)': False, 'contains(--)': False, 'contains(his)': False, 'contains(have)': False, 'contains(so)': True, 'contains(if)': False,

### Cross-validation

In [113]:
random.shuffle(liwc_featureset)
train_set, test_set = liwc_featureset[1000:], liwc_featureset[:1000]
cross_validation_accuracy(5, liwc_featureset)

0 0.75
1 0.7560975609756098
2 0.7467166979362101
3 0.74812382739212
4 0.7340525328330206
mean accuracy 0.7469981238273921


In [47]:
random.shuffle(liwc_featureset)
train_set, test_set = liwc_featureset[1000:], liwc_featureset[:1000]
cross_validation_accuracy(5, liwc_featureset)

0 0.7570356472795498
1 0.7448405253283302
2 0.7439024390243902
3 0.7392120075046904
4 0.7382739212007504
mean accuracy 0.7446529080675421


In [48]:
random.shuffle(liwc_featureset)
train_set, test_set = liwc_featureset[1000:], liwc_featureset[:1000]
cross_validation_accuracy(5, liwc_featureset)

0 0.7387429643527205
1 0.7401500938086304
2 0.7593808630393997
3 0.7453095684803002
4 0.7467166979362101
mean accuracy 0.7460600375234522


In [52]:
(0.7469981238273921+0.7446529080675421+0.7460600375234522)/3

0.7459036898061289

## Sentiment Lexicon

In [27]:
####   adding features from a sentiment lexicon   ####
# First look at the program in the file Subjectivity.py to load the subjectivity lexicon
# copy and paste the definition of the readSubjectivity function

# This function returns a dictionary where you can look up words and get back 
#     the four items of subjectivity information described above
def readSubjectivity(path):
    flexicon = open(path, 'r')
    # initialize an empty dictionary
    sldict = { }
    for line in flexicon:
        fields = line.split()   # default is to split on whitespace
        # split each field on the '=' and keep the second part as the value
        strength = fields[0].split("=")[1]
        word = fields[2].split("=")[1]
        posTag = fields[3].split("=")[1]
        stemmed = fields[4].split("=")[1]
        polarity = fields[5].split("=")[1]
        if (stemmed == 'y'):
            isStemmed = True
        else:
            isStemmed = False
        # put a dictionary entry with the word as the keyword
        #     and a list of the other values
        sldict[word] = [strength, posTag, isStemmed, polarity]
    return sldict

# create a path to where the subjectivity file resides on your disk

SLpath = "subjclueslen1-HLTEMNLP05.tff"
SL = readSubjectivity(SLpath)

# how many words are in the dictionary:6885
#len(SL.keys())

SL.keys()
SL['abandoned']

['weaksubj', 'adj', False, 'negative']

In [28]:
# define features that include word counts of subjectivity words
# negative feature will have number of weakly negative words +
#    2 * number of strongly negative words
# positive feature has similar definition
#    not counting neutral words
def SL_features(document, word_features, SL):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    # count variables for the 4 classes of subjectivity
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    for word in document_words:
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
            features['positivecount'] = weakPos + (2 * strongPos)
            features['negativecount'] = weakNeg + (2 * strongNeg)      
    return features

In [29]:
SL_featuresets = [(SL_features(d, word_features, SL), c) for (d, c) in documents]

# show just the two sentiment lexicon features in document 0
print(SL_featuresets[0][0]['positivecount'])
print(SL_featuresets[0][0]['negativecount'])

0
6


### Cross-Validation

In [36]:
random.shuffle(SL_featuresets)
train_set, test_set = SL_featuresets[1000:], SL_featuresets[:1000]
cross_validation_accuracy(5,SL_featuresets)

0 0.7514071294559099
1 0.7640712945590994
2 0.7640712945590994
3 0.7696998123827392
4 0.7542213883677298
mean accuracy 0.7606941838649155


In [54]:
random.shuffle(SL_featuresets)
train_set, test_set = SL_featuresets[1000:], SL_featuresets[:1000]
cross_validation_accuracy(5,SL_featuresets)

0 0.7560975609756098
1 0.7589118198874296
2 0.7556285178236398
3 0.7673545966228893
4 0.7673545966228893
mean accuracy 0.7610694183864916


In [55]:
random.shuffle(SL_featuresets)
train_set, test_set = SL_featuresets[1000:], SL_featuresets[:1000]
cross_validation_accuracy(5,SL_featuresets)

0 0.7617260787992496
1 0.7584427767354597
2 0.7528142589118199
3 0.7668855534709194
4 0.7626641651031895
mean accuracy 0.7605065666041276


In [57]:
(0.7606941838649155+0.7610694183864916+0.7605065666041276)/3

0.7607567229518448

baseline=0.74659161976

bigram=0.74627892432

pos=0.74227642276

liwc=0.7459036898061289

sl=0.7607567229518448

