In [6]:
# Six short example documents, kept under individual names (doc_a .. doc_f).
doc_a, doc_b = "great for starters", "great phone"   # labelled 5
doc_c, doc_d = "fine quality", "fine product"        # labelled 3
doc_e, doc_f = "poor quality", "poor phone"          # labelled 1

# Training corpus and its labels: entry i of train_target belongs to
# entry i of train_doc.
train_doc = [
    doc_a, doc_b,
    doc_c, doc_d,
    doc_e, doc_f,
]
train_target = [5, 5, 3, 3, 1, 1]
#Text preprocessing, tokenizing and filtering of stopwords are included in a high level 
#component that is able to build a dictionary of features and transform documents to feature vectors:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Stage 1: fit the vectorizer on the training documents, then encode those
# same documents as a document-term count matrix.
count_vect = CountVectorizer()
count_vect.fit(train_doc)
X_train_counts = count_vect.transform(train_doc)

# Stage 2: fit the tf-idf transformer on the counts and re-weight them.
# Both fitted objects are reused later to encode unseen documents.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)


In [7]:
#Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes model on the tf-idf matrix, using
# train_target as the per-document labels.
clf = MultinomialNB()
clf.fit(X_train_tfidf, train_target)

In [9]:
# Unseen documents are encoded with the already-fitted vectorizer and tf-idf
# transformer (transform only, no refitting) before being classified.
docs_new = [
    'great for use',
    'fine product',
    'poor phone',
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# One predicted label per entry of docs_new, in the same order.
predicted = clf.predict(X_new_tfidf)
print(predicted)

[5 3 1]


In [53]:
#All of the NLTK classifiers work with featstructs, which can be simple dictionaries 
#mapping a feature name to a feature value. 
#For text, we'll use a simplified bag-of-words model where every word is a feature name 
#with a value of True. 

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names
 
def word_feats(words):
    """Build the feature dictionary for a single token.

    Despite the plural parameter name, the argument is used as ONE
    dictionary key, so callers pass one word at a time.

    Args:
        words: a hashable token (e.g. a word string).

    Returns:
        A one-entry dict mapping the token to True.
    """
    return {words: True}
 
# Hand-picked seed vocabulary, one list per sentiment label.
positive_vocab = [
    'awesome', 'outstanding', 'like', 'fantastic',
    'terrific', 'good', 'nice', 'great',
]
negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':(']
neutral_vocab = [
    'movie', 'the', 'sound', 'was', 'is',
    'actors', 'did', 'know', 'words', 'not',
]

# Each vocabulary entry becomes one (featureset, label) training example.
positive_features = [(word_feats(term), 'pos') for term in positive_vocab]
negative_features = [(word_feats(term), 'neg') for term in negative_vocab]
neutral_features = [(word_feats(term), 'neu') for term in neutral_vocab]

print(positive_features)
print(negative_features)
print(neutral_features)

# Examples are concatenated in the order negative, positive, neutral.
train_set = negative_features + positive_features + neutral_features

print(train_set)

classifier = NaiveBayesClassifier.train(train_set)
 
# Predict
# Classify every token of the sentence on its own and tally the three labels.
neg = 0
pos = 0
neu = 0
sentence = "Awesome movie, I like it"
sentence = sentence.lower()
# split() without an argument collapses runs of whitespace; stripping sentence
# punctuation lets "movie," match the vocabulary entry "movie" while leaving
# emoticons such as ":(" untouched. Tokens that end up empty are dropped.
words = [token.strip('.,!?;"') for token in sentence.split()]
words = [token for token in words if token]
print(words)
for word in words:
    classResult = classifier.classify(word_feats(word))
    print(word)
    print(classResult)
    if classResult == 'neg':
        neg = neg + 1
    elif classResult == 'pos':
        pos = pos + 1
    elif classResult == 'neu':
        neu = neu + 1

# Subjectivity: share of tokens labelled neutral versus everything else.
# A sentence with no usable tokens reports 0.0 for both shares.
if words:
    neutral_share = float(neu) / len(words)
    polar_share = 1 - neutral_share
else:
    neutral_share = 0.0
    polar_share = 0.0

print("Subjectivity")
print('Neutral: ' + str(neutral_share))
print('Polar: ' + str(polar_share))

print('')

# Polarity: split of the polar tokens into positive and negative.
# When no token is polar both shares are 0.0 instead of dividing by zero.
polar_count = pos + neg
if polar_count:
    positive_share = float(pos) / polar_count
    negative_share = float(neg) / polar_count
else:
    positive_share = 0.0
    negative_share = 0.0

print("Polarity")
print('Positive: ' + str(positive_share))
print('Negative: ' + str(negative_share))


[({'awesome': True}, 'pos'), ({'outstanding': True}, 'pos'), ({'like': True}, 'pos'), ({'fantastic': True}, 'pos'), ({'terrific': True}, 'pos'), ({'good': True}, 'pos'), ({'nice': True}, 'pos'), ({'great': True}, 'pos')]
[({'bad': True}, 'neg'), ({'terrible': True}, 'neg'), ({'useless': True}, 'neg'), ({'hate': True}, 'neg'), ({':(': True}, 'neg')]
[({'movie': True}, 'neu'), ({'the': True}, 'neu'), ({'sound': True}, 'neu'), ({'was': True}, 'neu'), ({'is': True}, 'neu'), ({'actors': True}, 'neu'), ({'did': True}, 'neu'), ({'know': True}, 'neu'), ({'words': True}, 'neu'), ({'not': True}, 'neu')]
[({'bad': True}, 'neg'), ({'terrible': True}, 'neg'), ({'useless': True}, 'neg'), ({'hate': True}, 'neg'), ({':(': True}, 'neg'), ({'awesome': True}, 'pos'), ({'outstanding': True}, 'pos'), ({'like': True}, 'pos'), ({'fantastic': True}, 'pos'), ({'terrific': True}, 'pos'), ({'good': True}, 'pos'), ({'nice': True}, 'pos'), ({'great': True}, 'pos'), ({'movie': True}, 'neu'), ({'the': True}, 'neu'),

In [11]:
#Use NLTK movie review data sets for sentiment analysis
#Use 75% of the data set as the training data, and the rest 25% as the test data set.

#NLTK comes with all the pieces you need to get started on sentiment analysis: 
#a movie reviews corpus with reviews categorized into pos and neg categories, 
#and a number of trainable classifiers. 
#We'll start with a simple NaiveBayesClassifier as a baseline, using boolean word feature extraction.

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
 
def word_feats(words):
    """Turn an iterable of tokens into a boolean bag-of-words featureset.

    Args:
        words: iterable of hashable tokens; duplicates collapse to one key.

    Returns:
        A dict with one entry per distinct token, each mapped to True.
    """
    return dict.fromkeys(words, True)
 
# File ids of the reviews in each category of the movie_reviews corpus.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# Optional debugging aid: dump the raw text of every negative review.
#for f in negids:
#    print(' '.join(movie_reviews.words(fileids=[f])))

# One (featureset, label) pair per review file.
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# First three quarters of each class go to training, the rest to testing.
# Floor division keeps the cutoffs integers under true division as well,
# so they stay valid slice indices.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

# Train on the training split, then report accuracy on the held-out split.
classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()

train on 1500 instances, test on 500 instances
accuracy: 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0
