# SENTIMENT ANALYSIS

(via [these docs](http://www.nltk.org/howto/sentiment.html))  |  10-06-19

### STEP 1: Import ALL the things

In [1]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

### STEP 2: Borrow subjective (`subj`) and objective (`obj`) sentences from NLTK's `subjectivity` corpus for practice

In [2]:
# Take the first n_instances tokenized sentences of each category and pair
# every sentence with its class label.
n_instances = 100
subj_docs = [
    (tokens, 'subj')
    for tokens in subjectivity.sents(categories='subj')[:n_instances]
]
obj_docs = [
    (tokens, 'obj')
    for tokens in subjectivity.sents(categories='obj')[:n_instances]
]

### STEP 3: Create `test` and `train` for both `subj` and `obj`

In [3]:
# Hold out the last fifth of each class for testing. The cut-off is derived
# from the list length (integer arithmetic, so 100 docs -> 80 train / 20 test),
# which keeps the 80/20 split intact if n_instances is changed.
subj_split = len(subj_docs) * 4 // 5
obj_split = len(obj_docs) * 4 // 5
train_subj_docs, test_subj_docs = subj_docs[:subj_split], subj_docs[subj_split:]
train_obj_docs, test_obj_docs = obj_docs[:obj_split], obj_docs[obj_split:]

### STEP 4: Combine the two `test` and `train` sets 

In [4]:
# Merge the per-class splits: subjective docs first, then objective ones.
training_docs = [*train_subj_docs, *train_obj_docs]
testing_docs = [*test_subj_docs, *test_obj_docs]

### STEP 5: Mark negation in the training docs with `mark_negation`, then collect every word with `SentimentAnalyzer.all_words`

In [5]:
# mark_negation tags the tokens that follow a negation word with a '_NEG'
# suffix; all_words then gathers the tokens of every marked training doc
# into one flat list.
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words(list(map(mark_negation, training_docs)))

In [6]:
# Peek at the first few labelled documents only; displaying the whole list
# floods the notebook with one token per line.
training_docs[:4]

[(['smart',
   'and',
   'alert',
   ',',
   'thirteen',
   'conversations',
   'about',
   'one',
   'thing',
   'is',
   'a',
   'small',
   'gem',
   '.'],
  'subj'),
 (['color',
   ',',
   'musical',
   'bounce',
   'and',
   'warm',
   'seas',
   'lapping',
   'on',
   'island',
   'shores',
   '.',
   'and',
   'just',
   'enough',
   'science',
   'to',
   'send',
   'you',
   'home',
   'thinking',
   '.'],
  'subj'),
 (['it',
   'is',
   'not',
   'a',
   'mass-market',
   'entertainment',
   'but',
   'an',
   'uncompromising',
   'attempt',
   'by',
   'one',
   'artist',
   'to',
   'think',
   'about',
   'another',
   '.'],
  'subj'),
 (['a',
   'light-hearted',
   'french',
   'film',
   'about',
   'the',
   'spiritual',
   'quest',
   'of',
   'a',
   'fashion',
   'model',
   'seeking',
   'peace',
   'of',
   'mind',
   'while',
   'in',
   'a',
   'love',
   'affair',
   'with',
   'a',
   'veterinarian',
   'who',
   'is',
   'a',
   'non-practicing',
   'jew',
   

In [7]:
# This window of the flat token list includes the negation word 'not' and the
# '_NEG'-tagged tokens that come after it.
all_words_neg[25:45]

['.',
 'and',
 'just',
 'enough',
 'science',
 'to',
 'send',
 'you',
 'home',
 'thinking',
 '.',
 'it',
 'is',
 'not',
 'a_NEG',
 'mass-market_NEG',
 'entertainment_NEG',
 'but_NEG',
 'an_NEG',
 'uncompromising_NEG']

#### Note how `mark_negation` appends `_NEG` to the words that follow a negation word (here, `not`)
Negation marking is one of many preprocessing techniques that can help a classifier pick up on sentiment.

### STEP 6: Use `unigram_word_feats` to get unigram features

In [8]:
# Select the unigram features from the negation-marked vocabulary, filtered by
# the min_freq=4 frequency threshold, then report how many were kept.
unigram_feats = sentim_analyzer.unigram_word_feats(
    all_words_neg,
    min_freq=4,
)
len(unigram_feats)

83

### STEP 7: Use `add_feat_extractor` to get a feature-value representation of our data
#### Apply it to both `training_docs` and `testing_docs` to build `training_set` and `test_set`

In [62]:
# Register extract_unigram_feats with the analyzer, passing the selected
# unigrams as its keyword argument.
# NOTE(review): re-running this cell presumably registers the extractor a
# second time — restart the kernel before re-running; confirm against NLTK.
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [63]:
# Run the registered feature extractor over the training docs and show the
# first entry; its feature dict maps 'contains(<word>)' keys to True/False.
training_set = sentim_analyzer.apply_features(training_docs)
training_set[:1]

[({'contains(.)': True, 'contains(the)': False, 'contains(,)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': False, 'contains(to)': False, 'contains(is)': True, 'contains(in)': False, 'contains(with)': False, 'contains(it)': False, 'contains(that)': False, 'contains(his)': False, 'contains(on)': False, 'contains(for)': False, 'contains(an)': False, 'contains(who)': False, 'contains(by)': False, 'contains(he)': False, 'contains(from)': False, 'contains(her)': False, 'contains(")': False, 'contains(film)': False, 'contains(as)': False, 'contains(this)': False, 'contains(movie)': False, 'contains(their)': False, 'contains(but)': False, 'contains(one)': True, 'contains(at)': False, 'contains(about)': True, 'contains(the_NEG)': False, 'contains(a_NEG)': False, 'contains(to_NEG)': False, 'contains(are)': False, "contains(there's)": False, 'contains(()': False, 'contains(story)': False, 'contains(when)': False, 'contains(so)': False, 'contains(be)': False, 'contains(,_NEG)

In [64]:
# Same feature extraction for the held-out docs; show the first entry, whose
# feature dict maps 'contains(<word>)' keys to True/False.
test_set = sentim_analyzer.apply_features(testing_docs)
test_set[:1]

[({'contains(.)': True, 'contains(the)': True, 'contains(,)': False, 'contains(a)': True, 'contains(and)': False, 'contains(of)': True, 'contains(to)': False, 'contains(is)': False, 'contains(in)': False, 'contains(with)': True, 'contains(it)': False, 'contains(that)': False, 'contains(his)': False, 'contains(on)': False, 'contains(for)': True, 'contains(an)': False, 'contains(who)': False, 'contains(by)': False, 'contains(he)': False, 'contains(from)': False, 'contains(her)': False, 'contains(")': False, 'contains(film)': False, 'contains(as)': False, 'contains(this)': False, 'contains(movie)': False, 'contains(their)': False, 'contains(but)': False, 'contains(one)': False, 'contains(at)': False, 'contains(about)': False, 'contains(the_NEG)': False, 'contains(a_NEG)': False, 'contains(to_NEG)': False, 'contains(are)': False, "contains(there's)": False, 'contains(()': False, 'contains(story)': False, 'contains(when)': False, 'contains(so)': False, 'contains(be)': False, 'contains(,_NEG

### STEP 8: FINAL STEP!! We train a Naive Bayes classifier on the training set and FINALLY evaluate it on the test set!

In [65]:
# The training function itself (not a call to it) is handed to the analyzer
# together with the training set; the result is kept as `classifier`.
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

Training classifier


In [66]:
# Evaluate once on the held-out set, then print every metric in alphabetical
# order of its name, one "name: value" pair per line.
evaluation = sentim_analyzer.evaluate(test_set)
for metric in sorted(evaluation):
    print('{0}: {1}'.format(metric, evaluation[metric]))

Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [obj]: 0.8
F-measure [subj]: 0.8
Precision [obj]: 0.8
Precision [subj]: 0.8
Recall [obj]: 0.8
Recall [subj]: 0.8
