In [1]:
import nltk
import numpy as np

In [2]:
## must download vader_lexicon for vader sentiment algorithm
nltk.download_shell()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessment of Information
                           Extraction Systems in Biology)
  [ ] bllip_wsj_no_aux.... BLLIP Parser: WSJ Model
  [ ] book_grammars....... Grammars from NLTK Book
  [ ] brown............... Brown Corpus
  [ ] brown_tei........... Brown Corpus (TEI XML Version)
  [ ] cess_cat............ CESS-CAT Treebank
  [ ] cess_esp............ CESS-ESP Treebank
  [ ] chat80......

## Data Loading

Load clean, processed data

In [3]:
## array of strings, 1 x n 
descriptions = ["this wine is nice","that tastes like utter trash"]

## array of ints, 1 x n
prices = [100,10]

## Sentiment Analysis

Sentiment analysis algorithms. Each takes a single string as input.

In [9]:
def createDict():
    sentiment_dictionary = {}
    for line in open('afinn_dict.txt'):
        word,score = line.split('\t')
        sentiment_dictionary[word] = int(score)
    return sentiment_dictionary
sentiment_dictionary = createDict()

def sentimentAfinn(sentence):
    '''
    AFINN is a dictionary of polarity scores [-5,5] by word.
    This algorithm sums the scores for each word in the sentence, then
    classifies the entire sentence based on the sum's sign.
    '''
    sentence_tokens = sentence.split(' ')
    score = 0
    for token in sentence_tokens:
        score += sentiment_dictionary.get(token,0)
    ## if we want raw integer, not scaled
    return score
    ## if we want it to be scaled between -1 and 1
    # return np.sign(score)

In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentim_int_analyzer = SentimentIntensityAnalyzer()
def sentimentVader(sentence):    
    '''
    Sentiment object contains { 'pos', 'neg', 'neu', 'compound' } where pos+neg+neu=1, compound is [-1,1].
    This algorithm returns the 'pos', 'neu', 'neg' values
    '''
    ss = sentim_int_analyzer.polarity_scores(sentence)
    return [ss['pos'], ss['neu'], ss['neg']]



In [None]:
from textblob import TextBlob
def textBlob(sentence):
    '''
    Sentiment object being calculated in API contains {
            subjectivity: [0,1], 
            polarity: [-1,1] }
    This algorithm maps the polarity score to separate pos, neu, neg values.
    If polarity is positive, the 'neg' value will be 0 and the pos and neu values will reflect magnitude of polarity;
    same idea used for if polarity is negative.
    '''
    textObj = TextBlob(sentence)
    score = textObj.sentiment.polarity
    if score > 0:
        return [score,1-score,0.]
    elif score < 0:
        return [0.,1+score,-score]
    else:
        return [0.,1.,0.]

Create sentiment feature vector.

In [10]:
sentiments_afinn = list(map(sentimentAfinn,descriptions))
sentiments_vader = list(map(sentimentVader,descriptions))

In [11]:
print(sentiments_afinn)
print(sentiments_vader)

[3, 2]
[[0.483, 0.517, 0.0], [0.385, 0.615, 0.0]]


## Neural Network