# Sentiment Model Survey

In [40]:
import json
import pandas as pd
import itertools
import sys
import re
import string
import nltk
from collections import defaultdict
from sklearn.metrics import classification_report, accuracy_score, f1_score

## Preprocessing and Metrics Functions

In [61]:
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer

## clean up strings
def clean_str(string):
    """Normalise raw review text into lowercase, space-separated tokens.

    Steps, in order:
      1. Replace every character outside ``A-Za-z0-9(),!?'`` and the
         backtick with a space.
      2. Split the contractions ``'s 've 're 'd 'll`` off the preceding word.
      3. Surround ``, ! ( ) ?`` with spaces so they become separate tokens.
      4. Collapse a run of three or more identical characters at the very
         end of the string to a single character (e.g. "beautifullll").
      5. Collapse repeated whitespace, strip, and lowercase.

    Args:
        string: The text to clean (a ``str``).

    Returns:
        The cleaned, lowercased string.
    """
    # Anything outside the whitelist becomes a space.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Detach contractions so "it's" becomes "it 's".
    for suffix in ("s", "ve", "re", "d", "ll"):
        string = re.sub(r"\'" + suffix, " '" + suffix, string)

    # Pad punctuation with spaces. The previous replacement strings
    # (" \( ", " \) ", " \? ") leaked a literal backslash into the output
    # because re.sub leaves unknown non-letter escapes untouched.
    string = re.sub(r"([,!()?])", r" \1 ", string)

    # Only anchored at end-of-string ($): squashes trailing letter spam.
    string = re.sub(r"(.)\1\1+$", r"\1", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

## preprocess punctuation, stop words, stemming
def preprocess_stop_stem(text, punct=True, stem=False, stop=True, sent=False):
    """Tokenize ``text`` with optional punctuation, stop-word and stemming passes.

    Args:
        text: Input string.
        punct: If true, delete every character in ``string.punctuation``
            before tokenizing.
        stem: If true, run each token through NLTK's ``PorterStemmer``.
        stop: If true, lowercase the tokens and drop NLTK English stop words.
        sent: If true, return the tokens joined by single spaces instead of
            a list.

    Returns:
        A list of tokens, or a single space-joined string when ``sent`` is
        true.
    """
    if punct:
        punct_re = re.compile('[%s]' % re.escape(string.punctuation))
        text = punct_re.sub('', text)
    tokens = word_tokenize(text)
    if stop:
        # Lowercase BEFORE the membership test: the stop-word list is
        # lowercase, so filtering first let capitalised stop words such as
        # "The" slip through. A set keeps each lookup O(1), and a separate
        # name avoids clobbering the boolean ``stop`` flag.
        stop_words = set(stopwords.words('english'))
        lowered = (word.lower() for word in tokens)
        tokens = [word for word in lowered if word not in stop_words]
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
    if sent:
        tokens = ' '.join(tokens)
    return tokens

## preprocess a sentence
def preprocess_doc(doc):
    """Clean one document and return it as a stop-word-free string.

    Runs ``clean_str`` first, then ``preprocess_stop_stem`` with punctuation
    and stop-word removal enabled, stemming disabled, and the tokens joined
    back into a single space-separated string.
    """
    cleaned = clean_str(doc)
    return preprocess_stop_stem(
        cleaned,
        punct=True,
        stem=False,
        stop=True,
        sent=True,
    )

## label to sentiment
def create_senti_label(row):
    """Derive a sentiment label from a row's ``'rating'`` value.

    Returns 'pos' for ratings of 3.2 or more, 'neg' for ratings of 1.8 or
    less, and 'neu' for everything else.
    """
    rating = row['rating']
    if rating >= 3.2:
        return 'pos'
    if rating <= 1.8:
        return 'neg'
    return 'neu'

## Sentiment score
def sentiment_result(score, threshold=0.5):
    """Bucket a polarity score into 'pos', 'neu' or 'neg'.

    Args:
        score: Numeric polarity score (the callers in this notebook pass
            values produced by VADER, TextBlob and Pattern).
        threshold: Non-negative cut-off. Scores at or above ``threshold``
            are positive, scores at or below ``-threshold`` are negative,
            and scores strictly in between are neutral. The default of 0.5
            reproduces the previously hard-coded behaviour.

    Returns:
        'pos', 'neg' or 'neu'; ``None`` when ``score`` satisfies none of the
        comparisons (e.g. NaN).
    """
    if score >= threshold:
        return 'pos'
    if score <= -threshold:
        return 'neg'
    if -threshold < score < threshold:
        return 'neu'
    # Reached only for values that fail every comparison, such as NaN.
    return None


## Vader Lexicon (Rule Based Models)

We can use the "rating" column to check the accuracy of the sentiment score.

In [42]:
# Load the raw reviews from product_reviews.csv and preview the first rows.
reviewData_df = pd.read_csv("product_reviews.csv")
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId
0,4,08/15/2017,beautifullllllllllllllllllllllllllllllllllllll...,,4002178_W
1,1,05/08/2017,The actual product came out looking much diffe...,Disappointing,4002178_W
2,1,04/10/2017,These shoes look nothing like the picture! I e...,,4002178_W
3,1,02/26/2017,I ordered this shoe because i loved the displa...,color sample was way off,4002178_W
4,5,09/15/2017,They are comfortable sneakers for working out ...,awesome sneakers,4002179_W


In [43]:
# Overwrite the review text with its cleaned, stop-word-free form.
reviewData_df['reviewComments'] = reviewData_df['reviewComments'].apply(preprocess_doc)
# Derive the ground-truth label (pos / neu / neg) from each row's rating.
reviewData_df['label'] = reviewData_df.apply(create_senti_label, axis = 1)
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId,label
0,4,08/15/2017,beautiful,,4002178_W,pos
1,1,05/08/2017,actual product came looking much different onl...,Disappointing,4002178_W,neg
2,1,04/10/2017,shoes look nothing like picture expected grey ...,,4002178_W,neg
3,1,02/26/2017,ordered shoe loved displayed blush pink color ...,color sample was way off,4002178_W,neg
4,5,09/15/2017,comfortable sneakers working running,awesome sneakers,4002179_W,pos


In [44]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#### Test a few samples

In [45]:
# Hand-written sample sentences used to inspect the analyzer's raw scores.
sentences = [
                "The plot was good, but the characters are uncompelling and the dialog is not great.", 
                "A really bad, horrible book.",       
                "At least it isn't a horrible book."
            ]

In [46]:
# Build the VADER analyzer once; the scoring cell further down reuses it.
analyzer = SentimentIntensityAnalyzer()
# Left-align each sentence, dash-padded to 65 columns, followed by its scores.
row_template = "{:-<65} {}"
for sample in sentences:
    polarity = analyzer.polarity_scores(sample)
    print(row_template.format(sample, str(polarity)))

The plot was good, but the characters are uncompelling and the dialog is not great. {'neg': 0.327, 'neu': 0.579, 'pos': 0.094, 'compound': -0.7042}
A really bad, horrible book.------------------------------------- {'neg': 0.791, 'neu': 0.209, 'pos': 0.0, 'compound': -0.8211}
At least it isn't a horrible book.------------------------------- {'neg': 0.0, 'neu': 0.637, 'pos': 0.363, 'compound': 0.431}


#### Vader sentiment Scoring of all reviews

In [47]:
# Score every cleaned review with VADER and bucket the compound score into
# pos / neu / neg. (The unused `count` variable was dropped.)
senti_score = [
    sentiment_result(analyzer.polarity_scores(review)['compound'])
    for review in reviewData_df.loc[:, 'reviewComments']
]

In [48]:
# Attach the VADER-derived predictions next to the rating-derived labels.
reviewData_df['sentiment'] = senti_score
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId,label,sentiment
0,4,08/15/2017,beautiful,,4002178_W,pos,pos
1,1,05/08/2017,actual product came looking much different onl...,Disappointing,4002178_W,neg,neu
2,1,04/10/2017,shoes look nothing like picture expected grey ...,,4002178_W,neg,neg
3,1,02/26/2017,ordered shoe loved displayed blush pink color ...,color sample was way off,4002178_W,neg,neu
4,5,09/15/2017,comfortable sneakers working running,awesome sneakers,4002179_W,pos,pos


In [49]:
# Compare the rating-derived labels (truth) with VADER's predictions.
# print() as a call runs on both Python 2 and Python 3.
print(classification_report(reviewData_df['label'], reviewData_df['sentiment']))

             precision    recall  f1-score   support

        neg       0.41      0.16      0.23       146
        neu       0.21      0.40      0.28       365
        pos       0.93      0.87      0.90      3777

avg / total       0.85      0.81      0.82      4288



**The Vader sentiment does a good job on positive reviews but does poorly on negative and neutral ones.**


## TextBlob (Pattern-based Model)

Reload data

In [50]:
# Reload the raw CSV so this section starts from the unprocessed review text
# rather than the frame modified in the previous section.
reviewData_df = pd.read_csv("product_reviews.csv")
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId
0,4,08/15/2017,beautifullllllllllllllllllllllllllllllllllllll...,,4002178_W
1,1,05/08/2017,The actual product came out looking much diffe...,Disappointing,4002178_W
2,1,04/10/2017,These shoes look nothing like the picture! I e...,,4002178_W
3,1,02/26/2017,I ordered this shoe because i loved the displa...,color sample was way off,4002178_W
4,5,09/15/2017,They are comfortable sneakers for working out ...,awesome sneakers,4002179_W


In [51]:
# Same preparation as before: clean the review text, then label by rating.
reviewData_df['reviewComments'] = reviewData_df['reviewComments'].apply(preprocess_doc)
reviewData_df['label'] = reviewData_df.apply(create_senti_label, axis = 1)
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId,label
0,4,08/15/2017,beautiful,,4002178_W,pos
1,1,05/08/2017,actual product came looking much different onl...,Disappointing,4002178_W,neg
2,1,04/10/2017,shoes look nothing like picture expected grey ...,,4002178_W,neg
3,1,02/26/2017,ordered shoe loved displayed blush pink color ...,color sample was way off,4002178_W,neg
4,5,09/15/2017,comfortable sneakers working running,awesome sneakers,4002179_W,pos


In [52]:
from textblob import TextBlob

#### Test a few samples

In [53]:
# Sample sentences for a quick look at TextBlob's raw scores.
sentences = [
                "The plot was good, but the characters are uncompelling and the dialog is not great.", 
                "A really bad, horrible book.",       
                "At least it isn't a horrible book."
            ]

In [54]:
# Show TextBlob's sentiment (polarity, subjectivity) for each sample.
# print() as a call works on both Python 2 and Python 3.
for sentence in sentences:
    testimonial = TextBlob(sentence)
    print(testimonial.sentiment)

Sentiment(polarity=0.14999999999999997, subjectivity=0.675)
Sentiment(polarity=-0.8499999999999999, subjectivity=0.8333333333333333)
Sentiment(polarity=-0.65, subjectivity=0.7)


In [55]:
# Score every cleaned review with TextBlob and bucket its polarity into
# pos / neu / neg. (The unused `count` variable was dropped.)
senti_score = [
    sentiment_result(TextBlob(review).sentiment.polarity)
    for review in reviewData_df.loc[:, 'reviewComments']
]

In [56]:
# Attach the TextBlob-derived predictions next to the rating-derived labels.
reviewData_df['sentiment'] = senti_score
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId,label,sentiment
0,4,08/15/2017,beautiful,,4002178_W,pos,pos
1,1,05/08/2017,actual product came looking much different onl...,Disappointing,4002178_W,neg,neu
2,1,04/10/2017,shoes look nothing like picture expected grey ...,,4002178_W,neg,neu
3,1,02/26/2017,ordered shoe loved displayed blush pink color ...,color sample was way off,4002178_W,neg,neu
4,5,09/15/2017,comfortable sneakers working running,awesome sneakers,4002179_W,pos,neu


In [57]:
# Compare the rating-derived labels (truth) with TextBlob's predictions.
# print() as a call runs on both Python 2 and Python 3.
print(classification_report(reviewData_df['label'], reviewData_df['sentiment']))

             precision    recall  f1-score   support

        neg       0.40      0.04      0.07       146
        neu       0.11      0.92      0.20       365
        pos       0.97      0.32      0.48      3777

avg / total       0.88      0.36      0.44      4288



**The TextBlob sentiment did worse than Vader, especially on negative and neutral reviews.**

## Pattern Model

Reload data

In [63]:
# Reload the raw CSV again so the Pattern section starts from unprocessed text.
reviewData_df = pd.read_csv("product_reviews.csv")
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId
0,4,08/15/2017,beautifullllllllllllllllllllllllllllllllllllll...,,4002178_W
1,1,05/08/2017,The actual product came out looking much diffe...,Disappointing,4002178_W
2,1,04/10/2017,These shoes look nothing like the picture! I e...,,4002178_W
3,1,02/26/2017,I ordered this shoe because i loved the displa...,color sample was way off,4002178_W
4,5,09/15/2017,They are comfortable sneakers for working out ...,awesome sneakers,4002179_W


In [64]:
# Clean the review text and derive the rating-based label for this section.
reviewData_df['reviewComments'] = reviewData_df['reviewComments'].apply(preprocess_doc)
reviewData_df['label'] = reviewData_df.apply(create_senti_label, axis = 1)
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId,label
0,4,08/15/2017,beautiful,,4002178_W,pos
1,1,05/08/2017,actual product came looking much different onl...,Disappointing,4002178_W,neg
2,1,04/10/2017,shoes look nothing like picture expected grey ...,,4002178_W,neg
3,1,02/26/2017,ordered shoe loved displayed blush pink color ...,color sample was way off,4002178_W,neg
4,5,09/15/2017,comfortable sneakers working running,awesome sneakers,4002179_W,pos


In [65]:
from pattern.en import sentiment

In [66]:
# Sample sentences for a quick look at Pattern's raw scores.
sentences = [
                "The plot was good, but the characters are uncompelling and the dialog is not great.", 
                "A really bad, horrible book.",       
                "At least it isn't a horrible book."
            ]

In [67]:
# Show Pattern's sentiment result for each sample sentence.
# print() as a call works on both Python 2 and Python 3.
for sentence in sentences:
    pattern_senti = sentiment(sentence)
    print(pattern_senti)

(0.14999999999999997, 0.675)
(-0.8499999999999999, 0.8333333333333333)
(0.1, 0.7)


In [68]:
# Score every cleaned review with Pattern and bucket the first element of its
# result into pos / neu / neg. (The unused `count` variable was dropped.)
senti_score = [
    sentiment_result(sentiment(review)[0])
    for review in reviewData_df.loc[:, 'reviewComments']
]

In [69]:
# Attach the Pattern-derived predictions next to the rating-derived labels.
reviewData_df['sentiment'] = senti_score
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId,label,sentiment
0,4,08/15/2017,beautiful,,4002178_W,pos,pos
1,1,05/08/2017,actual product came looking much different onl...,Disappointing,4002178_W,neg,neu
2,1,04/10/2017,shoes look nothing like picture expected grey ...,,4002178_W,neg,neu
3,1,02/26/2017,ordered shoe loved displayed blush pink color ...,color sample was way off,4002178_W,neg,neu
4,5,09/15/2017,comfortable sneakers working running,awesome sneakers,4002179_W,pos,neu


In [70]:
# Compare the rating-derived labels (truth) with Pattern's predictions.
# print() as a call runs on both Python 2 and Python 3.
print(classification_report(reviewData_df['label'], reviewData_df['sentiment']))

             precision    recall  f1-score   support

        neg       0.40      0.04      0.07       146
        neu       0.11      0.92      0.20       365
        pos       0.97      0.32      0.48      3777

avg / total       0.88      0.36      0.44      4288



## Additional Resource

https://github.com/laugustyniak/awesome-sentiment-analysis

### Options:

* Use APIs: there are many sentiment APIs available, but they are not free.
* Train a sentiment model: the ideal choice, but it will take time.