In [50]:
import pandas as pd
import numpy as np
import text_normalizer as tn
import model_evaluation_utils as meu

import matplotlib.pyplot as plt

import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from afinn import Afinn

# Show NumPy floats with two decimals and wrap printed arrays at 80 columns.
np.set_printoptions(linewidth=80, precision=2)

In [22]:
# Load the labelled movie reviews and pull both columns out as NumPy arrays.
dataset = pd.read_csv("movie_reviews.csv")
reviews, sentiments = (np.array(dataset[column]) for column in ("review", "sentiment"))

# Rows from position 35000 onwards form the test split used in this notebook.
test_reviews, test_sentiments = reviews[35000:], sentiments[35000:]

# Positions (within the test split) of the reviews that are printed in full below.
sample_reviews_ids = [726, 3533, 13010]

# normalize dataset
norm_test_reviews = tn.normalize_corpus(test_reviews)

In [23]:
# Afinn Lexicon
afn = Afinn(emoticons=True)

# Print the raw AFINN score next to the gold label for each sample review.
for sample_id in sample_reviews_ids:
    review, sentiment = test_reviews[sample_id], test_sentiments[sample_id]
    print("Review:", review)
    print("Actual Sentiment", sentiment)
    print("Predicted Sentiment Polarity", afn.score(review))
    print("-"*60)

Review: Very simply, they are all the syndicated episodes and NOT the original uncut/unedited NBC episodes. It is NOT the complete first season, all eps are edited to conform to 21:00 for syndication meaning jokes are cut, an extra commercial fade is included, all of the Harvey Korman intros are not here...very poorly done! Shame on a series I've been waiting for....booooooooooooooooo! If you're a true die hard Mama fan, don't buy this and go to http://www2.warnerbros.com/web/main/help/whv/customer_service.jsp and send them comments on why we're unhappy on this butcher job to a classic sitcom!
Actual Sentiment negative
Predicted Sentiment Polarity -2.0
------------------------------------------------------------
Review: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one.
Actual Sentiment positive
Predicted Sentiment Polarity 3.0
----------------------------------

In [24]:
# Score every test review with AFINN, then binarise the scores:
# a polarity of at least 1.0 is labelled positive, anything else negative.
sentiment_polarity = list(map(afn.score, test_reviews))
predicted_sentiments = [
    "positive" if polarity >= 1.0 else "negative"
    for polarity in sentiment_polarity
]

In [42]:
# Report how the AFINN predictions compare with the gold labels.
# NOTE(review): the recorded output of this cell ends in a TypeError about an
# unexpected 'labels' keyword right after the confusion-matrix header; that
# presumably originates inside model_evaluation_utils -- confirm and fix there.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=predicted_sentiments,
    classes=["positive", "negative"],
)

Model Performance metrics:
------------------------------
Accuracy: 0.7118
Precision: 0.7289
Recall: 0.7118
F1 Score: 0.7062

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.67      0.85      0.75      7510
    negative       0.79      0.57      0.67      7490

    accuracy                           0.71     15000
   macro avg       0.73      0.71      0.71     15000
weighted avg       0.73      0.71      0.71     15000


Prediction Confusion Matrix:
------------------------------


TypeError: __new__() got an unexpected keyword argument 'labels'

In [None]:
# SentiWordNet Lexicon
# Look up the first adjective sense of "awesome" and show its three scores.
# The reader method is `senti_synsets` (as used elsewhere in this notebook);
# the previous spelling `sent_synsets` does not exist and raised AttributeError.
awesome = list(swn.senti_synsets("awesome", "a"))[0]
print("Positive Polarity Score:", awesome.pos_score())
print("Negative Polarity Score", awesome.neg_score())
print("Objective Score:", awesome.obj_score())

In [40]:
# POS-tag fragments paired with the WordNet POS letter used for the
# SentiWordNet lookup. They are tried in this order and the first fragment
# that both occurs in the tag and yields at least one senti-synset wins.
_TAG_FRAGMENT_TO_WORDNET_POS = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))


def analyze_sentiment_sentiwordnet_lexicon(review,
                                           verbose=False):
    """Classify a review as 'positive' or 'negative' using SentiWordNet.

    The review is tokenised and POS-tagged with ``tn.nlp``. For each token
    whose tag contains NN, VB, JJ or RB, the first senti-synset for the
    matching WordNet POS (if any) contributes its positive, negative and
    objective scores. The scores are averaged over the contributing tokens.

    Parameters
    ----------
    review : str
        Text to analyse.
    verbose : bool, optional
        When True, print a one-row table with the predicted sentiment and
        the averaged objectivity, positive, negative and overall scores.

    Returns
    -------
    str
        'positive' when the averaged (positive - negative) score is >= 0,
        otherwise 'negative'. When no token can be scored (for example an
        empty review) every average is taken as 0.0, so the result is
        'positive' instead of a ZeroDivisionError.
    """
    # tokenize and POS tag text tokens
    tagged_text = [(token.text, token.tag_) for token in tn.nlp(review)]
    pos_score = neg_score = obj_score = 0.0
    token_count = 0

    # get wordnet synsets based on POS tags
    # get sentiment scores if synsets are found
    for word, tag in tagged_text:
        ss_set = None
        for fragment, wordnet_pos in _TAG_FRAGMENT_TO_WORDNET_POS:
            if fragment not in tag:
                continue
            # Materialise the lookup once and reuse it for both the
            # "anything found?" check and picking the first sense.
            candidates = list(swn.senti_synsets(word, wordnet_pos))
            if candidates:
                ss_set = candidates[0]
                break
        # if senti-synset is found
        if ss_set is not None:
            # add scores for all found synsets
            pos_score += ss_set.pos_score()
            neg_score += ss_set.neg_score()
            obj_score += ss_set.obj_score()
            token_count += 1

    def _per_token(total):
        # Average over scored tokens; 0.0 when nothing was scored so that
        # reviews without lexicon hits do not divide by zero.
        if token_count == 0:
            return 0.0
        return round(float(total) / token_count, 2)

    # aggregate final scores
    norm_final_score = _per_token(pos_score - neg_score)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'
    if verbose:
        norm_obj_score = _per_token(obj_score)
        norm_pos_score = _per_token(pos_score)
        norm_neg_score = _per_token(neg_score)
        # to display results in a nice table
        sentiment_frame = pd.DataFrame([[final_sentiment, norm_obj_score, norm_pos_score,
                                         norm_neg_score, norm_final_score]],
                                       columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'],
                                                             ['Predicted Sentiment', 'Objectivity',
                                                              'Positive', 'Negative', 'Overall']],
                                                             codes=[[0,0,0,0,0],[0,1,2,3,4]]))
        print(sentiment_frame)

    return final_sentiment

In [41]:
# Run the SentiWordNet analyser in verbose mode on each sample review.
for sample_id in sample_reviews_ids:
    review, sentiment = test_reviews[sample_id], test_sentiments[sample_id]
    print("Review:", review)
    print("Actual Sentiment:", sentiment)
    pred = analyze_sentiment_sentiwordnet_lexicon(review, verbose=True)
    print("-"*60)

Review: Very simply, they are all the syndicated episodes and NOT the original uncut/unedited NBC episodes. It is NOT the complete first season, all eps are edited to conform to 21:00 for syndication meaning jokes are cut, an extra commercial fade is included, all of the Harvey Korman intros are not here...very poorly done! Shame on a series I've been waiting for....booooooooooooooooo! If you're a true die hard Mama fan, don't buy this and go to http://www2.warnerbros.com/web/main/help/whv/customer_service.jsp and send them comments on why we're unhappy on this butcher job to a classic sitcom!
Actual Sentiment: negative
     SENTIMENT STATS:                                      
  Predicted Sentiment Objectivity Positive Negative Overall
0            negative        0.77     0.09     0.14   -0.04
------------------------------------------------------------
Review: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing

In [49]:
# Classify every normalized test review with SentiWordNet, then report metrics.
predicted_sentiments = [
    analyze_sentiment_sentiwordnet_lexicon(normalized_review, verbose=False)
    for normalized_review in norm_test_reviews
]
# NOTE(review): the recorded output of this cell ends in a TypeError about an
# unexpected 'labels' keyword; presumably raised inside model_evaluation_utils
# -- confirm and fix there.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=predicted_sentiments,
    classes=["positive", "negative"],
)

Model Performance metrics:
------------------------------
Accuracy: 0.6843
Precision: 0.687
Recall: 0.6843
F1 Score: 0.6831

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.66      0.75      0.70      7510
    negative       0.71      0.62      0.66      7490

    accuracy                           0.68     15000
   macro avg       0.69      0.68      0.68     15000
weighted avg       0.69      0.68      0.68     15000


Prediction Confusion Matrix:
------------------------------


TypeError: __new__() got an unexpected keyword argument 'labels'

In [54]:
# VADER Lexicon
# Analyzer shared by all calls; built lazily on first use so that scoring
# thousands of reviews does not construct a new analyzer for each one.
_VADER_ANALYZER = None


def _format_percentage(fraction):
    """Render a 0-1 fraction as a percentage string such as '7.0%'."""
    # Fixed-point formatting hides float artefacts like 7.000000000000001.
    return "{:.1f}%".format(round(fraction, 2) * 100)


def analyze_sentiment_vader_lexicon(review, threshold=.1, verbose=False):
    """Classify a review as 'positive' or 'negative' using VADER.

    The review is first passed through ``tn.strip_html_tags``,
    ``tn.remove_accented_chars`` and ``tn.expand_contractions``; the result
    is scored with ``SentimentIntensityAnalyzer.polarity_scores``.

    Parameters
    ----------
    review : str
        Text to analyse.
    threshold : float, optional
        Minimum compound score for the review to count as positive.
    verbose : bool, optional
        When True, print a one-row table with the predicted sentiment, the
        compound score and the positive / negative / neutral proportions.

    Returns
    -------
    str
        'positive' when the compound score is >= ``threshold``,
        otherwise 'negative'.
    """
    global _VADER_ANALYZER

    # pre-process text
    review = tn.strip_html_tags(review)
    review = tn.remove_accented_chars(review)
    review = tn.expand_contractions(review)

    # analyze the sentiment for review
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    scores = _VADER_ANALYZER.polarity_scores(review)
    # get aggregate scores and final sentiment
    agg_score = scores["compound"]
    final_sentiment = "positive" if agg_score >= threshold else "negative"

    if verbose:
        # display detailed sentiment statistics
        positive = _format_percentage(scores["pos"])
        final = round(agg_score, 2)
        negative = _format_percentage(scores["neg"])
        neutral = _format_percentage(scores["neu"])
        sentiment_frame = pd.DataFrame([[final_sentiment, final, positive, negative, neutral]],
                                       columns=pd.MultiIndex(
                                           levels=[["SENTIMENT STATS:"],
                                                   ["Predicted Sentiment", "Polarity Score", "Positive", "Negative", "Neutral"]
                                                  ], codes=[[0,0,0,0,0], [0,1,2,3,4]])
                                      )
        print(sentiment_frame)

    return final_sentiment

In [55]:
# Run the VADER analyser in verbose mode (threshold 0.4) on each sample review.
for sample_id in sample_reviews_ids:
    review, sentiment = test_reviews[sample_id], test_sentiments[sample_id]
    print("REVIEW:", review)
    print("Actual Sentiment:", sentiment)
    pred = analyze_sentiment_vader_lexicon(review, threshold=0.4, verbose=True)
    print("-"*60)

REVIEW: Very simply, they are all the syndicated episodes and NOT the original uncut/unedited NBC episodes. It is NOT the complete first season, all eps are edited to conform to 21:00 for syndication meaning jokes are cut, an extra commercial fade is included, all of the Harvey Korman intros are not here...very poorly done! Shame on a series I've been waiting for....booooooooooooooooo! If you're a true die hard Mama fan, don't buy this and go to http://www2.warnerbros.com/web/main/help/whv/customer_service.jsp and send them comments on why we're unhappy on this butcher job to a classic sitcom!
Actual Sentiment: negative
     SENTIMENT STATS:                                                    
  Predicted Sentiment Polarity Score            Positive Negative Neutral
0            negative          -0.84  7.000000000000001%    15.0%   78.0%
------------------------------------------------------------
REVIEW: I don't care if some people voted this movie to be bad. If you want the Truth thi

In [56]:
# Classify every raw test review with VADER (threshold 0.4), then report metrics.
predicted_sentiments = [
    analyze_sentiment_vader_lexicon(raw_review, threshold=0.4)
    for raw_review in test_reviews
]
# NOTE(review): the recorded output of this cell ends in a TypeError about an
# unexpected 'labels' keyword; presumably raised inside model_evaluation_utils
# -- confirm and fix there.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=predicted_sentiments,
    classes=["positive", "negative"],
)

Model Performance metrics:
------------------------------
Accuracy: 0.7108
Precision: 0.7237
Recall: 0.7108
F1 Score: 0.7065

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.67      0.83      0.74      7510
    negative       0.78      0.59      0.67      7490

    accuracy                           0.71     15000
   macro avg       0.72      0.71      0.71     15000
weighted avg       0.72      0.71      0.71     15000


Prediction Confusion Matrix:
------------------------------


TypeError: __new__() got an unexpected keyword argument 'labels'