#### Running Vader on a set of IMDB reviews

*note:* please run the following cell with all the needed imports

In [8]:
import sklearn
import json
import numpy
import nltk
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
import pandas as pd
import pathlib
from sklearn.datasets import load_files
import spacy
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

vader_model = SentimentIntensityAnalyzer()
nlp = spacy.load('en_core_web_sm') # 'en_core_web_sm'


In [9]:
def vader_output_to_label(vader_output):
    """
    map vader output e.g.,
    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}
    to one of the following values:
    a) positive float -> 'positive'
    b) 0.0 -> 'neutral'
    c) negative float -> 'negative'
    
    :param dict vader_output: output dict from vader
    
    :rtype: str
    :return: 'negative' | 'neutral' | 'positive'
    """
    compound = vader_output['compound']
    
    if compound < 0:
        return 'negative'
    elif compound == 0.0:
        return 'neutral'
    elif compound > 0.0:
        return 'positive'
    
assert vader_output_to_label( {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.0}) == 'neutral'
assert vader_output_to_label( {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.01}) == 'positive'
assert vader_output_to_label( {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': -0.01}) == 'negative'

#### Preparation of data:

In [10]:
#get imdbreviews folder path
cwd = pathlib.Path.cwd()
imdb_reviews_folder = cwd.joinpath('IMDB-reviews-Stanford')

#load files from the path
imdb_reviews = load_files(str(imdb_reviews_folder))

* Arrange data in a dataframe for easier handling

In [11]:
# get all texts and labels into a uniform dataframe from the folder

correct_labels = pd.DataFrame()
texts = [doc.decode('utf-8') for doc in imdb_reviews.data]
labels = imdb_reviews.target
label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
string_labels = [label_mapping[label] for label in labels]

correct_labels['text'] = texts
correct_labels['label'] = string_labels

correct_labels

Unnamed: 0,text,label
0,"Zero Day leads you to think, even re-think why...",neutral
1,Words can't describe how bad this movie is. I ...,negative
2,Everyone plays their part pretty well in this ...,neutral
3,There are a lot of highly talented filmmakers/...,negative
4,I've just had the evidence that confirmed my s...,negative
...,...,...
24995,089: Footlight Parade (1933) - released 9/30/1...,neutral
24996,Deeply humorous yet honest comedy about a bunc...,neutral
24997,1st watched 2/28/2006 - 4 out of 10(Dir-Sydney...,negative
24998,I watch lots of scary movies (or at least they...,negative


* Define function so we can run vader in different settings

In [12]:
# define function to run vader in different settings

def run_vader(textual_unit, 
              lemmatize=False, 
              parts_of_speech_to_consider=None,
              verbose=0):

    doc = nlp(textual_unit)
        
    input_to_vader = []

    for sent in doc.sents:
        for token in sent:

            to_add = token.text

            if lemmatize:
                to_add = token.lemma_

                if to_add == '-PRON-': 
                    to_add = token.text

            if parts_of_speech_to_consider:
                if token.pos_ in parts_of_speech_to_consider:
                    input_to_vader.append(to_add) 
            else:
                input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))
    
    if verbose >= 1:
        print()
        print('INPUT SENTENCE', sent)
        print('INPUT TO VADER', input_to_vader)
        print('VADER OUTPUT', scores)

    return scores

* Run VADER (as it is) on the set of imdb reveiews

In [14]:
just_vader = pd.DataFrame()

sentences = []
labels = []

for sent in imdb_reviews.data:
    sent = sent.decode('utf-8')
    vader_output = vader_model.polarity_scores(sent)
    sentences.append(sent)
    labels.append(vader_output_to_label(vader_output))

just_vader['text'] = sentences
just_vader['label'] = labels

report = classification_report(correct_labels['label'], just_vader['label'], target_names=['positive', 'negative', 'neutral'])
print(report)
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    positive       0.78      0.54      0.64     12500
    negative       0.45      0.00      0.00     12500
     neutral       0.00      0.00      0.00         0

    accuracy                           0.27     25000
   macro avg       0.41      0.18      0.21     25000
weighted avg       0.62      0.27      0.32     25000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


* Run VADER on the set of imdb reviews after having lemmatized the text

In [15]:
lemmatized = pd.DataFrame()
sentences = []
labels = []

for sent in imdb_reviews.data:
    sent = sent.decode('utf-8')
    vader_output = run_vader(sent, lemmatize=True)
    sentences.append(sent)
    labels.append(vader_output_to_label(vader_output))

lemmatized['text'] = sentences
lemmatized['label'] = labels

report = classification_report(correct_labels['label'], lemmatized['label'], target_names=['positive', 'negative', 'neutral'])
print(report)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    positive       0.78      0.55      0.64     12500
    negative       0.55      0.00      0.00     12500
     neutral       0.00      0.00      0.00         0

    accuracy                           0.27     25000
   macro avg       0.44      0.18      0.21     25000
weighted avg       0.66      0.27      0.32     25000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


* Run VADER on the set of imdb reviews with only adjectives

In [16]:
adjectives = pd.DataFrame()
sentences = []
labels = []

for sent in imdb_reviews.data:
    sent = sent.decode('utf-8')
    vader_output = run_vader(sent, parts_of_speech_to_consider={'ADJ'})
    sentences.append(sent)
    labels.append(vader_output_to_label(vader_output))

adjectives['text'] = sentences
adjectives['label'] = labels

report = classification_report(correct_labels['label'], adjectives['label'], target_names=['positive', 'negative', 'neutral'])
print(report)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    positive       0.82      0.50      0.62     12500
    negative       0.49      0.01      0.03     12500
     neutral       0.00      0.00      0.00         0

    accuracy                           0.26     25000
   macro avg       0.44      0.17      0.22     25000
weighted avg       0.65      0.26      0.32     25000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


* Run VADER on the set of imdb reviews with only nouns

In [17]:
nouns = pd.DataFrame()
sentences = []
labels = []

for sent in imdb_reviews.data:
    sent = sent.decode('utf-8')
    vader_output = run_vader(sent, parts_of_speech_to_consider={'NOUN'})
    sentences.append(sent)
    labels.append(vader_output_to_label(vader_output))

nouns['text'] = sentences
nouns['label'] = labels

report = classification_report(correct_labels['label'], nouns['label'], target_names=['positive', 'negative', 'neutral'])
print(report)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    positive       0.60      0.49      0.54     12500
    negative       0.51      0.08      0.14     12500
     neutral       0.00      0.00      0.00         0

    accuracy                           0.29     25000
   macro avg       0.37      0.19      0.23     25000
weighted avg       0.55      0.29      0.34     25000



* Run VADER on the set of imdb reviews with only verbs

In [18]:
verbs = pd.DataFrame()
sentences = []
labels = []

for sent in imdb_reviews.data:
    sent = sent.decode('utf-8')
    vader_output = run_vader(sent, parts_of_speech_to_consider={'VERB'})
    sentences.append(sent)
    labels.append(vader_output_to_label(vader_output))

verbs['text'] = sentences
verbs['label'] = labels

report = classification_report(correct_labels['label'], verbs['label'], target_names=['positive', 'negative', 'neutral'])
print(report)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    positive       0.62      0.48      0.54     12500
    negative       0.51      0.08      0.14     12500
     neutral       0.00      0.00      0.00         0

    accuracy                           0.28     25000
   macro avg       0.38      0.18      0.23     25000
weighted avg       0.57      0.28      0.34     25000

