In [7]:
# import library
!pip install nltk
import nltk
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import re

# Fetch the NLTK data packages (tokenizer models, POS taggers, chunker,
# corpora and the VADER lexicon) in a fixed order.
NLTK_RESOURCES = (
    'punkt_tab',
    'punkt',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'stopwords',
    'words',
    'wordnet',
    'omw-1.4',
    'vader_lexicon',
)
for resource_name in NLTK_RESOURCES:
    download_ok = nltk.download(resource_name)
# Bare last expression: the cell echoes the status of the final download.
download_ok

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
[0m

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk

True

In [8]:
import warnings as wrn
# Silence all warnings so they do not clutter the cell outputs.
wrn.filterwarnings('ignore')

# Fixed seed so the shuffle below (and every slice printed from the shuffled
# data in later cells) is identical on each Restart & Run All.
SHUFFLE_SEED = 42

# read data from CSV file
df = pd.read_csv('../data/raw/dataTotal.csv')

# Print the row at index 1 before and after shuffling as a quick visual check
# that the row order changed.
print("Before:" + df["review"][1] + " ==> " + df["sentiment"][1])
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
print("After:" + df["review"][1] + " ==> " + df["sentiment"][1])

# Plain Python lists of review texts and their labels, aligned by position.
reviews = df["review"].tolist()
sentiments = df["sentiment"].tolist()

Before:This movie could have been better. ==> neutral
After:The Silver Surfer is a dude, so no. The actors don't look well cast for this. It may be great, but I'm left unenthusiastic. ==> negative


In [9]:
# create pipeline for processing data
# import word_tokenize for tokenization and pos_tag for part-of-speech tagging
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def preprocess_data(review):
    """Clean a raw review and return it as a space-joined string of tokens.

    Steps, in order: remove links, replace every character that is not an
    ASCII letter or digit with a space, tokenize with ``word_tokenize`` and
    join the tokens with single spaces.

    Args:
        review: Raw review text.

    Returns:
        The cleaned review as a single string.
    """
    # Cleaning links
    text = re.sub(r'http\S+', '', review)
    # Cleaning everything except alphabetical and numerical characters.
    # This must operate on ``text`` (the link-free string), not on ``review``;
    # otherwise the link removal above is thrown away and URL fragments such
    # as "http" and "com" end up in the output.
    text = re.sub("[^a-zA-Z0-9]", " ", text)

    # Tokenization
    tokens = word_tokenize(text)
    # Part-of-speech filtering (keeping only adjectives JJ, adverbs RB,
    # verbs VB and nouns NN via pos_tag) is currently disabled, so every
    # token is kept.
    return " ".join(tokens)

def _get_vader_analyzer():
    """Return a shared SentimentIntensityAnalyzer, creating it on first use."""
    analyzer = getattr(_get_vader_analyzer, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        # Cache on the function object so later calls reuse the same instance.
        _get_vader_analyzer._analyzer = analyzer
    return analyzer


def sentiment_vader_lexicon(review, threshold=0.0, verbose=False, neutral_margin=0.1):
    """Classify a review as 'positive', 'neutral' or 'negative' using VADER.

    The decision is based on the ``compound`` value returned by
    ``polarity_scores``:

    * ``compound > threshold + neutral_margin`` -> 'positive'
    * ``compound < threshold``                  -> 'negative'
    * anything in between                       -> 'neutral'

    Args:
        review: Text to score.
        threshold: Scores strictly below this value are labelled negative.
        verbose: When True, print a one-row table with the predicted label
            and the compound score rounded to two decimals.
        neutral_margin: Width of the neutral band above ``threshold``. The
            default of 0.1 reproduces the previously hard-coded margin.

    Returns:
        The predicted label as a string.
    """
    # Reuse one analyzer across calls instead of constructing a new one for
    # every review.
    analyzer = _get_vader_analyzer()
    scores = analyzer.polarity_scores(review)
    # get aggregate scores and final sentiment
    agg_score = scores['compound']
    final_sentiment = 'neutral'
    if agg_score > threshold + neutral_margin:
        final_sentiment = 'positive'
    elif agg_score < threshold:
        final_sentiment = 'negative'

    if verbose:
        # display detailed sentiment statistics
        final = round(agg_score, 2)
        sentiment_frame = pd.DataFrame([[final_sentiment, final]],
                                        columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'],
                                                                      ['Predicted Sentiment', 'Score']],
                                                              codes=[[0,0],[0,1]]))

        print(sentiment_frame)

    return final_sentiment

In [10]:
# Clean every review once; the cleaned texts feed both the preview loop and
# the full prediction pass below.
preprocessed_text = list(map(preprocess_data, reviews))

# Preview: show the labelled sentiment next to VADER's verdict for the
# reviews at positions 300-399.
preview_rows = slice(300, 400)
separator = '-' * 60
for review, sentiment in zip(preprocessed_text[preview_rows], sentiments[preview_rows]):
    print(f'Review: {review}')
    print(f'Sentiment: {sentiment}')
    pred = sentiment_vader_lexicon(review, threshold=0.0, verbose=True)
    print(separator)

# Predict a label for every cleaned review without printing details.
predicted_sentiments = [
    sentiment_vader_lexicon(cleaned, threshold=0.0, verbose=False)
    for cleaned in preprocessed_text
]

Review: God this movie was disappointing first go around
Sentiment: negative
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            negative -0.27
------------------------------------------------------------
Review: Not bad MCU are still not done yet
Sentiment: neutral
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            positive  0.43
------------------------------------------------------------
Review: I can see why this did so bad looks awful
Sentiment: negative
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            negative -0.83
------------------------------------------------------------
Review: This movie is absolutely boring It s what to expect when you re expecting a cosmic baby and an advertisement for ABC They should ve left the Red Ghost in it and made it a show on Disney plus The only reason why I didn t fall asleep in the theater is because I think the employees were looking for someone and came in several times to try to find th

In [11]:
# Per-class precision / recall / F1 of the VADER predictions against the
# labelled sentiments, computed over every review.
report_text = classification_report(sentiments, predicted_sentiments)
print(report_text)

              precision    recall  f1-score   support

    negative       0.65      0.65      0.65       341
     neutral       0.53      0.28      0.37       301
    positive       0.55      0.79      0.65       322

    accuracy                           0.58       964
   macro avg       0.58      0.57      0.55       964
weighted avg       0.58      0.58      0.56       964



In [12]:
# Predict sentiment for a few hand-written reviews as a smoke test.
# These live in their own variable: reusing the name ``reviews`` would
# overwrite the dataset list built when the CSV was loaded, and re-running the
# prediction/report cells afterwards would then score only these few strings.
sample_reviews = [
    "this seem to be ok, not so intersting",
    "no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!",
    "I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one",
    "Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! P.s watch the carrot",
]

for new_review in sample_reviews:
    new_preprocessed = preprocess_data(new_review)

    print('Review:', new_review)
    # verbose=True prints the predicted label and score; the return value is
    # not needed here.
    sentiment_vader_lexicon(new_preprocessed, threshold=0.0, verbose=True)
    print('-'*60)

Review: this seem to be ok, not so intersting
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            positive   0.3
------------------------------------------------------------
Review: no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            negative -0.87
------------------------------------------------------------
Review: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            positive  0.67
------------------------------------------------------------
Review: Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! P.s watch the carrot
     SENTIMENT STATS:      
  Predicted Senti