In [35]:
# import library
!pip install nltk
import nltk
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import re
import warnings as wrn
# import word tokenizer for tokenization process and pos_tag for speech tagging 
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords


# Download the NLTK data packages listed below, in this order.
for nltk_resource in (
    'punkt_tab',
    'punkt',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'stopwords',
    'words',
    'wordnet',
    'omw-1.4',
):
    nltk.download(nltk_resource)

# Kept as the last bare expression so the cell still displays its return value.
nltk.download('vader_lexicon')

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
[0m

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk

True

In [50]:
# Suppress every Python warning from this point on.
wrn.filterwarnings('ignore')
# read data from CSV file
df = pd.read_csv('../data/raw/dataTotal.csv')

# Show the row labelled 1 before and after shuffling as a quick sanity check.
print("Before:" + df["review"][1] + " ==> " + df["sentiment"][1])
# Shuffle all rows. The fixed random_state makes the shuffled order (and the
# "After" line below) identical on every Restart & Run All; 42 matches the
# seed used for the train/test split.
df_random = df.sample(frac=1, random_state=42).reset_index(drop=True)
print("After:" + df_random["review"][1] + " ==> " + df_random["sentiment"][1])

# Plain Python lists of the shuffled review texts and their labels.
reviews = df_random["review"].tolist()
sentiments = df_random["sentiment"].tolist()

Before:This movie could have been better. ==> neutral
After:This movie was very enjoyable! My grandmother and I loved Brave New World so much! We loved the action, the direction, and even can't get enough with Harrison Ford too! It was marvelous! ==> positive


In [51]:
# Select the rows of each sentiment class from the unshuffled frame `df`.
positives = df.loc[df["sentiment"] == "positive"]
negatives = df.loc[df["sentiment"] == "negative"]
neutrals = df.loc[df["sentiment"] == "neutral"]

# Stack the three groups (positive, neutral, negative) row-wise and renumber
# the rows; since `drop` is not set, the previous index is kept as an
# "index" column.
data = pd.concat([positives, neutrals, negatives], axis=0).reset_index()

# Hold out 20% of the rows for testing and keep 80% for training.
X_train, X_test, y_train, y_test = train_test_split(
    data["review"],
    data["sentiment"],
    test_size=0.2,
    random_state=42,
)
print(y_test)

626     neutral
629     neutral
847    negative
514     neutral
365     neutral
         ...   
208    positive
777    negative
334     neutral
210    positive
350     neutral
Name: sentiment, Length: 201, dtype: object


In [52]:
def preprocess_data(review):
    """Clean a raw review and return it as a space-separated token string.

    Steps, in order: remove links, replace every character that is not an
    ASCII letter or digit with a space, tokenize with ``word_tokenize`` and
    drop tokens found in NLTK's English stop-word list.

    Args:
        review: Raw review text (str).

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    # Stop words are the predefined NLTK list for English.
    stop_words = set(stopwords.words('english'))

    # Cleaning links: remove every run of non-whitespace starting with "http".
    text = re.sub(r'http\S+', '', review)

    # Cleaning everything except alphabetic and numerical characters.
    # This must run on `text` (not `review`); otherwise the link removal
    # above is thrown away and URL fragments leak into the tokens.
    text = re.sub("[^a-zA-Z0-9]", " ", text)

    # Tokenization
    token = word_tokenize(text)

    # Keep only tokens that are not stop words.
    # NOTE(review): the membership test is case-sensitive, so a capitalised
    # token such as "This" is kept unless the list contains that exact
    # spelling - confirm whether that is intended.
    filtered_text = [w for w in token if w not in stop_words]

    # Join the remaining tokens into one string separated by single spaces.
    return " ".join(filtered_text)

def sentiment_vader_lexicon(review, threshold=0.0, verbose=False, margin=0.1):
    """Classify a review as 'positive', 'negative' or 'neutral' with VADER.

    The VADER compound score is compared against a dead zone centred on
    ``threshold``: scores strictly above ``threshold + margin`` are
    'positive', scores strictly below ``threshold - margin`` are 'negative',
    everything else is 'neutral'.

    Args:
        review: Text to score (str).
        threshold: Centre of the neutral band.
        verbose: When True, print a one-row table with the predicted label
            and the compound score rounded to two decimals.
        margin: Half-width of the neutral band. Defaults to 0.1, the value
            that was previously hard-coded.

    Returns:
        str: 'positive', 'negative' or 'neutral'.
    """
    # Build the analyzer once and reuse it: constructing a
    # SentimentIntensityAnalyzer on every call repeats its setup work for
    # each of the (potentially thousands of) reviews being scored.
    analyzer = getattr(sentiment_vader_lexicon, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_vader_lexicon._analyzer = analyzer

    scores = analyzer.polarity_scores(review)
    # get aggregate scores and final sentiment
    agg_score = scores['compound']
    final_sentiment = 'neutral'
    if agg_score > threshold + margin:
        final_sentiment = 'positive'
    elif agg_score < threshold - margin:
        final_sentiment = 'negative'

    if verbose:
        # display detailed sentiment statistics
        final = round(agg_score, 2)
        sentiment_frame = pd.DataFrame([[final_sentiment, final]],
                                        columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'],
                                                                      ['Predicted Sentiment', 'Score']],
                                                              codes=[[0,0],[0,1]]))
        print(sentiment_frame)
    return final_sentiment

In [53]:
# Clean every entry of `reviews` with preprocess_data.
preprocessed_text_full = list(map(preprocess_data, reviews))

# Classify each cleaned review with the VADER-based classifier (no printing).
predicted_sentiments_full = [
    sentiment_vader_lexicon(cleaned_review, threshold=0.0, verbose=False)
    for cleaned_review in preprocessed_text_full
]

In [54]:
# Preprocessing test set
preprocessed_text_test = [preprocess_data(text) for text in X_test]

# Print the true label and the verbose VADER result for every test review,
# collecting the predictions in the same pass so each review is scored once.
# The loop iterates `preprocessed_text_test` (defined just above) rather than
# a name that only exists through leftover kernel state.
predicted_sentiments_test = []
for review, sentiment in zip(preprocessed_text_test, y_test):
    print('Review:', review)
    print('Sentiment:', sentiment)
    pred = sentiment_vader_lexicon(review, threshold=0.0, verbose=True)
    predicted_sentiments_test.append(pred)
    print('-'*60)

Review: 5 10 almost good ok
Sentiment: neutral
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            positive  0.55
------------------------------------------------------------
Review: 4 9 10 IMDB 36 Rotten Tomatoes user score
Sentiment: neutral
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            negative -0.51
------------------------------------------------------------
Review: movie bad surprise everyone hates
Sentiment: negative
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            negative -0.65
------------------------------------------------------------
Review: Thunderbolts trailer saying captain America Cap trailer saying Steve Rogers
Sentiment: neutral
     SENTIMENT STATS:      
  Predicted Sentiment Score
0             neutral   0.0
------------------------------------------------------------
Review: hope movie lose heart two looks promising
Sentiment: neutral
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            p

In [55]:
# Print a classification report for each evaluation set: first the full
# dataset, then the held-out test set.
for heading, true_labels, predicted_labels in (
    ("FULL DATASET", sentiments, predicted_sentiments_full),
    ("\n TEST SET", y_test, predicted_sentiments_test),
):
    print(heading)
    print(classification_report(true_labels, predicted_labels))

FULL DATASET
              precision    recall  f1-score   support

    negative       0.62      0.57      0.60       341
     neutral       0.51      0.29      0.37       332
    positive       0.52      0.78      0.63       330

    accuracy                           0.55      1003
   macro avg       0.55      0.55      0.53      1003
weighted avg       0.55      0.55      0.53      1003


 TEST SET
              precision    recall  f1-score   support

    negative       0.65      0.61      0.63        72
     neutral       0.65      0.32      0.43        63
    positive       0.53      0.82      0.64        66

    accuracy                           0.59       201
   macro avg       0.61      0.58      0.57       201
weighted avg       0.61      0.59      0.57       201



In [56]:
# Try the classifier on a few hand-written reviews.
# NOTE: this rebinds `reviews`, so the list built from the CSV earlier in the
# notebook is no longer reachable under that name after this cell runs.
reviews = [
    "this seem to be ok, not so intersting",
    "no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!",
    "I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one",
    "Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! P.s watch the carrot",
]

# The raw text is printed, but the classifier receives the preprocessed text.
for new_review in reviews:
    new_preprocessed = preprocess_data(new_review)

    print('Review:', new_review)
    pred = sentiment_vader_lexicon(new_preprocessed, threshold=0.0, verbose=True)
    print('-'*60)

Review: this seem to be ok, not so intersting
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            positive   0.3
------------------------------------------------------------
Review: no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            negative -0.76
------------------------------------------------------------
Review: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one
     SENTIMENT STATS:      
  Predicted Sentiment Score
0            positive  0.67
------------------------------------------------------------
Review: Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! P.s watch the carrot
     SENTIMENT STATS:      
  Predicted Senti