Assignment 2 - Extract the reviews from Dune 2021 using IMDB API
===

*Due: December 6 2022*

Assess which combination of sentiment analysis method (AFINN, VADER, TextBlob) and input text (raw, preprocessed with stop words, preprocessed without stop words) correlates best with the user rating from IMDB.

<img src="https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Fi0.wp.com%2Fnerdmuch.com%2Fwp-content%2Fuploads%2F2019%2F07%2Fdune-movie.jpg&f=1&nofb=1&ipt=9aca5a0f374877be1b804400befe5665f7e3dea66a3e0327f993b70573d47ece&ipo=images" style="width:800px;"/>

## Imports

In [59]:
import jupyter_black

jupyter_black.load()

import os
import pandas as pd
import contractions

import regex as re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import inflect
from afinn import Afinn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob


# --- Data location -----------------------------------------------------------
# The review file is read from data_path + movie + ".csv".
data_path = "data/"
movie = "dune_2021"


# --- Helpers shared by the preprocessing functions ---------------------------
stop_words = set(stopwords.words("english"))  # English stop-word lookup set
lemma = WordNetLemmatizer()  # lemmatizer used by clean_lemmatization
p = inflect.engine()  # number-to-words engine used by numbers_to_words


# --- Sentiment analysers -----------------------------------------------------
afinn = Afinn()
vader_sia = SentimentIntensityAnalyzer()

## Functions

In [57]:
def clean_url(input):
    """Remove URL-like fragments from a string.

    Every occurrence of ``http`` followed by one or more non-whitespace
    characters is deleted. Surrounding whitespace is left untouched.
    """
    return re.sub(r"http\S+", "", input)


def fix_contraction(input):
    """Return ``input`` after passing it through ``contractions.fix``."""
    return contractions.fix(input)


def clean_non_alphanumeric(input):
    """Replace each character outside ``a-z``, ``A-Z`` and ``0-9`` with a space.

    The replacement is one-for-one, so the string length is preserved and
    existing whitespace characters also become plain spaces.
    """
    return re.sub(r"[^a-zA-Z0-9]", " ", input)


def clean_tokenization(input):
    """Split ``input`` into tokens using ``nltk.word_tokenize``."""
    return nltk.word_tokenize(input)


def clean_stopwords(input):
    """Return the tokens of ``input`` that are not in the ``stop_words`` set.

    The comparison is an exact membership test, so it is case-sensitive.
    Token order is preserved.
    """
    kept = []
    for token in input:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def numbers_to_words(input):
    """Spell out numeric tokens as individual English word tokens.

    Parameters
    ----------
    input : iterable of str
        Tokens to process.

    Returns
    -------
    list of str
        The tokens in their original order. Every token for which
        ``str.isnumeric()`` is true is replaced by the words produced by
        ``p.number_to_words``, one token per word. All other tokens are
        passed through unchanged.
    """
    output = []
    for item in input:
        if item.isnumeric():
            # The engine returns a single phrase such as
            # "two thousand and twenty-one". Split it on spaces, hyphens and
            # commas so each word is its own token and is still subject to
            # the stop-word and length filters that run afterwards.
            spelled = p.number_to_words(item)
            output.extend(spelled.replace("-", " ").replace(",", " ").split())
        else:
            output.append(item)
    return output


def clean_lowercase(input):
    """Return the lower-cased string form of ``input``.

    ``input`` is converted with ``str()`` first, so non-string values are
    accepted and stringified rather than rejected.
    """
    as_text = str(input)
    return as_text.lower()


def clean_lemmatization(input):
    """Lemmatize every token as a verb (``pos="v"``) with the shared ``lemma`` object.

    Returns a new list of the same length and order as ``input``.
    """
    lemmatized = []
    for token in input:
        lemmatized.append(lemma.lemmatize(word=token, pos="v"))
    return lemmatized


def clean_length(input, min_length=3):
    """Drop tokens that are shorter than ``min_length`` characters.

    Parameters
    ----------
    input : iterable of str
        Tokens to filter.
    min_length : int, optional
        Minimum number of characters a token needs in order to be kept.
        The default of 3 keeps the original behaviour (tokens of one or two
        characters are removed).

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    return [word for word in input if len(word) >= min_length]


def convert_to_string(input):
    """Join the tokens of ``input`` into one string separated by single spaces."""
    return " ".join(input)


def preprocessing(text, remove_stopwords=True):
    """
    Run one review through the full cleaning pipeline.

    String stage (in order): URL removal, contraction expansion, replacement
    of non-alphanumeric characters by spaces, lower-casing.

    Token stage (in order): tokenization, spelling out of numeric tokens,
    optional stop-word removal, lemmatization, removal of short tokens.

    Parameters
    ----------
    text : str
        The raw review text.
    remove_stopwords : bool, optional
        When true (the default) stop words are filtered out of the tokens.

    Returns
    -------
    str
        The remaining tokens joined back into a single string.
    """
    # Steps that take a string and return a string.
    for string_step in (
        clean_url,
        fix_contraction,
        clean_non_alphanumeric,
        clean_lowercase,
    ):
        text = string_step(text)

    # Steps that operate on the token list.
    tokens = numbers_to_words(clean_tokenization(text))
    if remove_stopwords:
        tokens = clean_stopwords(tokens)
    tokens = clean_length(clean_lemmatization(tokens))

    return convert_to_string(tokens)

## Load reviews from disk

The reviews were downloaded in Assignment 1.

In [63]:
# Read the review file and keep only the two columns needed here:
# the user rating and the review text (renamed to "raw").
df = (
    pd.read_csv(data_path + movie + ".csv")
    .loc[:, ["rating", "review"]]
    .rename({"review": "raw"}, axis="columns")
)

print(df.head())
print(len(df), "total")

   rating                                                raw
0     4.5  Movie definitely is one of the greatest films ...
1     2.0  Like the score that crescendoes throughout, th...
2     4.0  So much like star wars and Lord of the rings m...
3     5.0  The best movie score ever composed! With the b...
4     4.5  If you like science fiction films, this is pre...
8780 total


## Add preprocessed reviews

In [64]:
# Two preprocessed variants of every raw review:
#   "pp"         - full pipeline, stop words kept
#   "pp_no_stop" - full pipeline, stop words removed
df["pp"] = df["raw"].apply(preprocessing, remove_stopwords=False)
df["pp_no_stop"] = df["raw"].apply(preprocessing, remove_stopwords=True)

## Add sentiment scores

In [65]:
# Each sentiment method expressed as a "text -> single score" callable.
# Insertion order (afinn, vader, textblob) fixes the order of the new columns.
sentiment_scorers = {
    "afinn": afinn.score,
    "vader": lambda text: vader_sia.polarity_scores(text)["compound"],
    "textblob": lambda text: TextBlob(text).sentiment.polarity,
}

# Score every text variant with every method. Columns are named
# "<method>_<text column>" and are created in the order raw, pp, pp_no_stop,
# so the resulting frame has the same columns in the same order as writing
# the nine assignments out by hand.
for text_column in ("raw", "pp", "pp_no_stop"):
    for method, score in sentiment_scorers.items():
        df[f"{method}_{text_column}"] = df[text_column].apply(score)

In [66]:
# Preview the first rows: rating, the three text variants and their sentiment scores.
df.head()

Unnamed: 0,rating,raw,pp,pp_no_stop,afinn_raw,vader_raw,textblob_raw,afinn_pp,vader_pp,textblob_pp,afinn_pp_no_stop,vader_pp_no_stop,textblob_pp_no_stop
0,4.5,Movie definitely is one of the greatest films ...,movie definitely one the greatest film the dec...,movie definitely one greatest film decade surp...,3.0,0.7083,0.145312,3.0,0.7083,0.173214,3.0,0.8316,0.2425
1,2.0,"Like the score that crescendoes throughout, th...",like the score that crescendo throughout this ...,like score crescendo throughout movie overwrou...,-1.0,0.1655,0.075,-1.0,0.1655,0.075,-1.0,0.5994,0.075
2,4.0,So much like star wars and Lord of the rings m...,much like star war and lord the ring make this...,much like star war lord ring make movie amaze,6.0,0.4549,0.4,2.0,0.2732,0.2,2.0,0.2732,0.2
3,5.0,The best movie score ever composed! With the b...,the best movie score ever compose with the bes...,best movie score ever compose best edit ever m...,21.0,0.9879,0.8,21.0,0.9867,0.78,21.0,0.9867,0.78
4,4.5,"If you like science fiction films, this is pre...",you like science fiction film this pretty much...,like science fiction film pretty much must see...,11.0,0.9208,0.234621,11.0,0.941,0.238083,11.0,0.9623,0.185648


## Compute correlations with ratings

In [75]:
# Correlation of every sentiment-score column with the user rating,
# strongest first.
#
# Only numeric columns are correlated: the frame also holds the text columns
# (raw, pp, pp_no_stop), and pandas >= 2.0 raises on non-numeric data in
# DataFrame.corr() instead of silently dropping it. The "rating" column is
# selected by name and its self-correlation dropped by label, so the result
# does not depend on column position or on "rating" sorting first.
(
    df.select_dtypes(include="number")
    .corr()["rating"]
    .drop("rating")
    .sort_values(ascending=False)
)

textblob_raw           0.526556
vader_raw              0.510415
vader_pp               0.481346
vader_pp_no_stop       0.445480
textblob_pp            0.426193
textblob_pp_no_stop    0.420752
afinn_raw              0.290606
afinn_pp               0.270597
afinn_pp_no_stop       0.267969
Name: rating, dtype: float64