## Rotten Tomatoes Critic Reviews Data Processing

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.casual import casual_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import ast
import requests
import warnings
from tqdm.notebook import tqdm
# Silence every warning category for the rest of the notebook.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. SettingWithCopyWarning);
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Register tqdm's `progress_apply` on pandas objects; the processing cells below rely on it.
tqdm.pandas()

In [2]:
%%html
<style>
/* Let ipywidget output (e.g. progress bars) use a transparent background. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Take widget text colour and font size from the VS Code editor theme variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}
</style>

In [3]:
# Load the raw critic reviews; `read_csv` already returns a DataFrame,
# so no extra wrapping is needed.
df = pd.read_csv('../data/rotten_tomatoes_critic_reviews.csv')
df.head()

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


In [4]:
# Show the dtype pandas assigned to each column.
df.dtypes

rotten_tomatoes_link    object
critic_name             object
top_critic                bool
publisher_name          object
review_type             object
review_score            object
review_date             object
review_content          object
dtype: object

In [5]:
# (rows, columns) of the raw frame.
df.shape

(1130017, 8)

### Data Cleaning

In [6]:
# Number of missing values in each column.
df.isna().sum()

rotten_tomatoes_link         0
critic_name              18529
top_critic                   0
publisher_name               0
review_type                  0
review_score            305936
review_date                  0
review_content           65806
dtype: int64

`critic_name` is not useful to this project: we do not need to know who wrote a review, only its sentiment. <br>
`review_content`, while important, is not strictly required, so we fill its missing values with a placeholder instead of dropping those rows.

In [7]:
# Fill missing reviews with an empty string so the rows are kept but carry no text.
# A placeholder sentence such as 'No review' would later be tokenized and
# sentiment-scored as if a critic had actually written it, skewing the results.
df['review_content'] = df['review_content'].fillna('')

In [8]:
# Frequency of each distinct review_score value, to see how scores are formatted.
df['review_score'].value_counts()

review_score
3/5        90273
4/5        83659
3/4        72366
2/5        60174
2/4        47546
           ...  
69/70          1
2.5/20         1
5.55/10        1
35             1
9.56/10        1
Name: count, Length: 814, dtype: int64

In [9]:
# Count the distinct (non-missing) review_score values.
df['review_score'].nunique()

814

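There are 814 unique score values, which is bizarre: critics use many different scales and formats. <br>
We cannot simply drop the rows with missing scores, as they make up a significant portion of the data (roughly 30%). Thus, we exclude the `review_score` column from the analysis.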

In [10]:
# Keep only the columns used downstream. Take an explicit copy so that the
# column assignments made on `df2` in later cells operate on an independent
# frame rather than on a slice of `df` (the SettingWithCopyWarning pattern).
df2 = df[['rotten_tomatoes_link', 'publisher_name', 'top_critic', 'review_type', 'review_date', 'review_content']].copy()
df2.head()

Unnamed: 0,rotten_tomatoes_link,publisher_name,top_critic,review_type,review_date,review_content
0,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,FILMINK (Australia),False,Fresh,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Sunday Mail (Australia),False,Fresh,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Hollywood Reporter,True,Rotten,2010-02-10,What's really lacking in The Lightning Thief i...


In [11]:
# Convert the date strings into pandas timestamps, then confirm the
# conversion by checking the type of the first element.
df2['review_date'] = pd.to_datetime(df2['review_date'])
type(df2['review_date'].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [12]:
# Download the NLTK resources into a project-local directory, then add that
# directory to NLTK's resource search path.
for resource_name in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource_name, download_dir='./nltk_data', quiet=True)
nltk.data.path.append('./nltk_data')

In [13]:
lemmatizer = WordNetLemmatizer()

# A custom stopword list, pinned to a specific gist revision, is used instead
# of NLTK's built-in English list.
STOPWORDS_URL = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"
# `requests` never times out by default; bound the wait so the cell cannot hang.
stopwords_response = requests.get(STOPWORDS_URL, timeout=30)
# Fail loudly on an HTTP error; otherwise the body of an error page would
# silently become the stopword list.
stopwords_response.raise_for_status()
stopwords_list = stopwords_response.content
# NOTE: this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords` import
# from the first cell. The stopword-removal cell relies on this set.
stopwords = set(stopwords_list.decode().splitlines())

In [14]:
def tokenize_review(text):
    """Lower-case a review and split it into tokens with NLTK's casual tokenizer."""
    return casual_tokenize(text.lower(), reduce_len=True, strip_handles=False)


df2['tokenized_review_content'] = df2['review_content'].progress_apply(tokenize_review)
df2.head()

  0%|          | 0/1130017 [00:00<?, ?it/s]

Unnamed: 0,rotten_tomatoes_link,publisher_name,top_critic,review_type,review_date,review_content,tokenized_review_content
0,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,A fantasy adventure that fuses Greek mythology...,"[a, fantasy, adventure, that, fuses, greek, my..."
1,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff...","[uma, thurman, as, medusa, ,, the, gorgon, wit..."
2,m/0814255,FILMINK (Australia),False,Fresh,2010-02-09,With a top-notch cast and dazzling special eff...,"[with, a, top-notch, cast, and, dazzling, spec..."
3,m/0814255,Sunday Mail (Australia),False,Fresh,2010-02-09,Whether audiences will get behind The Lightnin...,"[whether, audiences, will, get, behind, the, l..."
4,m/0814255,Hollywood Reporter,True,Rotten,2010-02-10,What's really lacking in The Lightning Thief i...,"[what's, really, lacking, in, the, lightning, ..."


In [15]:
def keep_alphanumeric(tokens):
    """Keep only the tokens made up entirely of alphanumeric characters.

    A token containing any other character is dropped whole, which removes
    punctuation but also hyphenated words ('top-notch') and contractions
    ("what's").
    """
    return list(filter(str.isalnum, tokens))


df2['alphanum_review_content'] = df2['tokenized_review_content'].progress_apply(keep_alphanumeric)
df2.head()

  0%|          | 0/1130017 [00:00<?, ?it/s]

Unnamed: 0,rotten_tomatoes_link,publisher_name,top_critic,review_type,review_date,review_content,tokenized_review_content,alphanum_review_content
0,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,A fantasy adventure that fuses Greek mythology...,"[a, fantasy, adventure, that, fuses, greek, my...","[a, fantasy, adventure, that, fuses, greek, my..."
1,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff...","[uma, thurman, as, medusa, ,, the, gorgon, wit...","[uma, thurman, as, medusa, the, gorgon, with, ..."
2,m/0814255,FILMINK (Australia),False,Fresh,2010-02-09,With a top-notch cast and dazzling special eff...,"[with, a, top-notch, cast, and, dazzling, spec...","[with, a, cast, and, dazzling, special, effect..."
3,m/0814255,Sunday Mail (Australia),False,Fresh,2010-02-09,Whether audiences will get behind The Lightnin...,"[whether, audiences, will, get, behind, the, l...","[whether, audiences, will, get, behind, the, l..."
4,m/0814255,Hollywood Reporter,True,Rotten,2010-02-10,What's really lacking in The Lightning Thief i...,"[what's, really, lacking, in, the, lightning, ...","[really, lacking, in, the, lightning, thief, i..."


In [16]:
def drop_stopwords(tokens):
    """Return the tokens that are not members of the `stopwords` set."""
    return list(filter(lambda token: token not in stopwords, tokens))


df2['stopword_removed_review_content'] = df2['alphanum_review_content'].progress_apply(drop_stopwords)
df2.head()

  0%|          | 0/1130017 [00:00<?, ?it/s]

Unnamed: 0,rotten_tomatoes_link,publisher_name,top_critic,review_type,review_date,review_content,tokenized_review_content,alphanum_review_content,stopword_removed_review_content
0,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,A fantasy adventure that fuses Greek mythology...,"[a, fantasy, adventure, that, fuses, greek, my...","[a, fantasy, adventure, that, fuses, greek, my...","[fantasy, adventure, fuses, greek, mythology, ..."
1,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff...","[uma, thurman, as, medusa, ,, the, gorgon, wit...","[uma, thurman, as, medusa, the, gorgon, with, ...","[uma, thurman, medusa, gorgon, coiffure, writh..."
2,m/0814255,FILMINK (Australia),False,Fresh,2010-02-09,With a top-notch cast and dazzling special eff...,"[with, a, top-notch, cast, and, dazzling, spec...","[with, a, cast, and, dazzling, special, effect...","[cast, dazzling, special, effects, tide, teens..."
3,m/0814255,Sunday Mail (Australia),False,Fresh,2010-02-09,Whether audiences will get behind The Lightnin...,"[whether, audiences, will, get, behind, the, l...","[whether, audiences, will, get, behind, the, l...","[audiences, lightning, thief, hard, predict, e..."
4,m/0814255,Hollywood Reporter,True,Rotten,2010-02-10,What's really lacking in The Lightning Thief i...,"[what's, really, lacking, in, the, lightning, ...","[really, lacking, in, the, lightning, thief, i...","[lacking, lightning, thief, genuine, sense, th..."


In [17]:
def lemmatize_tokens(tokens):
    """Lemmatize each token with the shared WordNet lemmatizer (default settings)."""
    return list(map(lemmatizer.lemmatize, tokens))


df2['lemmatized_review_content'] = df2['stopword_removed_review_content'].progress_apply(lemmatize_tokens)
df2.head()

  0%|          | 0/1130017 [00:00<?, ?it/s]

Unnamed: 0,rotten_tomatoes_link,publisher_name,top_critic,review_type,review_date,review_content,tokenized_review_content,alphanum_review_content,stopword_removed_review_content,lemmatized_review_content
0,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,A fantasy adventure that fuses Greek mythology...,"[a, fantasy, adventure, that, fuses, greek, my...","[a, fantasy, adventure, that, fuses, greek, my...","[fantasy, adventure, fuses, greek, mythology, ...","[fantasy, adventure, fuse, greek, mythology, c..."
1,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff...","[uma, thurman, as, medusa, ,, the, gorgon, wit...","[uma, thurman, as, medusa, the, gorgon, with, ...","[uma, thurman, medusa, gorgon, coiffure, writh...","[uma, thurman, medusa, gorgon, coiffure, writh..."
2,m/0814255,FILMINK (Australia),False,Fresh,2010-02-09,With a top-notch cast and dazzling special eff...,"[with, a, top-notch, cast, and, dazzling, spec...","[with, a, cast, and, dazzling, special, effect...","[cast, dazzling, special, effects, tide, teens...","[cast, dazzling, special, effect, tide, teen, ..."
3,m/0814255,Sunday Mail (Australia),False,Fresh,2010-02-09,Whether audiences will get behind The Lightnin...,"[whether, audiences, will, get, behind, the, l...","[whether, audiences, will, get, behind, the, l...","[audiences, lightning, thief, hard, predict, e...","[audience, lightning, thief, hard, predict, en..."
4,m/0814255,Hollywood Reporter,True,Rotten,2010-02-10,What's really lacking in The Lightning Thief i...,"[what's, really, lacking, in, the, lightning, ...","[really, lacking, in, the, lightning, thief, i...","[lacking, lightning, thief, genuine, sense, th...","[lacking, lightning, thief, genuine, sense, th..."


### VADER

In [18]:
# Build one VADER analyzer and reuse it for every review.
sid_obj = SentimentIntensityAnalyzer()

In [19]:
def label_sentiment(compound):
    """Map a compound score to a label: > 0 Positive, < 0 Negative, otherwise Neutral."""
    if compound > 0:
        return "Positive"
    if compound < 0:
        return "Negative"
    return "Neutral"


# VADER is run on the raw review text, not on the lemmatized tokens.
df2['sentiment_scores'] = df2['review_content'].progress_apply(sid_obj.polarity_scores)
df2['compound_score'] = df2['sentiment_scores'].progress_apply(lambda scores: scores['compound'])
df2['sentiment'] = df2['compound_score'].progress_apply(label_sentiment)
df2.head()

  0%|          | 0/1130017 [00:00<?, ?it/s]

  0%|          | 0/1130017 [00:00<?, ?it/s]

  0%|          | 0/1130017 [00:00<?, ?it/s]

Unnamed: 0,rotten_tomatoes_link,publisher_name,top_critic,review_type,review_date,review_content,tokenized_review_content,alphanum_review_content,stopword_removed_review_content,lemmatized_review_content,sentiment_scores,compound_score,sentiment
0,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,A fantasy adventure that fuses Greek mythology...,"[a, fantasy, adventure, that, fuses, greek, my...","[a, fantasy, adventure, that, fuses, greek, my...","[fantasy, adventure, fuses, greek, mythology, ...","[fantasy, adventure, fuse, greek, mythology, c...","{'neg': 0.0, 'neu': 0.776, 'pos': 0.224, 'comp...",0.7579,Positive
1,m/0814255,Urban Cinefile,False,Fresh,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff...","[uma, thurman, as, medusa, ,, the, gorgon, wit...","[uma, thurman, as, medusa, the, gorgon, with, ...","[uma, thurman, medusa, gorgon, coiffure, writh...","[uma, thurman, medusa, gorgon, coiffure, writh...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,Neutral
2,m/0814255,FILMINK (Australia),False,Fresh,2010-02-09,With a top-notch cast and dazzling special eff...,"[with, a, top-notch, cast, and, dazzling, spec...","[with, a, cast, and, dazzling, special, effect...","[cast, dazzling, special, effects, tide, teens...","[cast, dazzling, special, effect, tide, teen, ...","{'neg': 0.0, 'neu': 0.876, 'pos': 0.124, 'comp...",0.4019,Positive
3,m/0814255,Sunday Mail (Australia),False,Fresh,2010-02-09,Whether audiences will get behind The Lightnin...,"[whether, audiences, will, get, behind, the, l...","[whether, audiences, will, get, behind, the, l...","[audiences, lightning, thief, hard, predict, e...","[audience, lightning, thief, hard, predict, en...","{'neg': 0.078, 'neu': 0.733, 'pos': 0.189, 'co...",0.705,Positive
4,m/0814255,Hollywood Reporter,True,Rotten,2010-02-10,What's really lacking in The Lightning Thief i...,"[what's, really, lacking, in, the, lightning, ...","[really, lacking, in, the, lightning, thief, i...","[lacking, lightning, thief, genuine, sense, th...","[lacking, lightning, thief, genuine, sense, th...","{'neg': 0.12, 'neu': 0.88, 'pos': 0.0, 'compou...",-0.5267,Negative


In [20]:
# Write the metadata and sentiment columns (no text columns) to the Tableau CSV.
tableau_columns = [
    'rotten_tomatoes_link',
    'publisher_name',
    'top_critic',
    'review_type',
    'review_date',
    'sentiment',
    'compound_score',
]
df3 = df2[tableau_columns]
df3.to_csv('../data/rotten_tomatoes_critic_reviews_tableau.csv', index=False)

In [21]:
# Write the sentiment columns together with the lemmatized tokens to the cleaned CSV.
# NOTE(review): the token lists are written to CSV as their string representation;
# readers presumably need to parse them back (e.g. with `ast.literal_eval`) — confirm.
cleaned_columns = [
    'top_critic',
    'review_type',
    'review_date',
    'sentiment',
    'compound_score',
    'lemmatized_review_content',
]
df4 = df2[cleaned_columns]
df4.to_csv('../data/rotten_tomatoes_critic_reviews_cleaned.csv', index=False)