## Rotten Tomatoes Movies Data Cleaning

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.casual import casual_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import ast
import requests
import warnings
from tqdm.notebook import tqdm
# NOTE(review): this silences every warning for the rest of the session,
# not just a specific category - consider narrowing the filter.
warnings.filterwarnings('ignore')
# Registers `progress_apply` on pandas objects; later cells rely on it.
tqdm.pandas()

In [2]:
%%html
<style>
/* Make ipywidget output follow the VS Code editor theme instead of a fixed background. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}
</style>

In [3]:
# Load the raw Rotten Tomatoes movie table. `pd.read_csv` already returns a
# DataFrame, so wrapping the result in `pd.DataFrame(...)` is unnecessary.
df = pd.read_csv('../data/rotten_tomatoes_movies.csv')
df.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


In [4]:
# (rows, columns) of the table as loaded.
df.shape

(17712, 22)

In [5]:
# Column dtypes as inferred on load; the two release-date columns are still
# plain `object` columns at this point.
df.dtypes

rotten_tomatoes_link                 object
movie_title                          object
movie_info                           object
critics_consensus                    object
content_rating                       object
genres                               object
directors                            object
authors                              object
actors                               object
original_release_date                object
streaming_release_date               object
runtime                             float64
production_company                   object
tomatometer_status                   object
tomatometer_rating                  float64
tomatometer_count                   float64
audience_status                      object
audience_rating                     float64
audience_count                      float64
tomatometer_top_critics_count         int64
tomatometer_fresh_critics_count       int64
tomatometer_rotten_critics_count      int64
dtype: object

In [6]:
# First five rows of the first eleven columns (identifiers, text and dates).
df.iloc[:5, :11]

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,streaming_release_date
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2012-09-04
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,2014-07-24
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2017-01-13
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2016-06-10


In [7]:
# First five rows of the remaining columns (runtime, studio and rating fields).
df.iloc[:5, 11:]

Unnamed: 0,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,90.0,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,122.0,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,95.0,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,127.0,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


### Data Cleaning

In [8]:
# Missing-value count per column (`isna` is the same check as `isnull`).
df.isna().sum()

rotten_tomatoes_link                   0
movie_title                            0
movie_info                           321
critics_consensus                   8578
content_rating                         0
genres                                19
directors                            194
authors                             1542
actors                               352
original_release_date               1166
streaming_release_date               384
runtime                              314
production_company                   499
tomatometer_status                    44
tomatometer_rating                    44
tomatometer_count                     44
audience_status                      448
audience_rating                      296
audience_count                       297
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
dtype: int64

The most important attributes here are `tomatometer_status` and `tomatometer_rating`, as they are our "labelled" values. <br>
So, we will start with those.

In [9]:
# Peek at the first rows of the three tomatometer columns.
tomatometer_cols = ['tomatometer_status', 'tomatometer_rating', 'tomatometer_count']
df.loc[:, tomatometer_cols].head()

Unnamed: 0,tomatometer_status,tomatometer_rating,tomatometer_count
0,Rotten,49.0,149.0
1,Certified-Fresh,87.0,142.0
2,Fresh,67.0,24.0
3,Certified-Fresh,100.0,54.0
4,Fresh,89.0,27.0


In [10]:
# Peek at the last rows of the three tomatometer columns.
tomatometer_cols = ['tomatometer_status', 'tomatometer_rating', 'tomatometer_count']
df.loc[:, tomatometer_cols].tail()

Unnamed: 0,tomatometer_status,tomatometer_rating,tomatometer_count
17707,Rotten,56.0,9.0
17708,Certified-Fresh,98.0,291.0
17709,Fresh,80.0,10.0
17710,Fresh,96.0,23.0
17711,Rotten,50.0,8.0


In [11]:
# Missing-value count for each tomatometer column.
tomatometer_cols = ['tomatometer_status', 'tomatometer_rating', 'tomatometer_count']
df.loc[:, tomatometer_cols].isna().sum()

tomatometer_status    44
tomatometer_rating    44
tomatometer_count     44
dtype: int64

All three attributes have the same number of missing values (44).

In [12]:
# Restrict to rows where all three tomatometer fields are missing together,
# then count the missing values those rows have in every column.
tomatometer_cols = ['tomatometer_status', 'tomatometer_rating', 'tomatometer_count']
all_tomatometer_missing = df[tomatometer_cols].isna().all(axis=1)
df[all_tomatometer_missing].isna().sum()

rotten_tomatoes_link                 0
movie_title                          0
movie_info                          30
critics_consensus                   44
content_rating                       0
genres                               0
directors                            1
authors                              8
actors                               0
original_release_date               12
streaming_release_date              32
runtime                             30
production_company                   6
tomatometer_status                  44
tomatometer_rating                  44
tomatometer_count                   44
audience_status                     35
audience_rating                     35
audience_count                      35
tomatometer_top_critics_count        0
tomatometer_fresh_critics_count      0
tomatometer_rotten_critics_count     0
dtype: int64

In [13]:
# Drop every row that lacks any of the three tomatometer fields, then
# re-check the missing-value counts.
tomatometer_cols = ['tomatometer_status', 'tomatometer_rating', 'tomatometer_count']
df = df.dropna(axis=0, how='any', subset=tomatometer_cols)
df.isna().sum()

rotten_tomatoes_link                   0
movie_title                            0
movie_info                           291
critics_consensus                   8534
content_rating                         0
genres                                19
directors                            193
authors                             1534
actors                               352
original_release_date               1154
streaming_release_date               352
runtime                              284
production_company                   493
tomatometer_status                     0
tomatometer_rating                     0
tomatometer_count                      0
audience_status                      413
audience_rating                      261
audience_count                       262
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
dtype: int64

Next, `original_release_date` and `streaming_release_date` are also important factors that allow us to peek into the time-series of the data. <br>
We will try to salvage as much as possible.

In [14]:
# Restrict to rows where both release dates are missing together, then count
# the missing values those rows have in every column.
date_cols = ['original_release_date', 'streaming_release_date']
both_dates_missing = df[date_cols].isna().all(axis=1)
df[both_dates_missing].isna().sum()

rotten_tomatoes_link                  0
movie_title                           0
movie_info                          115
critics_consensus                    99
content_rating                        0
genres                                0
directors                             8
authors                              47
actors                               10
original_release_date               132
streaming_release_date              132
runtime                             106
production_company                   24
tomatometer_status                    0
tomatometer_rating                    0
tomatometer_count                     0
audience_status                      36
audience_rating                      36
audience_count                       36
tomatometer_top_critics_count         0
tomatometer_fresh_critics_count       0
tomatometer_rotten_critics_count      0
dtype: int64

There are only 132 rows where both values are missing together, so dropping every row that is missing either date would lose a total of (1154 - 132) + 352 = 1374 rows.

In [15]:
# Current (rows, columns), for comparison with the number of rows that have a
# missing release date.
df.shape

(17668, 22)

Compared with the 17668 rows currently in the corpus, the 1374 affected rows are a fairly significant proportion (about 7.8%).

In [16]:
# Keep only rows where both release dates are present, then show the new shape.
date_cols = ['original_release_date', 'streaming_release_date']
df = df.dropna(axis=0, how='any', subset=date_cols)
df.shape

(16294, 22)

In [17]:
# Missing-value count per column after the release-date rows were dropped.
df.isna().sum()

rotten_tomatoes_link                   0
movie_title                            0
movie_info                            32
critics_consensus                   7478
content_rating                         0
genres                                 7
directors                            154
authors                             1262
actors                               255
original_release_date                  0
streaming_release_date                 0
runtime                               30
production_company                   315
tomatometer_status                     0
tomatometer_rating                     0
tomatometer_count                      0
audience_status                      280
audience_rating                      129
audience_count                       130
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
dtype: int64

Returning to the numerical attributes, the `audience`-based attributes are also significant.

In [18]:
# Restrict to rows where all three audience fields are missing together, then
# count the missing values those rows have in every column.
audience_cols = ['audience_status', 'audience_rating', 'audience_count']
all_audience_missing = df[audience_cols].isna().all(axis=1)
df[all_audience_missing].isna().sum()

rotten_tomatoes_link                  0
movie_title                           0
movie_info                            4
critics_consensus                   117
content_rating                        0
genres                                0
directors                             4
authors                              34
actors                               16
original_release_date                 0
streaming_release_date                0
runtime                               5
production_company                   19
tomatometer_status                    0
tomatometer_rating                    0
tomatometer_count                     0
audience_status                     129
audience_rating                     129
audience_count                      129
tomatometer_top_critics_count         0
tomatometer_fresh_critics_count       0
tomatometer_rotten_critics_count      0
dtype: int64

According to Rotten Tomatoes, an audience rating below 60% results in a 'Spilled' label, while a rating of 60% or more earns an 'Upright' label. <br>
Thus, `audience_rating` and `audience_count` are more important than `audience_status`, as the latter can be inferred from the rating. <br>
Hence, we only need to drop the rows where the former two are missing.

In [19]:
# Drop rows that lack the audience rating or the audience count;
# `audience_status` is deliberately not part of the subset.
audience_numeric_cols = ['audience_rating', 'audience_count']
df = df.dropna(axis=0, how='any', subset=audience_numeric_cols)
df.isna().sum()

rotten_tomatoes_link                   0
movie_title                            0
movie_info                            28
critics_consensus                   7361
content_rating                         0
genres                                 7
directors                            150
authors                             1228
actors                               239
original_release_date                  0
streaming_release_date                 0
runtime                               25
production_company                   296
tomatometer_status                     0
tomatometer_rating                     0
tomatometer_count                      0
audience_status                      151
audience_rating                        0
audience_count                         0
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
dtype: int64

In [20]:
# Label distribution of `audience_status` before it is rebuilt from the rating.
df.loc[:, 'audience_status'].value_counts()

audience_status
Upright    8759
Spilled    7254
Name: count, dtype: int64

In [21]:
# Rebuild `audience_status` for every row from `audience_rating`: ratings
# below the threshold become 'Spilled', everything else 'Upright'. This also
# fills the rows whose status was missing.
UPRIGHT_THRESHOLD = 60  # audience rating (in %) at which a movie counts as 'Upright'
# A vectorised comparison replaces the row-by-row `progress_apply`.
df['audience_status'] = np.where(df['audience_rating'] < UPRIGHT_THRESHOLD, 'Spilled', 'Upright')
print(df['audience_status'].value_counts())
df.isnull().sum()

  0%|          | 0/16164 [00:00<?, ?it/s]

audience_status
Upright    8894
Spilled    7270
Name: count, dtype: int64


rotten_tomatoes_link                   0
movie_title                            0
movie_info                            28
critics_consensus                   7361
content_rating                         0
genres                                 7
directors                            150
authors                             1228
actors                               239
original_release_date                  0
streaming_release_date                 0
runtime                               25
production_company                   296
tomatometer_status                     0
tomatometer_rating                     0
tomatometer_count                      0
audience_status                        0
audience_rating                        0
audience_count                         0
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
dtype: int64

The last important attribute is `runtime`: it is an important factor that could potentially be tied to the satisfaction of a movie viewer.

In [22]:
# Drop rows without a runtime, then re-check the missing-value counts.
df = df.dropna(axis=0, how='any', subset=['runtime'])
df.isna().sum()

rotten_tomatoes_link                   0
movie_title                            0
movie_info                            17
critics_consensus                   7340
content_rating                         0
genres                                 7
directors                            150
authors                             1224
actors                               239
original_release_date                  0
streaming_release_date                 0
runtime                                0
production_company                   295
tomatometer_status                     0
tomatometer_rating                     0
tomatometer_count                      0
audience_status                        0
audience_rating                        0
audience_count                         0
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
dtype: int64

The remaining attributes can be filled with blank/placeholder values.

In [23]:
# Preview the columns that still contain gaps and will receive placeholders.
fillable_cols = [
    'movie_info',
    'critics_consensus',
    'genres',
    'directors',
    'authors',
    'actors',
    'production_company',
]
df.loc[:, fillable_cols].head()

Unnamed: 0,movie_info,critics_consensus,genres,directors,authors,actors,production_company
0,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",20th Century Fox
1,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",Sony Pictures Classics
2,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",Waner Bros.
3,Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",Criterion Collection
4,"In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...","Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",Disney


In [24]:
# Placeholder written into each column wherever the value is missing.
placeholders = {
    'movie_info': 'Unknown',
    'critics_consensus': 'No review',
    'genres': 'Unknown',
    'directors': 'Unknown',
    'authors': 'Unknown',
    'actors': 'Unknown',
    'production_company': 'Unknown',
}
for column, filler in placeholders.items():
    df[column] = df[column].fillna(filler)
# Confirm that no missing values remain.
df.isna().sum()

rotten_tomatoes_link                0
movie_title                         0
movie_info                          0
critics_consensus                   0
content_rating                      0
genres                              0
directors                           0
authors                             0
actors                              0
original_release_date               0
streaming_release_date              0
runtime                             0
production_company                  0
tomatometer_status                  0
tomatometer_rating                  0
tomatometer_count                   0
audience_status                     0
audience_rating                     0
audience_count                      0
tomatometer_top_critics_count       0
tomatometer_fresh_critics_count     0
tomatometer_rotten_critics_count    0
dtype: int64

### Data Processing

In [25]:
# First five rows of the first eleven columns after cleaning.
df.iloc[:5, :11]

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,streaming_release_date
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2012-09-04
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,2014-07-24
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2017-01-13
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2016-06-10


In [26]:
# First five rows of the remaining columns after cleaning.
df.iloc[:5, 11:]

Unnamed: 0,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,90.0,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,122.0,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,95.0,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,127.0,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


`movie_title`, `movie_info` and `critics_consensus` are all free text, so they can be text processed. <br>
`genres`, `directors`, `authors`, `actors` and `production_company` are comma-separated strings, which can be converted into lists. <br>
`original_release_date` and `streaming_release_date` are dates stored as strings, which can also be converted.

We will start with the easy ones of date conversion.

In [27]:
# Parse both release-date columns from strings (e.g. '2010-02-12') into
# pandas datetimes.
df['original_release_date'] = pd.to_datetime(df['original_release_date'])
df['streaming_release_date'] = pd.to_datetime(df['streaming_release_date'])
# Check the element type by position. The index was never reset after the
# earlier dropna calls, so a label lookup such as `[0]` would raise a KeyError
# as soon as the row labelled 0 is among the dropped rows.
print(type(df['original_release_date'].iloc[0]))
print(type(df['streaming_release_date'].iloc[0]))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Next, the attributes that are to be converted into lists.

In [28]:
# Turn each comma-separated string column into a list of names. The 'Unknown'
# placeholder becomes the one-element list ['Unknown'].
list_columns = ['genres', 'directors', 'authors', 'actors', 'production_company']
for column in list_columns:
    df[column] = df[column].progress_apply(lambda joined: joined.split(', '))
df[list_columns].head()

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

Unnamed: 0,genres,directors,authors,actors,production_company
0,"[Action & Adventure, Comedy, Drama, Science Fi...",[Chris Columbus],"[Craig Titley, Chris Columbus, Rick Riordan]","[Logan Lerman, Brandon T. Jackson, Alexandra D...",[20th Century Fox]
1,[Comedy],[Nicole Holofcener],[Nicole Holofcener],"[Catherine Keener, Amanda Peet, Oliver Platt, ...",[Sony Pictures Classics]
2,"[Comedy, Romance]",[Blake Edwards],[Blake Edwards],"[Dudley Moore, Bo Derek, Julie Andrews, Robert...",[Waner Bros.]
3,"[Classics, Drama]",[Sidney Lumet],[Reginald Rose],"[Martin Balsam, John Fiedler, Lee J. Cobb, E.G...",[Criterion Collection]
4,"[Action & Adventure, Drama, Kids & Family]",[Richard Fleischer],[Earl Felton],"[James Mason, Kirk Douglas, Paul Lukas, Peter ...",[Disney]


Lastly, the text processing.

In [29]:
# Download the NLTK resources into a folder next to the notebook, then add
# that folder to NLTK's search path so they can be found.
nltk_dir = './nltk_data'
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource, download_dir=nltk_dir, quiet=True)
nltk.data.path.append(nltk_dir)

In [30]:
lemmatizer = WordNetLemmatizer()

# Stop-word list hosted as a plain-text gist, one word per line. The URL
# appears to point at a fixed revision of the file - verify before relying on it.
STOPWORDS_URL = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
# raise_for_status() prevents an HTTP error page from silently being parsed as
# the stop-word list.
stopwords_response = requests.get(STOPWORDS_URL, timeout=30)
stopwords_response.raise_for_status()
# NOTE: this name shadows the `stopwords` corpus imported from nltk.corpus at
# the top of the notebook. It is kept because the stop-word removal cell looks
# tokens up in `stopwords`.
# Blank lines and surrounding whitespace are discarded so they cannot end up
# as entries in the set.
stopwords = {
    line.strip()
    for line in stopwords_response.content.decode().splitlines()
    if line.strip()
}

In [31]:
# Lower-case each text field and split it into tokens with NLTK's casual
# tokenizer; results go into new `tokenized_<field>` columns.
text_fields = ['movie_title', 'movie_info', 'critics_consensus']
for field in text_fields:
    df[f'tokenized_{field}'] = df[field].progress_apply(
        lambda text: casual_tokenize(text.lower())
    )
df[[f'tokenized_{field}' for field in text_fields]].head()

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

Unnamed: 0,tokenized_movie_title,tokenized_movie_info,tokenized_critics_consensus
0,"[percy, jackson, &, the, olympians, :, the, li...","[always, trouble-prone, ,, the, life, of, teen...","[though, it, may, seem, like, just, another, h..."
1,"[please, give]","[kate, (, catherine, keener, ), and, her, husb...","[nicole, holofcener's, newest, might, seem, sl..."
2,[10],"[a, successful, ,, middle-aged, hollywood, son...","[blake, edwards, ', bawdy, comedy, may, not, s..."
3,"[12, angry, men, (, twelve, angry, men, )]","[following, the, closing, arguments, in, a, mu...","[sidney, lumet's, feature, debut, is, a, super..."
4,"[20,000, leagues, under, the, sea]","[in, 1866, ,, professor, pierre, m, ., aronnax...","[one, of, disney's, finest, live-action, adven..."


In [32]:
def keep_alphanumeric(tokens):
    """Return only the tokens made up entirely of letters and digits."""
    return [token for token in tokens if token.isalnum()]


# NOTE(review): a token containing any punctuation is dropped as a whole, not
# cleaned - e.g. 'trouble-prone' and '20,000' disappear in the preview below.
# Confirm this is intended.
text_fields = ['movie_title', 'movie_info', 'critics_consensus']
for field in text_fields:
    df[f'alphanum_{field}'] = df[f'tokenized_{field}'].progress_apply(keep_alphanumeric)
df[[f'alphanum_{field}' for field in text_fields]].head()

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

Unnamed: 0,alphanum_movie_title,alphanum_movie_info,alphanum_critics_consensus
0,"[percy, jackson, the, olympians, the, lightnin...","[always, the, life, of, teenager, percy, jacks...","[though, it, may, seem, like, just, another, h..."
1,"[please, give]","[kate, catherine, keener, and, her, husband, a...","[nicole, newest, might, seem, slight, in, plac..."
2,[10],"[a, successful, hollywood, songwriter, falls, ...","[blake, edwards, bawdy, comedy, may, not, scor..."
3,"[12, angry, men, twelve, angry, men]","[following, the, closing, arguments, in, a, mu...","[sidney, feature, debut, is, a, superbly, writ..."
4,"[leagues, under, the, sea]","[in, 1866, professor, pierre, m, aronnax, paul...","[one, of, finest, adventures, leagues, under, ..."


In [33]:
def drop_stopwords(tokens):
    """Return the tokens that are not in the notebook-level `stopwords` set."""
    return [token for token in tokens if token not in stopwords]


# NOTE(review): short titles can lose every token - 'Please Give' becomes []
# in the preview below.
text_fields = ['movie_title', 'movie_info', 'critics_consensus']
for field in text_fields:
    df[f'stopword_removed_{field}'] = df[f'alphanum_{field}'].progress_apply(drop_stopwords)
df[[f'stopword_removed_{field}' for field in text_fields]].head()

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

Unnamed: 0,stopword_removed_movie_title,stopword_removed_movie_info,stopword_removed_critics_consensus
0,"[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene..."
1,[],"[kate, catherine, keener, husband, alex, olive...","[nicole, newest, slight, places, rendering, co..."
2,[10],"[successful, hollywood, songwriter, falls, hop...","[blake, edwards, bawdy, comedy, score, perfect..."
3,"[12, angry, men, angry, men]","[closing, arguments, murder, trial, 12, member...","[sidney, feature, debut, superbly, written, dr..."
4,"[leagues, sea]","[1866, professor, pierre, aronnax, paul, lukas...","[finest, adventures, leagues, sea, brings, jul..."


In [34]:
def lemmatize_tokens(tokens):
    """Lemmatize each token with the shared WordNet lemmatizer."""
    return [lemmatizer.lemmatize(token) for token in tokens]


# `lemmatize` is called without a part-of-speech argument; in the preview
# below plural nouns are reduced ('olympians' -> 'olympian') while words such
# as 'written' stay unchanged.
text_fields = ['movie_title', 'movie_info', 'critics_consensus']
for field in text_fields:
    df[f'lemmatized_{field}'] = df[f'stopword_removed_{field}'].progress_apply(lemmatize_tokens)
df[[f'lemmatized_{field}' for field in text_fields]].head()

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

Unnamed: 0,lemmatized_movie_title,lemmatized_movie_info,lemmatized_critics_consensus
0,"[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene..."
1,[],"[kate, catherine, keener, husband, alex, olive...","[nicole, newest, slight, place, rendering, com..."
2,[10],"[successful, hollywood, songwriter, fall, hope...","[blake, edward, bawdy, comedy, score, perfect,..."
3,"[12, angry, men, angry, men]","[closing, argument, murder, trial, 12, member,...","[sidney, feature, debut, superbly, written, dr..."
4,"[league, sea]","[1866, professor, pierre, aronnax, paul, lukas...","[finest, adventure, league, sea, brings, jules..."


### VADER

In [35]:
# Single VADER analyzer instance, used by the scoring cell that follows.
sid_obj = SentimentIntensityAnalyzer()

In [36]:
# Placeholder that the cleaning step wrote into `critics_consensus` for movies
# without a consensus. It is not a review and must not be scored. Scoring it
# would give every such movie the same spurious sentiment (presumably
# negative, because of the word "No" - verify against the VADER lexicon).
NO_REVIEW_PLACEHOLDER = 'No review'


def score_consensus(text):
    """Return VADER polarity scores for a consensus, or None for the placeholder."""
    if text == NO_REVIEW_PLACEHOLDER:
        return None
    return sid_obj.polarity_scores(text)


def label_sentiment(compound):
    """Map a compound score to 'Positive' (>= 0.05), 'Negative' (<= -0.05) or 'Neutral'.

    A missing score (no consensus available) is labelled with the placeholder
    text rather than being forced into one of the three sentiment classes.
    """
    if pd.isna(compound):
        return NO_REVIEW_PLACEHOLDER
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"


df['sentiment_scores'] = df['critics_consensus'].progress_apply(score_consensus)
# Rows without a real consensus get NaN as their compound score.
df['compound_score'] = df['sentiment_scores'].progress_apply(
    lambda scores: scores['compound'] if isinstance(scores, dict) else np.nan
)
df['sentiment'] = df['compound_score'].progress_apply(label_sentiment)
df.head()

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

  0%|          | 0/16139 [00:00<?, ?it/s]

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,alphanum_critics_consensus,stopword_removed_movie_title,stopword_removed_movie_info,stopword_removed_critics_consensus,lemmatized_movie_title,lemmatized_movie_info,lemmatized_critics_consensus,sentiment_scores,compound_score,sentiment
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"[Action & Adventure, Comedy, Drama, Science Fi...",[Chris Columbus],"[Craig Titley, Chris Columbus, Rick Riordan]","[Logan Lerman, Brandon T. Jackson, Alexandra D...",2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",0.9274,Positive
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,[Comedy],[Nicole Holofcener],[Nicole Holofcener],"[Catherine Keener, Amanda Peet, Oliver Platt, ...",2010-04-30,...,"[nicole, newest, might, seem, slight, in, plac...",[],"[kate, catherine, keener, husband, alex, olive...","[nicole, newest, slight, places, rendering, co...",[],"[kate, catherine, keener, husband, alex, olive...","[nicole, newest, slight, place, rendering, com...","{'neg': 0.0, 'neu': 0.889, 'pos': 0.111, 'comp...",0.5023,Positive
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"[Comedy, Romance]",[Blake Edwards],[Blake Edwards],"[Dudley Moore, Bo Derek, Julie Andrews, Robert...",1979-10-05,...,"[blake, edwards, bawdy, comedy, may, not, scor...",[10],"[successful, hollywood, songwriter, falls, hop...","[blake, edwards, bawdy, comedy, score, perfect...",[10],"[successful, hollywood, songwriter, fall, hope...","[blake, edward, bawdy, comedy, score, perfect,...","{'neg': 0.253, 'neu': 0.562, 'pos': 0.185, 'co...",-0.4676,Negative
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"[Classics, Drama]",[Sidney Lumet],[Reginald Rose],"[Martin Balsam, John Fiedler, Lee J. Cobb, E.G...",1957-04-13,...,"[sidney, feature, debut, is, a, superbly, writ...","[12, angry, men, angry, men]","[closing, arguments, murder, trial, 12, member...","[sidney, feature, debut, superbly, written, dr...","[12, angry, men, angry, men]","[closing, argument, murder, trial, 12, member,...","[sidney, feature, debut, superbly, written, dr...","{'neg': 0.0, 'neu': 0.791, 'pos': 0.209, 'comp...",0.5423,Positive
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"[Action & Adventure, Drama, Kids & Family]",[Richard Fleischer],[Earl Felton],"[James Mason, Kirk Douglas, Paul Lukas, Peter ...",1954-01-01,...,"[one, of, finest, adventures, leagues, under, ...","[leagues, sea]","[1866, professor, pierre, aronnax, paul, lukas...","[finest, adventures, leagues, sea, brings, jul...","[league, sea]","[1866, professor, pierre, aronnax, paul, lukas...","[finest, adventure, league, sea, brings, jules...","{'neg': 0.0, 'neu': 0.787, 'pos': 0.213, 'comp...",0.7579,Positive


### Row Expansion

In [37]:
# Expand each list-valued attribute in turn so that every combination of
# genre / director / author / actor / production company gets its own row.
# The original index labels are kept (repeated) on the generated rows.
df_exploded = df
for list_column in ['genres', 'directors', 'authors', 'actors', 'production_company']:
    df_exploded = df_exploded.explode(list_column)
df_exploded.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,alphanum_critics_consensus,stopword_removed_movie_title,stopword_removed_movie_info,stopword_removed_critics_consensus,lemmatized_movie_title,lemmatized_movie_info,lemmatized_critics_consensus,sentiment_scores,compound_score,sentiment
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,Action & Adventure,Chris Columbus,Craig Titley,Logan Lerman,2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",0.9274,Positive
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,Action & Adventure,Chris Columbus,Craig Titley,Brandon T. Jackson,2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",0.9274,Positive
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,Action & Adventure,Chris Columbus,Craig Titley,Alexandra Daddario,2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",0.9274,Positive
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,Action & Adventure,Chris Columbus,Craig Titley,Jake Abel,2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",0.9274,Positive
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,Action & Adventure,Chris Columbus,Craig Titley,Sean Bean,2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",0.9274,Positive


In [38]:
# Identify the first exploded row of each source movie.
# `explode` repeats the original index label on every row it generates, so a
# repeated label marks an extra copy of the same movie. Keying on the index
# rather than on `movie_title` keeps distinct movies that share a title (e.g.
# remakes) from being merged into one group, which would blank the numeric
# values of every movie after the first one with that title.
# NOTE(review): assumes `df` had a unique index before the explode step - confirm.
first_occurrence = pd.Series(
    ~df_exploded.index.duplicated(keep='first'),
    index=df_exploded.index,
)

# Keep numeric values only on that first row and set them to NaN on the other
# copies - presumably so aggregations over the exploded frame count each
# movie's ratings/counts once instead of once per exploded combination.
numerical_attributes = df_exploded.select_dtypes(include=[np.number]).columns
df_exploded.loc[~first_occurrence, numerical_attributes] = np.nan

In [39]:
# Show the (rows, columns) size of the exploded frame.
df_exploded.shape

(2249720, 37)

In [40]:
# Preview the exploded frame after numeric values were set to NaN on repeated rows.
print("Transformed DataFrame:")
df_exploded.head()

Transformed DataFrame:


Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,alphanum_critics_consensus,stopword_removed_movie_title,stopword_removed_movie_info,stopword_removed_critics_consensus,lemmatized_movie_title,lemmatized_movie_info,lemmatized_critics_consensus,sentiment_scores,compound_score,sentiment
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,Action & Adventure,Chris Columbus,Craig Titley,Logan Lerman,2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",0.9274,Positive
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,Action & Adventure,Chris Columbus,Craig Titley,Brandon T. Jackson,2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",,Positive
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,Action & Adventure,Chris Columbus,Craig Titley,Alexandra Daddario,2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",,Positive
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,Action & Adventure,Chris Columbus,Craig Titley,Jake Abel,2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",,Positive
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,Action & Adventure,Chris Columbus,Craig Titley,Sean Bean,2010-02-12,...,"[though, it, may, seem, like, just, another, h...","[percy, jackson, olympians, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","[percy, jackson, olympian, lightning, thief]","[life, teenager, percy, jackson, logan, lerman...","[harry, potter, knockoff, percy, jackson, bene...","{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'comp...",,Positive


### Feature Selection

In [41]:
# List every available column before choosing the subset to export.
df_exploded.columns

Index(['rotten_tomatoes_link', 'movie_title', 'movie_info',
       'critics_consensus', 'content_rating', 'genres', 'directors', 'authors',
       'actors', 'original_release_date', 'streaming_release_date', 'runtime',
       'production_company', 'tomatometer_status', 'tomatometer_rating',
       'tomatometer_count', 'audience_status', 'audience_rating',
       'audience_count', 'tomatometer_top_critics_count',
       'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count',
       'tokenized_movie_title', 'tokenized_movie_info',
       'tokenized_critics_consensus', 'alphanum_movie_title',
       'alphanum_movie_info', 'alphanum_critics_consensus',
       'stopword_removed_movie_title', 'stopword_removed_movie_info',
       'stopword_removed_critics_consensus', 'lemmatized_movie_title',
       'lemmatized_movie_info', 'lemmatized_critics_consensus',
       'sentiment_scores', 'compound_score', 'sentiment'],
      dtype='object')

In [42]:
# Subset of the exploded frame that is written to the Tableau CSV: descriptive
# attributes, critic/audience metrics, dates and the sentiment results.
tableau_columns = [
    'movie_title',
    'content_rating',
    'genres',
    'directors',
    'authors',
    'actors',
    'production_company',
    'tomatometer_status',
    'tomatometer_rating',
    'tomatometer_count',
    'audience_status',
    'audience_rating',
    'audience_count',
    'runtime',
    'original_release_date',
    'streaming_release_date',
    'sentiment',
    'compound_score',
]
df2 = df_exploded[tableau_columns]

In [43]:
# Write the selected columns of the exploded frame to CSV, omitting the index.
df2.to_csv('../data/rotten_tomatoes_movies_tableau.csv', index=False)

In [44]:
# Write the un-exploded frame `df` (one row per movie) to CSV, omitting the index.
df.to_csv('../data/rotten_tomatoes_movies_cleaned.csv', index=False)