In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Read the raw Rotten Tomatoes movie table from CSV and preview the first rows
movies_csv = "rotten_tomatoes_movies.csv"
movies_df = pd.read_csv(movies_csv)
movies_df.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


In [3]:
# Get info about columns: print each column's non-null count and dtype
# so that columns with missing values can be spotted before cleaning
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17712 entries, 0 to 17711
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   rotten_tomatoes_link              17712 non-null  object 
 1   movie_title                       17712 non-null  object 
 2   movie_info                        17391 non-null  object 
 3   critics_consensus                 9134 non-null   object 
 4   content_rating                    17712 non-null  object 
 5   genres                            17693 non-null  object 
 6   directors                         17518 non-null  object 
 7   authors                           16170 non-null  object 
 8   actors                            17360 non-null  object 
 9   original_release_date             16546 non-null  object 
 10  streaming_release_date            17328 non-null  object 
 11  runtime                           17398 non-null  float64
 12  prod

In [4]:
# Build the cleaned frame in one chain:
#   1. remove the critics_consensus column,
#   2. discard every row that still has a missing value,
#   3. fold "Certified-Fresh" into "Fresh" so the target has two classes.
movies_df_clean = (
    movies_df
    .drop(columns=["critics_consensus"])
    .dropna()
    .assign(
        tomatometer_status=lambda frame: frame["tomatometer_status"].replace(
            {"Certified-Fresh": "Fresh"}
        )
    )
)

# Report the remaining shape, then confirm that no nulls are left in any column
print(movies_df_clean.shape)
movies_df_clean.isna().sum()

(14437, 21)


rotten_tomatoes_link                0
movie_title                         0
movie_info                          0
content_rating                      0
genres                              0
directors                           0
authors                             0
actors                              0
original_release_date               0
streaming_release_date              0
runtime                             0
production_company                  0
tomatometer_status                  0
tomatometer_rating                  0
tomatometer_count                   0
audience_status                     0
audience_rating                     0
audience_count                      0
tomatometer_top_critics_count       0
tomatometer_fresh_critics_count     0
tomatometer_rotten_critics_count    0
dtype: int64

In [5]:
# Find number of unique values in each column of the cleaned frame
# (the per-column distinct-value count is displayed as the cell output)
movies_df_clean.nunique()

rotten_tomatoes_link                14437
movie_title                         14002
movie_info                          14436
content_rating                          6
genres                                990
directors                            7093
authors                             11612
actors                              14435
original_release_date                5337
streaming_release_date               2066
runtime                               174
production_company                   2431
tomatometer_status                      2
tomatometer_rating                    101
tomatometer_count                     389
audience_status                         2
audience_rating                        98
audience_count                      10320
tomatometer_top_critics_count          69
tomatometer_fresh_critics_count       345
tomatometer_rotten_critics_count      199
dtype: int64

## Genres

In [6]:
# Tally how often each raw (comma-joined) genre string occurs, to judge whether binning is practical
genres = movies_df_clean["genres"]
genre_string_counts = genres.value_counts()
genre_string_counts

Drama                                                                            1600
Comedy                                                                           1078
Comedy, Drama                                                                     792
Drama, Mystery & Suspense                                                         652
Art House & International, Drama                                                  492
                                                                                 ... 
Action & Adventure, Documentary, Television, Sports & Fitness                       1
Art House & International, Drama, Mystery & Suspense, Gay & Lesbian                 1
Comedy, Kids & Family, Science Fiction & Fantasy, Romance                           1
Drama, Musical & Performing Arts, Science Fiction & Fantasy, Special Interest       1
Action & Adventure, Drama, Horror, Kids & Family, Mystery & Suspense                1
Name: genres, Length: 990, dtype: int64

In [7]:
# Transform genres to lists of genres to find all unique genres and value counts.
# The lists are derived from movies_df_clean rather than from `genres` itself, so the
# cell is idempotent: re-running it does not try to split already-split lists.
genres = movies_df_clean["genres"].str.split(", ")
genres.explode().value_counts()

Drama                        8349
Comedy                       4968
Action & Adventure           3128
Mystery & Suspense           3114
Art House & International    2020
Romance                      1748
Horror                       1670
Science Fiction & Fantasy    1601
Classics                     1542
Kids & Family                 962
Documentary                   892
Musical & Performing Arts     686
Special Interest              649
Animation                     485
Western                       282
Television                    151
Sports & Fitness              128
Cult Movies                    91
Gay & Lesbian                  65
Faith & Spirituality           58
Anime & Manga                  14
Name: genres, dtype: int64

In [8]:
# Collect the distinct genre names, ordered by how often each one occurs
genre_counts = genres.explode().value_counts()
genre_list = list(genre_counts.index)
genre_list

['Drama',
 'Comedy',
 'Action & Adventure',
 'Mystery & Suspense',
 'Art House & International',
 'Romance',
 'Horror',
 'Science Fiction & Fantasy',
 'Classics',
 'Kids & Family',
 'Documentary',
 'Musical & Performing Arts',
 'Special Interest',
 'Animation',
 'Western',
 'Television',
 'Sports & Fitness',
 'Cult Movies',
 'Gay & Lesbian',
 'Faith & Spirituality',
 'Anime & Manga']

In [9]:
# Create DataFrame with separate 0/1 indicator column for each genre.
# str.get_dummies splits every entry on the ", " delimiter and matches whole genre
# names, so one genre can never be flagged merely because its name is a substring
# of another; it also replaces one Python-level pass per genre with a single call.
genre_flags = (
    movies_df_clean["genres"]
    .str.get_dummies(sep=", ")
    # keep the column order given by genre_list; a genre absent from the data becomes all zeros
    .reindex(columns=genre_list, fill_value=0)
)
genre_df = pd.DataFrame(movies_df_clean["genres"]).join(genre_flags)

# Add column for total number of genres
genre_df["total_genres"] = genre_df[genre_list].sum(axis=1)

genre_df

Unnamed: 0,genres,Drama,Comedy,Action & Adventure,Mystery & Suspense,Art House & International,Romance,Horror,Science Fiction & Fantasy,Classics,...,Special Interest,Animation,Western,Television,Sports & Fitness,Cult Movies,Gay & Lesbian,Faith & Spirituality,Anime & Manga,total_genres
0,"Action & Adventure, Comedy, Drama, Science Fic...",1,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,4
1,Comedy,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,"Comedy, Romance",0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,"Classics, Drama",1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2
4,"Action & Adventure, Drama, Kids & Family",1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17706,"Action & Adventure, Comedy, Kids & Family",0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
17707,"Drama, Musical & Performing Arts",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
17708,"Action & Adventure, Animation, Comedy",0,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,3
17710,"Classics, Drama",1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2


## Movie Title & Info

In [10]:
# Create DataFrame with text length analysis (word and character counts).
# Uses the vectorised .str accessor instead of per-row lambdas:
#   .str.split() with no argument splits on runs of whitespace, so
#   .str.split().str.len() is the word count and .str.len() the character count.
text_df = movies_df_clean[["movie_title", "movie_info"]].copy()
text_df["title_word_count"] = text_df["movie_title"].str.split().str.len()
text_df["title_char_count"] = text_df["movie_title"].str.len()
text_df["info_word_count"] = text_df["movie_info"].str.split().str.len()
text_df["info_char_count"] = text_df["movie_info"].str.len()
text_df

Unnamed: 0,movie_title,movie_info,title_word_count,title_char_count,info_word_count,info_char_count
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",8,50,79,454
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,2,11,83,486
2,10,"A successful, middle-aged Hollywood songwriter...",1,2,48,279
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,6,31,76,450
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",5,28,78,489
...,...,...,...,...,...,...
17706,Zoom,"Capt. Zoom, or Jack (Tim Allen), as he is now ...",1,4,53,291
17707,Zoot Suit,Mexican-American gangster Henry Reyna (Daniel ...,2,9,86,491
17708,Zootopia,From the largest elephant to the smallest shre...,1,8,79,468
17710,Zulu,"In 1879, the Zulu nation hands colonial Britis...",1,4,77,488


## Final Preprocessed Data

In [11]:
# Create preprocessed DataFrame for ML model.
# Only content_rating is one-hot encoded; runtime is numeric and passes through unchanged.
# dtype=int pins the indicator columns to 0/1: newer pandas releases default to bool,
# which would be written to the exported CSV as True/False instead.
movies_preprocessed_df = pd.get_dummies(
    movies_df_clean[["content_rating", "runtime"]],
    columns=["content_rating"],
    dtype=int,
)

# Attach the genre indicators and the text-length features, aligned on the row index
genre_features = genre_df.drop(columns="genres")
text_features = text_df.drop(columns=["movie_title", "movie_info"])
movies_preprocessed_df = (
    movies_preprocessed_df
    .join(genre_features, how="inner")
    .join(text_features, how="inner")
)

# Add target column
movies_preprocessed_df["tomatometer_status"] = movies_df_clean["tomatometer_status"]

movies_preprocessed_df

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Cult Movies,Gay & Lesbian,Faith & Spirituality,Anime & Manga,total_genres,title_word_count,title_char_count,info_word_count,info_char_count,tomatometer_status
0,119.0,0,0,0,1,0,0,1,1,1,...,0,0,0,0,4,8,50,79,454,Rotten
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,2,11,83,486,Fresh
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,2,1,2,48,279,Fresh
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,2,6,31,76,450,Fresh
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,3,5,28,78,489,Fresh
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17706,88.0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,3,1,4,53,291,Rotten
17707,104.0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,2,2,9,86,491,Rotten
17708,108.0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,3,1,8,79,468,Fresh
17710,135.0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,2,1,4,77,488,Fresh


In [12]:
# Write the model-ready table to CSV, leaving the row index out of the file
output_csv = "movies_data_preprocessed.csv"
movies_preprocessed_df.to_csv(output_csv, index=False)