In [1]:
# Import dependencies
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pickle

In [2]:
# Connect to SQL database
# The URL targets a PostgreSQL server on 127.0.0.1:5432, user "postgres",
# database "Rotten_Tomato_Analysis"; the password comes from the local `config` module.
# NOTE(review): db_password is interpolated into the URL verbatim -- presumably it
# contains no characters that need URL-escaping (e.g. "@", "/", ":"); confirm.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Rotten_Tomato_Analysis"
engine = create_engine(db_string)

In [3]:
# Load table from database to DataFrame
# Reads every column and row of `full_table_rotten_analysis` through the engine.
movies_df = pd.read_sql_query('SELECT * FROM full_table_rotten_analysis',con=engine)
# Bare last expression: the notebook renders the DataFrame as the cell output.
movies_df

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,content_rating,genres,directors,authors,actors,runtime,production_company,...,streaming_month,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",119.0,20th Century Fox,...,11,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",90.0,Sony Pictures Classics,...,9,Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",122.0,Waner Bros.,...,7,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",95.0,Criterion Collection,...,1,Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",127.0,Disney,...,6,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14432,m/zoom_2006,Zoom,"Capt. Zoom, or Jack (Tim Allen), as he is now ...",PG,"Action & Adventure, Comedy, Kids & Family",Peter Hewitt,"Adam Rifkin, David Berenbaum","Tim Allen, Courteney Cox, Chevy Chase, Spencer...",88.0,Sony Pictures Entertainment,...,4,Rotten,4.0,68.0,Spilled,33.0,11369.0,19,3,65
14433,m/zoot_suit,Zoot Suit,Mexican-American gangster Henry Reyna (Daniel ...,R,"Drama, Musical & Performing Arts",Luis Valdez,Luis Valdez,"Daniel Valdez, Edward James Olmos, Charles Aid...",104.0,MCA Universal Home Video,...,4,Rotten,56.0,9.0,Upright,74.0,1195.0,2,5,4
14434,m/zootopia,Zootopia,From the largest elephant to the smallest shre...,PG,"Action & Adventure, Animation, Comedy","Byron Howard, Rich Moore, Jared Bush","Jared Bush, Phil Johnston","J.K. Simmons, Kristen Bell, Octavia Spencer, A...",108.0,Walt Disney Animation Studios,...,6,Fresh,98.0,291.0,Upright,92.0,101511.0,50,285,7
14435,m/zulu,Zulu,"In 1879, the Zulu nation hands colonial Britis...",PG,"Classics, Drama","Cy Endfield, Cyril Endfield","Cy Endfield, John Prebble","Stanley Baker, Jack Hawkins, Ulla Jacobsson, J...",135.0,Paramount Pictures,...,1,Fresh,96.0,23.0,Upright,91.0,30193.0,6,22,1


In [4]:
# Build the base feature frame for the ML model.
# get_dummies expands the categorical content rating into one indicator column
# per rating value and carries the numeric runtime column through as-is.
base_feature_columns = ["content_rating", "runtime"]
movies_preprocessed_df = pd.get_dummies(movies_df[base_feature_columns])
movies_preprocessed_df.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R
0,119.0,0,0,0,1,0,0
1,90.0,0,0,0,0,0,1
2,122.0,0,0,0,0,0,1
3,95.0,0,0,1,0,0,0
4,127.0,1,0,0,0,0,0


In [5]:
# Create list of unique genres, ordered from most to least frequent.
# Each movie stores its genres as one ", "-separated string.
genre_list = movies_df["genres"].str.split(", ").explode().value_counts().index.tolist()

# One-hot encode the genres by matching WHOLE genre names.
# A plain substring test (`genre in genres_string`) would also fire when one genre
# name happens to be contained in another, and it raises TypeError on missing values.
# str.get_dummies splits on the separator and compares complete tokens; rows with a
# missing genre string get all zeros, and empty tokens are not treated as a genre.
# reindex() puts the columns in the same frequency order as genre_list.
genre_dummies = (
    movies_df["genres"]
    .str.get_dummies(sep=", ")
    .reindex(columns=genre_list, fill_value=0)
)

# Add separate binary column for each genre (same column names and order as genre_list)
for genre in genre_list:
    movies_preprocessed_df[genre] = genre_dummies[genre]

# Add column for total number of genres
movies_preprocessed_df["total_genres"] = genre_dummies.sum(axis=1)

movies_preprocessed_df.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Special Interest,Animation,Western,Television,Sports & Fitness,Cult Movies,Gay & Lesbian,Faith & Spirituality,Anime & Manga,total_genres
0,119.0,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,4
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,2
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,3


In [6]:
# Text-length features: word and character counts for the title and the synopsis.
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


title_text = movies_df["movie_title"]
info_text = movies_df["movie_info"]

movies_preprocessed_df["title_word_count"] = title_text.map(_count_words)
movies_preprocessed_df["title_char_count"] = title_text.map(len)
movies_preprocessed_df["info_word_count"] = info_text.map(_count_words)
movies_preprocessed_df["info_char_count"] = info_text.map(len)
movies_preprocessed_df.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Sports & Fitness,Cult Movies,Gay & Lesbian,Faith & Spirituality,Anime & Manga,total_genres,title_word_count,title_char_count,info_word_count,info_char_count
0,119.0,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,4,8,50,79,454
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,1,2,11,83,486
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,2,1,2,48,279
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,2,6,31,76,450
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,3,5,28,78,489


In [7]:
# Define the target set
# The label is the "tomatometer_status" column of the loaded table.
y = movies_df["tomatometer_status"]
# Show how many rows fall into each label value.
y.value_counts()

Fresh     7809
Rotten    6628
Name: tomatometer_status, dtype: int64

In [8]:
# Define the features set
# NOTE: this binds X to the same DataFrame object as movies_preprocessed_df (no copy),
# so any later in-place change made through one name is visible through the other.
X = movies_preprocessed_df
X.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Sports & Fitness,Cult Movies,Gay & Lesbian,Faith & Spirituality,Anime & Manga,total_genres,title_word_count,title_char_count,info_word_count,info_char_count
0,119.0,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,4,8,50,79,454
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,1,2,11,83,486
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,2,1,2,48,279
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,2,6,31,76,450
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,3,5,28,78,489


In [9]:
# Hold out part of the data for evaluation.
# test_size=0.25 spells out scikit-learn's default split fraction;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=27,
)

In [10]:
# Build the random forest and train it on the training split
# (fit() returns the fitted estimator, so construction and fitting can be chained).
model = RandomForestClassifier(
    n_estimators=600,
    max_depth=12,
    random_state=1,
).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Compute the evaluation artefacts first, then report them in order
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix")
display(conf_matrix)
print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")
print("Classification Report")
print(report)

Confusion Matrix


array([[1419,  558],
       [ 560, 1073]], dtype=int64)

Accuracy Score : 0.6903047091412743
Classification Report
              precision    recall  f1-score   support

       Fresh       0.72      0.72      0.72      1977
      Rotten       0.66      0.66      0.66      1633

    accuracy                           0.69      3610
   macro avg       0.69      0.69      0.69      3610
weighted avg       0.69      0.69      0.69      3610



In [11]:
# Save the model
filename = 'finalized_model.pkl'
# Use context managers so each file handle is flushed and closed deterministically;
# a bare open() passed straight to pickle leaves closing to the garbage collector,
# which can leak the handle or leave the file incompletely written before re-loading.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model
# NOTE: pickle.load can execute arbitrary code -- only load pickle files you trust
# (here it is the file written just above).
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Round-trip check: score the reloaded model on the held-out test split.
loaded_model.score(X_test, y_test)

0.6903047091412743