In [1]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load preprocessed data
movies_df = pd.read_csv("movies_data_preprocessed.csv")
movies_df.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,th Century,Paramount,top_company,top_director,mid_director,low_director,one_director,top_actor,mid_actor,tomatometer_status
0,119.0,0,0,0,1,0,0,1,1,1,...,1,0,1,0,1,0,0,0,8,Rotten
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,2,3,Fresh
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,Fresh
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,2,Fresh
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,1,0,1,0,0,0,3,Fresh


In [3]:
# Define the target set
y = movies_df["tomatometer_status"]
y.value_counts()

Fresh     7809
Rotten    6628
Name: tomatometer_status, dtype: int64

In [4]:
# Define the features set
X = movies_df.drop("tomatometer_status", axis=1)
X.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Universal,th Century,Paramount,top_company,top_director,mid_director,low_director,one_director,top_actor,mid_actor
0,119.0,0,0,0,1,0,0,1,1,1,...,0,1,0,1,0,1,0,0,0,8
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0,2,3
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,1
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,2
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,1,0,1,0,0,0,3


In [5]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# TF-IDF on clean_text
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_vectors = vectorizer.fit_transform(X_train["clean_text"]) 
X_test_vectors = vectorizer.transform(X_test["clean_text"])

In [7]:
# Merge original train/test DataFrames with new TF-IDF feature columns
tfidf_train_df = pd.DataFrame(X_train_vectors.toarray(), columns=vectorizer.get_feature_names_out(), index=X_train.index)
X_train_tfidf = X_train.drop("clean_text", axis=1).merge(tfidf_train_df, left_index=True, right_index=True)
tfidf_test_df = pd.DataFrame(X_test_vectors.toarray(), columns=vectorizer.get_feature_names_out(), index=X_test.index)
X_test_tfidf = X_test.drop("clean_text", axis=1).merge(tfidf_test_df, left_index=True, right_index=True)

In [8]:
# Create a random forest classifier
model = RandomForestClassifier(n_estimators=600, max_depth=200, random_state=1)

In [9]:
# Fitting the model
model = model.fit(X_train_tfidf, y_train)

In [10]:
# Making predictions using the testing data
y_pred = model.predict(X_test_tfidf)
y_pred

array(['Rotten', 'Fresh', 'Rotten', ..., 'Rotten', 'Fresh', 'Rotten'],
      dtype=object)

In [11]:
# Evaluate model and display results
print("Confusion Matrix")
display(confusion_matrix(y_test, y_pred))
print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


array([[1485,  490],
       [ 567, 1068]], dtype=int64)

Accuracy Score : 0.707202216066482
Classification Report
              precision    recall  f1-score   support

       Fresh       0.72      0.75      0.74      1975
      Rotten       0.69      0.65      0.67      1635

    accuracy                           0.71      3610
   macro avg       0.70      0.70      0.70      3610
weighted avg       0.71      0.71      0.71      3610



In [12]:
# List the features sorted in descending order by feature importance
for importance, feature in sorted(zip(model.feature_importances_, X_train_tfidf.columns), reverse=True):
    print(f"{feature}: {importance}")

runtime: 0.015954448275934452
content_rating_NR: 0.01499476583482065
mid_actor: 0.014651512477123233
Classics: 0.014109382170972792
info_char_count: 0.012178972258200676
info_word_count: 0.010831634289848843
title_char_count: 0.008216914552760602
Art House & International: 0.00786252364619544
Drama: 0.007171512137755916
Documentary: 0.006913264157016564
content_rating_PG-13: 0.006603325289674685
total_genres: 0.006257351569095503
title_word_count: 0.0051052837718813665
content_rating_R: 0.004204163439976329
one_director: 0.004112601507632792
top_actor: 0.0036845558084213944
Comedy: 0.003353766853405384
Horror: 0.003129883008557813
film: 0.0029675654366105833
find: 0.0027735147125636608
top_director: 0.0027449676884945835
young: 0.0026346446525511726
Special Interest: 0.002601400189532079
life: 0.0025459627576234745
man: 0.0025224781987688177
Action & Adventure: 0.002509510697673252
Mystery & Suspense: 0.0023939327296112136
becom: 0.002353082334432637
new: 0.002345266591480326
mid_direc

orphanag: 6.985785772471145e-05
samuel jackson: 6.985373723615171e-05
must decid: 6.98283021555934e-05
inept: 6.982148877591327e-05
peopl live: 6.978949420752456e-05
parisian: 6.976563107399648e-05
huston: 6.975339946258605e-05
africanamerican: 6.970313440136065e-05
bacon: 6.966232112889298e-05
humor: 6.965372501798325e-05
norman: 6.962609927868926e-05
belief: 6.961972178718761e-05
edgar: 6.950642150561746e-05
take new: 6.94528320574399e-05
guardian: 6.944588223694417e-05
defi: 6.928743498954336e-05
servant: 6.926393978884184e-05
nanni: 6.924709577005634e-05
outrag: 6.924020514547863e-05
lee jone: 6.921675630502681e-05
hike: 6.919950534469538e-05
hockey: 6.919901687897325e-05
bean: 6.917875518271399e-05
harmoni: 6.912657639717789e-05
save world: 6.90318582479074e-05
obstacl: 6.90024357150498e-05
upcom: 6.897829413835708e-05
mysteri man: 6.895918376513344e-05
regain: 6.890841521354124e-05
julian: 6.890641566936401e-05
jame bond: 6.889131631695034e-05
michel: 6.881376539469358e-05
christ