In [1]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load preprocessed data
movies_df = pd.read_csv("movies_data_preprocessed.csv")
movies_df.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,th Century,Paramount,top_company,top_director,mid_director,low_director,one_director,top_actor,mid_actor,tomatometer_status
0,119.0,0,0,0,1,0,0,1,1,1,...,1,0,1,0,1,0,0,0,8,Rotten
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,2,3,Fresh
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,Fresh
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,2,Fresh
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,1,0,1,0,0,0,3,Fresh


In [3]:
# Define the target set
y = movies_df["tomatometer_status"]
y.value_counts()

Fresh     7809
Rotten    6628
Name: tomatometer_status, dtype: int64

In [4]:
# Define the features set
X = movies_df.drop(columns=["tomatometer_status", "clean_text"])
X.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Universal,th Century,Paramount,top_company,top_director,mid_director,low_director,one_director,top_actor,mid_actor
0,119.0,0,0,0,1,0,0,1,1,1,...,0,1,0,1,0,1,0,0,0,8
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0,2,3
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,1
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,2
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,1,0,1,0,0,0,3


In [5]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Create a random forest classifier
model = RandomForestClassifier(n_estimators=250, max_depth=15, random_state=1)

In [7]:
# Fitting the model
model = model.fit(X_train, y_train)

In [8]:
# Making predictions using the testing data
y_pred = model.predict(X_test)
y_pred

array(['Rotten', 'Fresh', 'Rotten', ..., 'Rotten', 'Fresh', 'Fresh'],
      dtype=object)

In [9]:
# Evaluate model and display results
print("Confusion Matrix")
display(confusion_matrix(y_test, y_pred))
print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


array([[1411,  564],
       [ 510, 1125]], dtype=int64)

Accuracy Score : 0.7024930747922438
Classification Report
              precision    recall  f1-score   support

       Fresh       0.73      0.71      0.72      1975
      Rotten       0.67      0.69      0.68      1635

    accuracy                           0.70      3610
   macro avg       0.70      0.70      0.70      3610
weighted avg       0.70      0.70      0.70      3610



In [10]:
# List the features sorted in descending order by feature importance
for importance, feature in sorted(zip(model.feature_importances_, X.columns), reverse=True):
    print(f"{feature}: {importance}")

runtime: 0.10803008504964079
info_char_count: 0.09516437088453784
info_word_count: 0.08275527676451658
mid_actor: 0.07051186839661383
title_char_count: 0.0703344699083734
content_rating_NR: 0.049628917260209286
Classics: 0.04945843529027765
title_word_count: 0.041070519804334446
total_genres: 0.036148752911653786
Art House & International: 0.030397005918304944
Drama: 0.029973710315678737
Documentary: 0.02825874616275449
one_director: 0.024099291329743033
top_actor: 0.02286236436516524
content_rating_PG-13: 0.02136081223449855
low_director: 0.017829574935831906
mid_director: 0.015821706957775336
Action & Adventure: 0.015306503356109152
Comedy: 0.015226446490868389
Mystery & Suspense: 0.014709743178400394
content_rating_R: 0.014394150656473549
Horror: 0.013765001973089283
top_company: 0.01170481274627593
top_director: 0.009723556822697123
Science Fiction & Fantasy: 0.00970426184536211
Romance: 0.00952195352932574
Special Interest: 0.008740672899314612
content_rating_PG: 0.008128850765455