In [1]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the preprocessed movie table from the CSV file in the working
# directory, then preview its first five rows as the cell output.
movies_df = pd.read_csv(filepath_or_buffer="movies_data_preprocessed.csv")
movies_df.head(n=5)

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Cult Movies,Gay & Lesbian,Faith & Spirituality,Anime & Manga,total_genres,title_word_count,title_char_count,info_word_count,info_char_count,tomatometer_status
0,119.0,0,0,0,1,0,0,1,1,1,...,0,0,0,0,4,8,50,79,454,Rotten
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,2,11,83,486,Fresh
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,2,1,2,48,279,Fresh
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,2,6,31,76,450,Fresh
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,3,5,28,78,489,Fresh


In [3]:
# Target vector: the "tomatometer_status" column of the loaded frame.
y = movies_df.loc[:, "tomatometer_status"]

# Show how many rows fall into each label so class balance is visible.
y.value_counts()

Fresh     7809
Rotten    6628
Name: tomatometer_status, dtype: int64

In [4]:
# Feature matrix: every column of the loaded frame except the target label.
# `drop` returns a new frame, so `movies_df` itself is left untouched.
X = movies_df.drop(columns="tomatometer_status")
X.head(n=5)

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Sports & Fitness,Cult Movies,Gay & Lesbian,Faith & Spirituality,Anime & Manga,total_genres,title_word_count,title_char_count,info_word_count,info_char_count
0,119.0,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,4,8,50,79,454
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,1,2,11,83,486
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,2,1,2,48,279
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,2,6,31,76,450
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,3,5,28,78,489


In [5]:
# Partition features and labels into training and testing subsets.
# No test_size is passed, so scikit-learn's default hold-out fraction applies;
# random_state pins the shuffle so the split is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [6]:
# Instantiate (but do not yet train) a random forest of 128 trees.
# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
)

In [7]:
# Train the forest on the training partition.
# The result is assigned back to `model`, which also keeps the cell from
# echoing the estimator as output.
model = model.fit(X=X_train, y=y_train)

In [8]:
# Predict a label for every row of the held-out test features, then echo the
# resulting array as the cell output.
y_pred = model.predict(X=X_test)
y_pred

array(['Rotten', 'Fresh', 'Rotten', ..., 'Rotten', 'Fresh', 'Rotten'],
      dtype=object)

In [9]:
# Report test-set performance in three parts, in this order:
#   1. confusion matrix (true labels vs. predicted labels),
#   2. overall accuracy,
#   3. per-class precision / recall / F1 report.
print("Confusion Matrix")
display(confusion_matrix(y_true=y_test, y_pred=y_pred))

print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")

print("Classification Report")
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion Matrix


array([[1404,  571],
       [ 608, 1027]], dtype=int64)

Accuracy Score : 0.6734072022160665
Classification Report
              precision    recall  f1-score   support

       Fresh       0.70      0.71      0.70      1975
      Rotten       0.64      0.63      0.64      1635

    accuracy                           0.67      3610
   macro avg       0.67      0.67      0.67      3610
weighted avg       0.67      0.67      0.67      3610



In [10]:
# Pair each fitted importance score with its column name, order the pairs from
# most to least important, and print one "name: score" line per feature.
# Pairs are (importance, feature) so the sort keys on the score first.
ranked_features = list(zip(model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)

for importance, feature in ranked_features:
    print(f"{feature}: {importance}")

runtime: 0.1658739044929453
info_char_count: 0.1644626153354271
info_word_count: 0.14360399187171738
title_char_count: 0.1291335035892085
title_word_count: 0.06512051567547807
total_genres: 0.03993590508480537
Classics: 0.03198856088530258
content_rating_NR: 0.029152721242087065
Art House & International: 0.021419506003871974
Drama: 0.02123924959952311
Documentary: 0.018609787050418766
Comedy: 0.017912548335970974
Action & Adventure: 0.01615040366782321
Mystery & Suspense: 0.015884436811694703
content_rating_PG-13: 0.015491788288239335
content_rating_R: 0.012745151601802637
Romance: 0.012244949243776322
Horror: 0.012014147803840396
Science Fiction & Fantasy: 0.011626417665070055
content_rating_PG: 0.008539753007026632
Musical & Performing Arts: 0.00797389263919024
Special Interest: 0.00728474677791859
Kids & Family: 0.006421734367288615
Animation: 0.005706925139486366
content_rating_G: 0.005014571328807379
Western: 0.004077777146919002
Television: 0.0022701538972431925
Sports & Fitness