In [1]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the preprocessed movie table from the CSV file into a DataFrame,
# then preview its first five rows as the cell output.
movies_df = pd.read_csv(filepath_or_buffer="movies_data_preprocessed.csv")
movies_df.head(n=5)

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Animation,Western,Television,Sports & Fitness,Cult Movies,Gay & Lesbian,Faith & Spirituality,Anime & Manga,total_genres,tomatometer_status
0,119.0,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,4,Rotten
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,Fresh
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,2,Fresh
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,2,Fresh
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,3,Fresh


In [3]:
# Target vector: the "tomatometer_status" column of the loaded frame.
# The value counts shown as cell output give the class distribution.
target_column = "tomatometer_status"
y = movies_df[target_column]
y.value_counts()

Fresh     7809
Rotten    6628
Name: tomatometer_status, dtype: int64

In [4]:
# Feature matrix: every column of the loaded frame except the target column.
# `drop` returns a new frame, so `movies_df` itself is left untouched.
X = movies_df.drop(columns=["tomatometer_status"])
X.head(n=5)

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Special Interest,Animation,Western,Television,Sports & Fitness,Cult Movies,Gay & Lesbian,Faith & Spirituality,Anime & Manga,total_genres
0,119.0,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,4
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,2
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,3


In [5]:
# Partition features and target into training and testing subsets.
# The fixed random_state makes the shuffle (and so the split) repeatable.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Configure the random forest classifier (not yet fitted):
# 128 estimators, with a fixed seed so repeated runs build the same forest.
forest_params = {"n_estimators": 128, "random_state": 1}
model = RandomForestClassifier(**forest_params)

In [7]:
# Train the classifier on the training subset.
# The result of `fit` is assigned back to `model`, which also keeps the
# cell from echoing the estimator as output.
model = model.fit(X=X_train, y=y_train)

In [8]:
# Predict a class label for each row of the testing features,
# then show the resulting array as the cell output.
y_pred = model.predict(X=X_test)
y_pred

array(['Rotten', 'Fresh', 'Rotten', ..., 'Rotten', 'Fresh', 'Fresh'],
      dtype=object)

In [9]:
# Compare the test-set predictions against the true labels.
# All three metrics are computed up front, then reported in a fixed order:
# confusion matrix, accuracy, per-class classification report.
test_confusion = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Confusion Matrix")
display(test_confusion)
print(f"Accuracy Score : {test_accuracy}")
print("Classification Report")
print(test_report)

Confusion Matrix


array([[1316,  659],
       [ 685,  950]], dtype=int64)

Accuracy Score : 0.6277008310249308
Classification Report
              precision    recall  f1-score   support

       Fresh       0.66      0.67      0.66      1975
      Rotten       0.59      0.58      0.59      1635

    accuracy                           0.63      3610
   macro avg       0.62      0.62      0.62      3610
weighted avg       0.63      0.63      0.63      3610



In [10]:
# Pair each fitted importance score with its feature column name, rank the
# pairs from highest to lowest score, and print one "name: score" line each.
ranked_features = sorted(
    zip(model.feature_importances_, X.columns),
    reverse=True,
)
for score, name in ranked_features:
    print(f"{name}: {score}")

runtime: 0.5354041597738443
total_genres: 0.05530015237095062
Classics: 0.048640738143739246
content_rating_NR: 0.037440522790423536
Art House & International: 0.03156174233821141
Drama: 0.03001791646963434
Documentary: 0.025499310537202998
Comedy: 0.022940634385735667
Action & Adventure: 0.020276817668137735
Mystery & Suspense: 0.01999095640476258
content_rating_PG-13: 0.018630457755554403
Science Fiction & Fantasy: 0.018156272280587755
Romance: 0.01629950571419173
Horror: 0.01591555997303894
content_rating_R: 0.014116465771220056
Musical & Performing Arts: 0.012247623238435229
Special Interest: 0.012040194563370257
content_rating_PG: 0.010604109785443301
Kids & Family: 0.01024155159141516
Animation: 0.008997941282472947
content_rating_G: 0.007923700932050588
Western: 0.007675554381146463
Television: 0.005061861726234701
Sports & Fitness: 0.0038325755526775643
Cult Movies: 0.0038011006852642043
Gay & Lesbian: 0.0036723515656921645
Faith & Spirituality: 0.0026687294378910523
content_ra