In [1]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load preprocessed data
movies_df = pd.read_csv("../Resources/movies_data_preprocessed.csv")
movies_df.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,th Century,Paramount,top_company,top_director,mid_director,low_director,one_director,top_actor,mid_actor,tomatometer_status
0,119.0,0,0,0,1,0,0,1,1,1,...,1,0,1,0,1,0,0,0,8,Rotten
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,2,3,Fresh
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,Fresh
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,2,Fresh
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,1,0,1,0,0,0,3,Fresh


In [3]:
# Define the target set
y = movies_df["tomatometer_status"]
y.value_counts()

Fresh     7809
Rotten    6628
Name: tomatometer_status, dtype: int64

In [4]:
# Define the features set
X = movies_df.drop(columns=["tomatometer_status"])
X.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Universal,th Century,Paramount,top_company,top_director,mid_director,low_director,one_director,top_actor,mid_actor
0,119.0,0,0,0,1,0,0,1,1,1,...,0,1,0,1,0,1,0,0,0,8
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0,2,3
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,1
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,2
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,1,0,1,0,0,0,3


In [5]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# TF-IDF on clean_text column
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train["clean_text"]) 
X_test_tfidf = vectorizer.transform(X_test["clean_text"])

In [7]:
# Create a Naive Bayes classifier
nb = MultinomialNB()

# Fitting the model
nb.fit(X_train_tfidf, y_train) 

# Making predictions using the testing data
y_predict = nb.predict(X_test_tfidf)

# Evaluate model and display results
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

       Fresh       0.62      0.75      0.68      1975
      Rotten       0.60      0.45      0.51      1635

    accuracy                           0.61      3610
   macro avg       0.61      0.60      0.60      3610
weighted avg       0.61      0.61      0.60      3610



In [8]:
# Get "Fresh" status probabilty to use as feature in Random Forest model
y_prob_train = nb.predict_proba(X_train_tfidf)[:,0]
y_prob_test = nb.predict_proba(X_test_tfidf)[:,0]

In [9]:
# Add new probability column to original train/test DataFrames
X_train_prob = X_train.drop("clean_text", axis=1)
X_train_prob["fresh_prob"] = y_prob_train
X_test_prob = X_test.drop("clean_text", axis=1)
X_test_prob["fresh_prob"] = y_prob_test

In [10]:
# Create a random forest classifier
model = RandomForestClassifier(n_estimators=600, max_depth=20, random_state=1)

# Fitting the model
model = model.fit(X_train_prob, y_train)

# Making predictions using the testing data
y_pred = model.predict(X_test_prob)

# Evaluate model and display results
print("Confusion Matrix")
display(confusion_matrix(y_test, y_pred))
print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


array([[1422,  553],
       [ 554, 1081]], dtype=int64)

Accuracy Score : 0.6933518005540166
Classification Report
              precision    recall  f1-score   support

       Fresh       0.72      0.72      0.72      1975
      Rotten       0.66      0.66      0.66      1635

    accuracy                           0.69      3610
   macro avg       0.69      0.69      0.69      3610
weighted avg       0.69      0.69      0.69      3610



In [11]:
# List the features sorted in descending order by feature importance
for importance, feature in sorted(zip(model.feature_importances_, X_train_prob.columns), reverse=True):
    print(f"{feature}: {importance}")

fresh_prob: 0.21738047586965512
runtime: 0.08940526449560109
info_char_count: 0.08295077094634477
info_word_count: 0.07516343250736605
title_char_count: 0.0668707078493122
mid_actor: 0.05330980987667353
title_word_count: 0.03784557783118042
total_genres: 0.030071750594792255
Classics: 0.028443964204109117
content_rating_NR: 0.02560940384460866
top_actor: 0.020591532731955985
one_director: 0.01830621324991877
Drama: 0.017674371019560833
Art House & International: 0.016504802910613903
low_director: 0.016256527078858024
content_rating_PG-13: 0.013674679088416398
Comedy: 0.013157905811216484
mid_director: 0.01290274861722138
Documentary: 0.01264777261719456
Mystery & Suspense: 0.012064537504953697
Action & Adventure: 0.011897181629584506
content_rating_R: 0.01162904639053214
top_company: 0.010330438660518759
Horror: 0.009621497866415758
Romance: 0.009026690669855779
Science Fiction & Fantasy: 0.00859502901417726
content_rating_PG: 0.006897973680612313
Sony: 0.0062571851996273496
Warner: 0.