In [1]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the preprocessed movie dataset (path is relative to this notebook)
# and preview its first rows
preprocessed_csv = "../Resources/movies_data_preprocessed.csv"
movies_df = pd.read_csv(preprocessed_csv)
movies_df.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,th Century,Paramount,top_company,top_director,mid_director,low_director,one_director,top_actor,mid_actor,tomatometer_status
0,119.0,0,0,0,1,0,0,1,1,1,...,1,0,1,0,1,0,0,0,8,Rotten
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,2,3,Fresh
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,Fresh
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,2,Fresh
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,1,0,1,0,0,0,3,Fresh


In [3]:
# Target vector: the tomatometer status label (Fresh / Rotten) of each movie
target_column = "tomatometer_status"
y = movies_df[target_column]

# Show how many rows fall into each class
y.value_counts()

Fresh     7809
Rotten    6628
Name: tomatometer_status, dtype: int64

## Drop Lower-Ranked Features

In [4]:
# Define the features set: start from every column and remove the target,
# the `clean_text` column, and the features that ranked lowest in the
# previous ML_model run
non_feature_columns = ["tomatometer_status", "clean_text"]
low_ranked_columns = [
    "Anime & Manga",
    "content_rating_NC17",
    "Faith & Spirituality",
    "Cult Movies",
    "Sports & Fitness",
    "Gay & Lesbian",
    "Television",
    "Disney",
    "Western",
    "th Century",
    "Universal",
    "Animation",
    "Paramount",
    "Kids & Family",
    "Warner",
    "Sony",
]
X = movies_df.drop(columns=non_feature_columns + low_ranked_columns)
X.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,Mystery & Suspense,...,title_char_count,info_word_count,info_char_count,top_company,top_director,mid_director,low_director,one_director,top_actor,mid_actor
0,119.0,0,0,1,0,0,1,1,1,0,...,50,79,454,1,0,1,0,0,0,8
1,90.0,0,0,0,0,1,0,1,0,0,...,11,83,486,1,0,0,1,0,2,3
2,122.0,0,0,0,0,1,0,1,0,0,...,2,48,279,0,1,0,0,0,0,1
3,95.0,0,1,0,0,0,1,0,0,0,...,31,76,450,0,1,0,0,0,0,2
4,127.0,1,0,0,0,0,1,0,1,0,...,28,78,489,1,0,1,0,0,0,3


In [5]:
# Hold out a test partition; the seed keeps the shuffle reproducible.
# test_size=0.25 is scikit-learn's default, written out here for clarity.
split_parts = train_test_split(X, y, test_size=0.25, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Build a random forest (500 trees, depth capped at 15, fixed seed) and
# train it on the training partition; fit() returns the fitted estimator
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    random_state=1,
).fit(X_train, y_train)

# Predict labels for the held-out test partition
y_pred = model.predict(X_test)

# Report: confusion matrix (rows = actual class, columns = predicted class),
# overall accuracy, then per-class precision / recall / f1
print("Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
display(cm)
print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


array([[1391,  584],
       [ 511, 1124]], dtype=int64)

Accuracy Score : 0.6966759002770083
Classification Report
              precision    recall  f1-score   support

       Fresh       0.73      0.70      0.72      1975
      Rotten       0.66      0.69      0.67      1635

    accuracy                           0.70      3610
   macro avg       0.69      0.70      0.70      3610
weighted avg       0.70      0.70      0.70      3610



In [7]:
# Pair every importance score with its column name, rank the pairs from
# most to least important, and print one "name: score" line per feature
ranked_pairs = list(zip(model.feature_importances_, X.columns))
ranked_pairs.sort(reverse=True)
for importance, feature in ranked_pairs:
    print(f"{feature}: {importance}")

runtime: 0.11954799822761038
info_char_count: 0.10575478358416154
info_word_count: 0.09429100665478786
title_char_count: 0.08024492013446168
mid_actor: 0.0708121050070113
Classics: 0.052044465672384216
content_rating_NR: 0.045620895159469176
title_word_count: 0.04528214857550695
total_genres: 0.03856573400105059
Art House & International: 0.029798886134176385
Drama: 0.029607688986121476
Documentary: 0.028031780616476817
one_director: 0.025381134107556803
top_actor: 0.024920417416105407
low_director: 0.019730939919716518
content_rating_PG-13: 0.01954043041002153
mid_director: 0.01638882738924646
Comedy: 0.016315406506730825
Action & Adventure: 0.01600051189728099
Mystery & Suspense: 0.01596280965623103
content_rating_R: 0.015083162741600238
top_company: 0.014486749128931579
Horror: 0.013849375061860344
Science Fiction & Fantasy: 0.010546995154709268
Romance: 0.010136099109089539
Special Interest: 0.009379589475203788
top_director: 0.008898149032308702
content_rating_PG: 0.00854089981982

## Select Only Higher-Ranked Features

In [8]:
# Define the features set: keep only the columns that ranked highest in the
# previous ML_model run
high_ranked_columns = [
    "runtime",
    "info_char_count",
    "info_word_count",
    "mid_actor",
    "title_char_count",
    "content_rating_NR",
    "Classics",
    "title_word_count",
    "total_genres",
    "Art House & International",
    "Drama",
    "Documentary",
    "one_director",
    "top_actor",
    "content_rating_PG-13",
    "low_director",
]
X = movies_df[high_ranked_columns]
X.head()

Unnamed: 0,runtime,info_char_count,info_word_count,mid_actor,title_char_count,content_rating_NR,Classics,title_word_count,total_genres,Art House & International,Drama,Documentary,one_director,top_actor,content_rating_PG-13,low_director
0,119.0,454,79,8,50,0,0,8,4,0,1,0,0,0,0,0
1,90.0,486,83,3,11,0,0,2,1,0,0,0,0,2,0,1
2,122.0,279,48,1,2,0,0,1,2,0,0,0,0,0,0,0
3,95.0,450,76,2,31,1,1,6,2,0,1,0,0,0,0,0
4,127.0,489,78,3,28,0,0,5,3,0,1,0,0,0,0,0


In [9]:
# Re-split using the reduced feature set; the seed keeps the shuffle
# reproducible. test_size=0.25 is scikit-learn's default, written out here.
split_parts = train_test_split(X, y, test_size=0.25, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Build a random forest (800 trees, depth capped at 12, fixed seed) and
# train it on the training partition; fit() returns the fitted estimator
model = RandomForestClassifier(
    n_estimators=800,
    max_depth=12,
    random_state=1,
).fit(X_train, y_train)

# Predict labels for the held-out test partition
y_pred = model.predict(X_test)

# Report: confusion matrix (rows = actual class, columns = predicted class),
# overall accuracy, then per-class precision / recall / f1
print("Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
display(cm)
print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


array([[1353,  622],
       [ 490, 1145]], dtype=int64)

Accuracy Score : 0.69196675900277
Classification Report
              precision    recall  f1-score   support

       Fresh       0.73      0.69      0.71      1975
      Rotten       0.65      0.70      0.67      1635

    accuracy                           0.69      3610
   macro avg       0.69      0.69      0.69      3610
weighted avg       0.70      0.69      0.69      3610



In [11]:
# Pair every importance score with its column name, rank the pairs from
# most to least important, and print one "name: score" line per feature
ranked_pairs = list(zip(model.feature_importances_, X.columns))
ranked_pairs.sort(reverse=True)
for importance, feature in ranked_pairs:
    print(f"{feature}: {importance}")

runtime: 0.14206600402311534
info_char_count: 0.11572563836639957
info_word_count: 0.10149837874860296
mid_actor: 0.08191323657221036
title_char_count: 0.08143887352892233
Classics: 0.07901031574215246
content_rating_NR: 0.07222607118754189
Documentary: 0.04945924321861605
total_genres: 0.045907311140131996
title_word_count: 0.04436257334449012
Drama: 0.041318059620567685
Art House & International: 0.04105327826270292
one_director: 0.032381029382909904
content_rating_PG-13: 0.025622475826880712
top_actor: 0.024378836330675314
low_director: 0.021638674704080332
