In [1]:
import pandas as pd
import numpy as np
from numpy.random import seed
seed(42)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


In [2]:
# Read in movie csv
movies = pd.read_csv("../Resources/imdb_final.csv")
movies.head()

Unnamed: 0,imdb_title_id,title,year,genre,duration,country,director,production_company,budget,total_votes,...,allover45,males,males18to29,males30to44,malesover45,females,females18to29,females30to44,femalesover45,rating_class
0,tt0035423,Kate & Leopold,2001,Comedy,118,USA,James Mangold,Konrad Pictures,48000000,75298,...,6.4,6.3,6.5,6.2,6.3,6.6,6.7,6.4,6.7,Good
1,tt0113026,The Fantasticks,2000,Musical,86,USA,Michael Ritchie,Michael Ritchie Productions,10000000,1082,...,5.4,5.3,5.4,5.4,5.3,5.7,5.6,5.8,5.6,Good
2,tt0118589,Glitter,2001,Drama,104,USA,Vondie Curtis-Hall,Twentieth Century Fox,22000000,20959,...,2.1,1.9,2.1,1.9,2.0,2.9,3.2,2.7,2.5,Bad
3,tt0118652,The Attic Expeditions,2001,Comedy,100,USA,Jeremy Kasten,Tse Tse Fly Productions,1000000,1588,...,4.6,5.0,4.7,5.1,4.6,4.8,5.8,4.6,4.6,Good
4,tt0120467,Vulgar,2000,Crime,87,USA,Bryan Johnson,Chango Productions,120000,3852,...,5.0,5.3,6.1,5.3,5.1,5.2,5.2,5.3,4.6,Good


In [3]:
# Set values for X and y
X = movies[["year", "genre", "duration", "director", "budget"]]
y = movies["rating_class"].values.reshape(-1,1)
print(X.shape, y.shape)

(5060, 5) (5060, 1)


In [4]:
data = X.copy()
data

Unnamed: 0,year,genre,duration,director,budget
0,2001,Comedy,118,James Mangold,48000000
1,2000,Musical,86,Michael Ritchie,10000000
2,2001,Drama,104,Vondie Curtis-Hall,22000000
3,2001,Comedy,100,Jeremy Kasten,1000000
4,2000,Crime,87,Bryan Johnson,120000
...,...,...,...,...,...
5055,2019,Comedy,84,Jon Lucas,5000000
5056,2019,Drama,94,Dan Sallitt,95000
5057,2019,Action,84,Glenn Miller,100000
5058,2019,Action,92,Keoni Waxman,3000000


In [5]:
# Dummy Encoding for genre column
data_binary_encoded = pd.get_dummies(data, columns= ["genre", "director"])
data_binary_encoded.head()

Unnamed: 0,year,duration,budget,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Drama,...,director_Zack Snyder,director_Zackary Adler,director_Zak Knutson,director_Zak Penn,director_Zebediah De Soto,director_Zia Mojabi,director_Ziad H. Hamzeh,director_Zoe Quist,director_Zoran Lisinac,director_mink
0,2001,118,48000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2000,86,10000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,104,22000000,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2001,100,1000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2000,87,120000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(data_binary_encoded, y, random_state=42)


In [7]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
encoded_y_train

  return f(**kwargs)


array([2, 2, 2, ..., 2, 2, 2])

In [8]:
rf = RandomForestClassifier(n_estimators = 100)
rf = rf.fit(X_train, encoded_y_train)
print(f"Random Forest Training Score: {rf.score(X_train, encoded_y_train)}")
print(f"Random Forest Testing Score: {rf.score(X_test, encoded_y_test)}")

Random Forest Training Score: 1.0
Random Forest Testing Score: 0.7177865612648221


In [9]:
importances = rf.feature_importances_
importances

array([0.06126386, 0.09210161, 0.12286093, ..., 0.00064621, 0.00050031,
       0.00033323])

In [10]:
sorted(zip(rf.feature_importances_, data_binary_encoded.columns))

[(0.0, 'director_A.M. Lukas'),
 (0.0, 'director_Aaron Blaise'),
 (0.0, 'director_Aaron Burns'),
 (0.0, 'director_Aaron Mirtes'),
 (0.0, 'director_Aaron Woodley'),
 (0.0, 'director_Adam Bhala Lough'),
 (0.0, 'director_Adam Matalon'),
 (0.0, 'director_Adam Meyerowitz'),
 (0.0, 'director_Adam Thomas Anderegg'),
 (0.0, 'director_Adrienne Weiss'),
 (0.0, 'director_Akihiro Kitamura'),
 (0.0, 'director_Al Corley'),
 (0.0, 'director_Alex Ballar'),
 (0.0, 'director_Alex Keledjian'),
 (0.0, 'director_Alex Kurtzman'),
 (0.0, 'director_Alex Lvovsky'),
 (0.0, 'director_Alex Nam'),
 (0.0, 'director_Alex Wright'),
 (0.0, 'director_Alfonso Gomez-Rejon'),
 (0.0, 'director_Amanda Goodwin'),
 (0.0, 'director_Amanda Marsalis'),
 (0.0, 'director_Andrew Bellware'),
 (0.0, 'director_Andrew Black'),
 (0.0, 'director_Andrew Bowen'),
 (0.0, 'director_Andrew MacKenzie'),
 (0.0, 'director_Andrew Nackman'),
 (0.0, 'director_Andrew Van Slee'),
 (0.0, 'director_Andy Dick'),
 (0.0, 'director_Anghus Houvouras'),
 (0.0

In [11]:
encoded_predictions = rf.predict(X_test)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"Predicted classes: {prediction_labels[:15]}")
print(f"Actual Labels: {y_test[:15]}")

Predicted classes: ['Good' 'Good' 'Good' 'Good' 'Good' 'Good' 'Good' 'Good' 'Bad' 'Good'
 'Good' 'Good' 'Good' 'Bad' 'Good']
Actual Labels: [['Good']
 ['Good']
 ['Excellent']
 ['Bad']
 ['Good']
 ['Good']
 ['Excellent']
 ['Good']
 ['Good']
 ['Good']
 ['Good']
 ['Excellent']
 ['Good']
 ['Good']
 ['Good']]


In [12]:
# CLassification report
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction_labels))

              precision    recall  f1-score   support

         Bad       0.65      0.52      0.58       280
   Excellent       0.57      0.11      0.18       161
        Good       0.74      0.90      0.81       824

    accuracy                           0.72      1265
   macro avg       0.65      0.51      0.52      1265
weighted avg       0.70      0.72      0.68      1265



In [13]:
# Confusion matrix
from sklearn.metrics import confusion_matrix as cm
cm(y_test, prediction_labels)

array([[146,   3, 131],
       [ 11,  17, 133],
       [ 69,  10, 745]], dtype=int64)

## Save Model

In [14]:
import joblib
filename = 'rf.sav'
joblib.dump(rf, filename)


['rf.sav']