In [1]:
import pandas as pd
import numpy as np
from numpy.random import seed
seed(42)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import tensorflow
tensorflow.keras.__version__

'2.2.4-tf'

## Data Preprocessing

In [2]:
# Read in movie csv
movies = pd.read_csv("../Resources/imdb_final.csv")
movies.head()

Unnamed: 0,imdb_title_id,title,year,genre,duration,country,director,production_company,budget,total_votes,...,allover45,males,males18to29,males30to44,malesover45,females,females18to29,females30to44,femalesover45,rating_class
0,tt0035423,Kate & Leopold,2001,Comedy,118,USA,James Mangold,Konrad Pictures,48000000,75298,...,6.4,6.3,6.5,6.2,6.3,6.6,6.7,6.4,6.7,Good
1,tt0113026,The Fantasticks,2000,Musical,86,USA,Michael Ritchie,Michael Ritchie Productions,10000000,1082,...,5.4,5.3,5.4,5.4,5.3,5.7,5.6,5.8,5.6,Good
2,tt0118589,Glitter,2001,Drama,104,USA,Vondie Curtis-Hall,Twentieth Century Fox,22000000,20959,...,2.1,1.9,2.1,1.9,2.0,2.9,3.2,2.7,2.5,Bad
3,tt0118652,The Attic Expeditions,2001,Comedy,100,USA,Jeremy Kasten,Tse Tse Fly Productions,1000000,1588,...,4.6,5.0,4.7,5.1,4.6,4.8,5.8,4.6,4.6,Good
4,tt0120467,Vulgar,2000,Crime,87,USA,Bryan Johnson,Chango Productions,120000,3852,...,5.0,5.3,6.1,5.3,5.1,5.2,5.2,5.3,4.6,Good


In [3]:
# # Set vote column to integer type
# movies["median_vote"] = movies["median_vote"].astype(int)
# movies["median_vote"].unique()

In [4]:
# Set values for X and y
X = movies[["year", "genre", "duration", "director", "budget"]]
y = movies["rating_class"].values.reshape(-1,1)
print(X.shape, y.shape)

(5060, 5) (5060, 1)


In [5]:
data = X.copy()
data

Unnamed: 0,year,genre,duration,director,budget
0,2001,Comedy,118,James Mangold,48000000
1,2000,Musical,86,Michael Ritchie,10000000
2,2001,Drama,104,Vondie Curtis-Hall,22000000
3,2001,Comedy,100,Jeremy Kasten,1000000
4,2000,Crime,87,Bryan Johnson,120000
...,...,...,...,...,...
5055,2019,Comedy,84,Jon Lucas,5000000
5056,2019,Drama,94,Dan Sallitt,95000
5057,2019,Action,84,Glenn Miller,100000
5058,2019,Action,92,Keoni Waxman,3000000


In [6]:
# # Dummy Encoding for genre column
data_binary_encoded = pd.get_dummies(data, columns= ["genre", "director"])
data_binary_encoded.head()

Unnamed: 0,year,duration,budget,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Drama,...,director_Zack Snyder,director_Zackary Adler,director_Zak Knutson,director_Zak Penn,director_Zebediah De Soto,director_Zia Mojabi,director_Ziad H. Hamzeh,director_Zoe Quist,director_Zoran Lisinac,director_mink
0,2001,118,48000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2000,86,10000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,104,22000000,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2001,100,1000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2000,87,120000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(data_binary_encoded, y, random_state=42, stratify= y)


In [8]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train).reshape(-1,1)
encoded_y_test = label_encoder.transform(y_test).reshape(-1,1)
encoded_y_train

  return f(**kwargs)


array([[1],
       [0],
       [0],
       ...,
       [0],
       [2],
       [2]])

In [9]:
# Create StandardScaler model and fit to training data
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(encoded_y_train)

# Transform training and testing data using X_scaler and y_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(encoded_y_train)
y_test_scaled = y_scaler.transform(encoded_y_test)
print(X_train_scaled[0])

[-1.76326177  0.8312829  -0.47813651 ...  0.         -0.01623496
 -0.01623496]


In [10]:
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
y_train_categorical[0]

array([0., 1., 0.], dtype=float32)

## Create Deep Learning Model

In [11]:
# Create deep learning model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
deep_model = Sequential()
deep_model.add(Dense(units=100, activation= "relu", input_dim=3537))
deep_model.add(Dense(units=100, activation= "relu"))
deep_model.add(Dense(units=100, activation= "relu"))
deep_model.add(Dense(units=3, activation= "softmax"))

In [12]:
# Compile and fit the model
deep_model.compile(optimizer = "adam", loss= "mse", metrics=["accuracy"])
deep_model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               353800    
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 303       
Total params: 374,303
Trainable params: 374,303
Non-trainable params: 0
_________________________________________________________________


In [13]:
deep_model.fit(X_train_scaled, y_train_categorical, epochs=100, shuffle=True, verbose=2)

Train on 3795 samples
Epoch 1/100
3795/3795 - 1s - loss: 0.1718 - accuracy: 0.6461
Epoch 2/100
3795/3795 - 1s - loss: 0.0817 - accuracy: 0.8329
Epoch 3/100
3795/3795 - 1s - loss: 0.0403 - accuracy: 0.9159
Epoch 4/100
3795/3795 - 1s - loss: 0.0338 - accuracy: 0.9249
Epoch 5/100
3795/3795 - 1s - loss: 0.0309 - accuracy: 0.9286
Epoch 6/100
3795/3795 - 1s - loss: 0.0295 - accuracy: 0.9304
Epoch 7/100
3795/3795 - 1s - loss: 0.0273 - accuracy: 0.9383
Epoch 8/100
3795/3795 - 1s - loss: 0.0257 - accuracy: 0.9399
Epoch 9/100
3795/3795 - 1s - loss: 0.0247 - accuracy: 0.9389
Epoch 10/100
3795/3795 - 1s - loss: 0.0225 - accuracy: 0.9462
Epoch 11/100
3795/3795 - 1s - loss: 0.0211 - accuracy: 0.9510
Epoch 12/100
3795/3795 - 1s - loss: 0.0203 - accuracy: 0.9507
Epoch 13/100
3795/3795 - 1s - loss: 0.0188 - accuracy: 0.9578
Epoch 14/100
3795/3795 - 1s - loss: 0.0178 - accuracy: 0.9576
Epoch 15/100
3795/3795 - 1s - loss: 0.0171 - accuracy: 0.9605
Epoch 16/100
3795/3795 - 1s - loss: 0.0165 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x29527745518>

## Quantify the Model

In [14]:
model_loss, model_accuracy = deep_model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Deep Neural Network - Loss: {model_loss}")
print(f"Deep Neural Network - Accuracy: {model_accuracy}")

1265/1265 - 0s - loss: 0.1936 - accuracy: 0.6372
Deep Neural Network - Loss: 0.19362074901817344
Deep Neural Network - Accuracy: 0.6371541619300842


## Make Predictions

In [15]:
encoded_predictions = deep_model.predict_classes(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
print(encoded_predictions[:10])
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {y_test[:10]}")

[1 2 2 1 2 0 2 2 2 0]
Predicted classes: ['Excellent' 'Good' 'Good' ... 'Good' 'Excellent' 'Good']
Actual Labels: [['Good']
 ['Good']
 ['Excellent']
 ['Excellent']
 ['Good']
 ['Good']
 ['Good']
 ['Good']
 ['Good']
 ['Good']]


In [16]:
# CLassification report
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction_labels))

              precision    recall  f1-score   support

         Bad       0.45      0.58      0.51       270
   Excellent       0.21      0.31      0.25       167
        Good       0.78      0.64      0.70       828

    accuracy                           0.58      1265
   macro avg       0.48      0.51      0.49      1265
weighted avg       0.63      0.58      0.60      1265



In [17]:
# Confusion matrix
from sklearn.metrics import confusion_matrix as cm
cm(y_test, prediction_labels)

array([[156,  47,  67],
       [ 30,  52,  85],
       [159, 143, 526]], dtype=int64)

## Save Model

In [20]:
deep_model.save("dl.h5")