In [1]:
import pandas as pd
import numpy as np
from numpy.random import seed
seed(42)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import tensorflow
tensorflow.keras.__version__

'2.2.4-tf'

## Data Preprocessing

In [3]:
# Read in movie csv
movies = pd.read_csv("../Resources/imdb_final.csv")
movies.head()

Unnamed: 0,imdb_title_id,title,year,genre,duration,country,director,production_company,budget,total_votes,...,allover45,males,males18to29,males30to44,malesover45,females,females18to29,females30to44,femalesover45,rating_class
0,tt0035423,Kate & Leopold,2001,Comedy,118,USA,James Mangold,Konrad Pictures,48000000,75298,...,6.4,6.3,6.5,6.2,6.3,6.6,6.7,6.4,6.7,Good
1,tt0113026,The Fantasticks,2000,Musical,86,USA,Michael Ritchie,Michael Ritchie Productions,10000000,1082,...,5.4,5.3,5.4,5.4,5.3,5.7,5.6,5.8,5.6,Good
2,tt0118589,Glitter,2001,Drama,104,USA,Vondie Curtis-Hall,Twentieth Century Fox,22000000,20959,...,2.1,1.9,2.1,1.9,2.0,2.9,3.2,2.7,2.5,Bad
3,tt0118652,The Attic Expeditions,2001,Comedy,100,USA,Jeremy Kasten,Tse Tse Fly Productions,1000000,1588,...,4.6,5.0,4.7,5.1,4.6,4.8,5.8,4.6,4.6,Good
4,tt0120467,Vulgar,2000,Crime,87,USA,Bryan Johnson,Chango Productions,120000,3852,...,5.0,5.3,6.1,5.3,5.1,5.2,5.2,5.3,4.6,Good


In [4]:
# # Set vote column to integer type
# movies["median_vote"] = movies["median_vote"].astype(int)
# movies["median_vote"].unique()

In [5]:
# Set values for X and y
X = movies[["year", "genre", "duration", "director", "budget"]]
y = movies["rating_class"].values.reshape(-1,1)
print(X.shape, y.shape)

(5060, 5) (5060, 1)


In [6]:
data = X.copy()
data

Unnamed: 0,year,genre,duration,director,budget
0,2001,Comedy,118,James Mangold,48000000
1,2000,Musical,86,Michael Ritchie,10000000
2,2001,Drama,104,Vondie Curtis-Hall,22000000
3,2001,Comedy,100,Jeremy Kasten,1000000
4,2000,Crime,87,Bryan Johnson,120000
...,...,...,...,...,...
5055,2019,Comedy,84,Jon Lucas,5000000
5056,2019,Drama,94,Dan Sallitt,95000
5057,2019,Action,84,Glenn Miller,100000
5058,2019,Action,92,Keoni Waxman,3000000


In [7]:
# # Dummy Encoding for genre column
data_binary_encoded = pd.get_dummies(data, columns= ["genre", "director"])
data_binary_encoded.head()

Unnamed: 0,year,duration,budget,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Drama,...,director_Zack Snyder,director_Zackary Adler,director_Zak Knutson,director_Zak Penn,director_Zebediah De Soto,director_Zia Mojabi,director_Ziad H. Hamzeh,director_Zoe Quist,director_Zoran Lisinac,director_mink
0,2001,118,48000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2000,86,10000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,104,22000000,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2001,100,1000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2000,87,120000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(data_binary_encoded, y, random_state=42)


In [9]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train).reshape(-1,1)
encoded_y_test = label_encoder.transform(y_test).reshape(-1,1)
encoded_y_train

  return f(**kwargs)


array([[2],
       [2],
       [2],
       ...,
       [2],
       [2],
       [2]])

In [10]:
# Create StandardScaler model and fit to training data
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(encoded_y_train)

# Transform training and testing data using X_scaler and y_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(encoded_y_train)
y_test_scaled = y_scaler.transform(encoded_y_test)
print(X_train_scaled[0])

[-0.58217105  1.88232088  0.01493848 ... -0.01623496 -0.01623496
 -0.01623496]


In [11]:
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
y_train_categorical[0]

array([0., 0., 1.], dtype=float32)

## Create Deep Learning Model

In [12]:
# Create deep learning model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
deep_model = Sequential()
deep_model.add(Dense(units=100, activation= "relu", input_dim=3537))
deep_model.add(Dense(units=100, activation= "relu"))
deep_model.add(Dense(units=100, activation= "relu"))
deep_model.add(Dense(units=100, activation= "relu"))
deep_model.add(Dense(units=100, activation= "relu"))
deep_model.add(Dense(units=3, activation= "softmax"))

In [13]:
# Compile and fit the model
deep_model.compile(optimizer = "adam", loss= "mse", metrics=["accuracy"])
deep_model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               353800    
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 303       
Total params: 394,503
Trainable params: 394,503
Non-trainable params: 0
__________________________________________________

In [14]:
deep_model.fit(X_train_scaled, y_train_categorical, epochs=100, shuffle=True, verbose=2)

Train on 3795 samples
Epoch 1/100
3795/3795 - 1s - loss: 0.1688 - accuracy: 0.6466
Epoch 2/100
3795/3795 - 1s - loss: 0.0929 - accuracy: 0.8024
Epoch 3/100
3795/3795 - 1s - loss: 0.0484 - accuracy: 0.8983
Epoch 4/100
3795/3795 - 1s - loss: 0.0332 - accuracy: 0.9283
Epoch 5/100
3795/3795 - 1s - loss: 0.0280 - accuracy: 0.9375
Epoch 6/100
3795/3795 - 0s - loss: 0.0266 - accuracy: 0.9383
Epoch 7/100
3795/3795 - 0s - loss: 0.0246 - accuracy: 0.9436
Epoch 8/100
3795/3795 - 0s - loss: 0.0232 - accuracy: 0.9457
Epoch 9/100
3795/3795 - 0s - loss: 0.0220 - accuracy: 0.9484
Epoch 10/100
3795/3795 - 0s - loss: 0.0207 - accuracy: 0.9526
Epoch 11/100
3795/3795 - 0s - loss: 0.0200 - accuracy: 0.9531
Epoch 12/100
3795/3795 - 0s - loss: 0.0196 - accuracy: 0.9536
Epoch 13/100
3795/3795 - 1s - loss: 0.0191 - accuracy: 0.9531
Epoch 14/100
3795/3795 - 1s - loss: 0.0185 - accuracy: 0.9578
Epoch 15/100
3795/3795 - 1s - loss: 0.0179 - accuracy: 0.9589
Epoch 16/100
3795/3795 - 1s - loss: 0.0159 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x2d463390c50>

## Quantify the Model

In [15]:
model_loss, model_accuracy = deep_model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

1265/1265 - 0s - loss: 0.1945 - accuracy: 0.5992
Deep Neural Network - Loss: 0.19445292318291343, Accuracy: 0.5992094874382019


## Make Predictions

In [18]:
encoded_predictions = deep_model.predict_classes(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
print(encoded_predictions[:10])
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {y_test[:10]}")

[0 1 1 2 2 2 2 2 0 2]
Predicted classes: ['Bad' 'Excellent' 'Excellent' ... 'Good' 'Bad' 'Excellent']
Actual Labels: [['Good']
 ['Good']
 ['Excellent']
 ['Bad']
 ['Good']
 ['Good']
 ['Excellent']
 ['Good']
 ['Good']
 ['Good']]


In [20]:
# CLassification report
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction_labels))

              precision    recall  f1-score   support

         Bad       0.47      0.59      0.53       280
   Excellent       0.27      0.35      0.31       161
        Good       0.75      0.65      0.70       824

    accuracy                           0.60      1265
   macro avg       0.50      0.53      0.51      1265
weighted avg       0.63      0.60      0.61      1265



In [19]:
# Confusion matrix
from sklearn.metrics import confusion_matrix as cm
cm(y_test, prediction_labels)

array([[166,  18,  96],
       [ 27,  56,  78],
       [158, 130, 536]], dtype=int64)

## Grid Search

In [18]:
from sklearn.model_selection import GridSearchCV
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [50, 100, 150]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator =deep_model, param_grid = param_grid, verbose=2, scoring = "accuracy")

In [18]:
grid.fit(X_train_scaled, y_train_categorical)

TypeError: Cannot clone object '<tensorflow.python.keras.engine.sequential.Sequential object at 0x000002491690D080>' (type <class 'tensorflow.python.keras.engine.sequential.Sequential'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

## Save Model

In [23]:
deep_model.save("dl.h5")