In [1]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
# Load preprocessed data
# Relative path; it is resolved against the kernel's current working directory.
movies_csv = "../Resources/movies_data_preprocessed.csv"
movies_df = pd.read_csv(movies_csv)
movies_df.head()  # preview the first rows

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,th Century,Paramount,top_company,top_director,mid_director,low_director,one_director,top_actor,mid_actor,tomatometer_status
0,119.0,0,0,0,1,0,0,1,1,1,...,1,0,1,0,1,0,0,0,8,Rotten
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,2,3,Fresh
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,Fresh
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,2,Fresh
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,1,0,1,0,0,0,3,Fresh


In [3]:
# Define the target set
# Encode the label column as integers: Fresh -> 1, Rotten -> 0.
# An explicit map() is used instead of replace(): replace() on a string column
# relies on pandas' silent object->int downcasting, which is deprecated, and an
# object-dtype target is not accepted by the classifiers trained below.
status_codes = {"Fresh": 1, "Rotten": 0}
status_encoded = movies_df["tomatometer_status"].map(status_codes)

# map() yields NaN for any label that is missing from status_codes; fail loudly
# here rather than training on a partially encoded target.
unmapped = status_encoded.isna()
if unmapped.any():
    unexpected = movies_df.loc[unmapped, "tomatometer_status"].unique().tolist()
    raise ValueError(f"Unexpected tomatometer_status values: {unexpected}")

y = status_encoded.astype("int64")
y.value_counts()

1    7809
0    6628
Name: tomatometer_status, dtype: int64

In [4]:
# Define the features set
# Every column except the label and "clean_text" is used as a model input.
non_feature_cols = ["tomatometer_status", "clean_text"]
X = movies_df.drop(columns=non_feature_cols)
X.head()

Unnamed: 0,runtime,content_rating_G,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R,Drama,Comedy,Action & Adventure,...,Universal,th Century,Paramount,top_company,top_director,mid_director,low_director,one_director,top_actor,mid_actor
0,119.0,0,0,0,1,0,0,1,1,1,...,0,1,0,1,0,1,0,0,0,8
1,90.0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0,2,3
2,122.0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,1
3,95.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,2
4,127.0,1,0,0,0,0,0,1,0,1,...,0,0,0,1,0,1,0,0,0,3


In [5]:
# Splitting into Train and Test sets
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state pins the shuffle so the same rows land in each split on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [6]:
# Standardize the features.
# The scaler is fitted on the training split only, then the same fitted
# transformation is applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() hands back the fitted scaler

# Transform train first, then test (same order as fitting/evaluation below)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [7]:
# Create function to fit/predict/evaluate list of models
def model_comp(models, train_features=None, train_labels=None,
               test_features=None, test_labels=None):
    """Fit each model, predict on the test data, and print its accuracy.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` (returning the fitted estimator)
        and ``predict(X)``.
    train_features, train_labels, test_features, test_labels : optional
        Data to train and evaluate on. Any argument left as ``None`` falls
        back to the notebook-level ``X_train_scaled`` / ``y_train`` /
        ``X_test_scaled`` / ``y_test`` variable, so ``model_comp(models)``
        keeps working exactly as it did without these parameters.

    Returns
    -------
    None
        For every model, its repr, its accuracy score and a separator line
        are printed.
    """
    # Resolve defaults at call time: the notebook globals are only needed
    # when the caller does not supply explicit data.
    if train_features is None:
        train_features = X_train_scaled
    if train_labels is None:
        train_labels = y_train
    if test_features is None:
        test_features = X_test_scaled
    if test_labels is None:
        test_labels = y_test

    for model in models:
        # Fitting the model
        fitted = model.fit(train_features, train_labels)

        # Making predictions using the testing data
        predictions = fitted.predict(test_features)

        # Evaluate model
        print(fitted)
        print(f"Accuracy Score : {accuracy_score(test_labels, predictions)}")
        print("-------------------------")

In [8]:
# Test accuracy of multiple classifiers
# Each candidate gets the same seed so re-running the cell reproduces the scores.
random_forest = RandomForestClassifier(n_estimators=250, max_depth=15, random_state=1)
logistic_reg = LogisticRegression(solver="lbfgs", random_state=1)
svm_rbf = SVC(kernel="rbf", random_state=1)
ada_boost = AdaBoostClassifier(n_estimators=200, learning_rate=0.5, random_state=1)

# Order here is the order in which results are printed
models = [random_forest, logistic_reg, svm_rbf, ada_boost]

model_comp(models)

RandomForestClassifier(max_depth=15, n_estimators=250, random_state=1)
Accuracy Score : 0.7019390581717452
-------------------------
LogisticRegression(random_state=1)
Accuracy Score : 0.6952908587257618
-------------------------
SVC(random_state=1)
Accuracy Score : 0.7002770083102493
-------------------------
AdaBoostClassifier(learning_rate=0.5, n_estimators=200, random_state=1)
Accuracy Score : 0.692797783933518
-------------------------


In [9]:
# Define the model - deep neural net
# Seed Python, NumPy and TensorFlow before building the network so weight
# initialisation and batch shuffling are repeatable across runs, in line with
# the random_state=1 used for the split and the scikit-learn models.
# NOTE: bit-for-bit repeatability can still vary across hardware/TF versions.
tf.keras.utils.set_random_seed(1)

# One input unit per feature column of the scaled training matrix
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# Declare the input shape explicitly; passing input_dim to a Dense layer is
# deprecated in newer Keras releases.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer: a single sigmoid unit for the binary Fresh/Rotten target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
113/113 - 0s - loss: 0.5943 - accuracy: 0.6911 - 158ms/epoch - 1ms/step
Loss: 0.5943381786346436, Accuracy: 0.6911357045173645
