In [160]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.python import keras
from keras import layers
from keras.layers.experimental.preprocessing import TextVectorization
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [161]:
# Load the movies CSV file
df = pd.read_csv('inputs/imdb_movies.csv')
df['crew'] = df['crew'].str.split(',').str[0]
# Extract the relevant features and target variable
features = df[['genre', 'orig_lang', 'budget_x', 'country']]
target = df['score']

In [162]:
# Perform preprocessing steps for each feature

# 1. Encode categorical features
encoder = LabelEncoder()
features['genre'] = encoder.fit_transform(features['genre'])
features['orig_lang'] = encoder.fit_transform(features['orig_lang'])
features['country'] = encoder.fit_transform(features['country'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['genre'] = encoder.fit_transform(features['genre'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['orig_lang'] = encoder.fit_transform(features['orig_lang'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['country'] = encoder.fit_transform(features['country'])


In [163]:
# 2. Handle missing values
features.fillna(value=0, inplace=True)  # Replace missing values with 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.fillna(value=0, inplace=True)  # Replace missing values with 0


In [164]:
# 3. Scale numerical features using Min-Max scaling
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)

In [165]:
# 4. Convert text data to numerical data

# Convert 'crew' column to a Pandas Series
crew_series = df['crew'].fillna('')

# Preprocess text data
text_vectorizer = TextVectorization(output_mode='int', output_sequence_length=100)
text_vectorizer.adapt(crew_series.values)

# Encode the 'crew' column
crew_vectorized = text_vectorizer(crew_series.values)

# Convert 'overview' column to a Pandas Series
overview_series = df['overview'].fillna('')

# Preprocess text data
text_vectorizer = TextVectorization(output_mode='int', output_sequence_length=1000)
text_vectorizer.adapt(overview_series.values)

# Encode the 'overview' column
overview_vectorized = text_vectorizer(overview_series.values)

# # Convert 'orig_title' column to a Pandas Series
# orig_title_series = df['orig_title'].fillna('')

# # Preprocess text data
# text_vectorizer = TextVectorization(output_mode='int', output_sequence_length=1000)
# text_vectorizer.adapt(orig_title_series.values)

# # Encode the 'orig_title' column
# orig_title_vectorized = text_vectorizer(orig_title_series.values)

# Combine the encoded 'crew' column with the numerical features
features_encoded = np.concatenate([features_scaled, crew_vectorized, overview_vectorized], axis=1)

# Reshape the features_encoded array
features_encoded = np.expand_dims(features_encoded, axis=-1)

In [166]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

In [167]:
# Build the CNN model
model = tf.keras.Sequential([
    layers.Conv1D(32, kernel_size=3, activation='relu', input_shape=(features_encoded.shape[1], 1)),
    layers.MaxPooling1D(pool_size=2),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])

In [168]:
# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

In [169]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x28f220f70>

In [170]:
# Evaluate the model
mse = model.evaluate(X_test, y_test)

# Mean Squared Error
print('Mean Squared Error:', mse)

 1/64 [..............................] - ETA: 0s - loss: 317.8849

Mean Squared Error: 201.82089233398438


In [171]:
# Reshape the input data
y_test = y_test.values.reshape(-1)

# Evaluate the model on the reshaped data
predictions = model.predict(X_test)
mse = np.mean((predictions - y_test)**2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(predictions - y_test))

print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('Mean Absolute Error:', mae)

Mean Squared Error: 221.41247476912787
Root Mean Squared Error: 14.879935307961787
Mean Absolute Error: 10.294869275928866
