# Mount Drive

In [2]:
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive at /content/drive in the Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import dataset

In [3]:
# Read the full training table from the mounted Drive folder.
train = pd.read_csv('/content/drive/MyDrive/CMPE256_project/train_final.csv')

# Pull the user, movie and rating columns out as arrays.
user_ids, movie_ids, ratings = (
    train[column].values for column in ('User_ID', 'Movie_ID', 'Rating')
)

# Number of distinct users and distinct movies in the table.
num_users, num_movies = (
    train[column].nunique() for column in ('User_ID', 'Movie_ID')
)

In [3]:
# Display the (rows, columns) shape of the loaded frame.
train.shape

(14583166, 42)

In [4]:
# Work on a fixed-seed random subset of the training table.
# - min(...) keeps the cell from raising when `train` has fewer than 10000 rows.
# - reset_index(drop=True) gives the subset a clean 0..n-1 index. Without it the
#   rows keep their labels from the full table, and a later column-wise
#   pd.concat with a freshly built DataFrame aligns on those labels and
#   produces extra, NaN-padded rows instead of pairing rows by position.
train = (
    train
    .sample(n=min(10000, len(train)), random_state=42)
    .reset_index(drop=True)
)

# Get the users, movies and ratings
user_ids = train['User_ID'].values
movie_ids = train['Movie_ID'].values
ratings = train['Rating'].values

# Count the number of users and the number of movies
num_users = train['User_ID'].nunique()
num_movies = train['Movie_ID'].nunique()

# One Hot Encoding Users and Movies
### From ChatGPT Prompt:
  
* Can you make me a function in python that will convert my pandas data frame to one-hot encode my users and movies?


This did not work: there are too many distinct users and movies, and one-hot encoding gives each of them its own column. An alternative to one-hot encoding all users and movies was needed.

In [5]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode_users_and_movies(df):
  """One-hot encode the ``User_ID`` and ``Movie_ID`` columns of ``df``.

  Args:
    df: DataFrame containing ``User_ID`` and ``Movie_ID`` columns.

  Returns:
    A new DataFrame with the ``User_<k>`` columns first, then the
    ``Movie_<k>`` columns, then every remaining column of ``df``
    (``User_ID`` and ``Movie_ID`` themselves are dropped). ``<k>`` is the
    position of the column in the encoder output, not the raw ID. The result
    keeps ``df``'s index.
  """
  # Dense output (sparse_output=False): one full column per distinct ID.
  user_encoder = OneHotEncoder(sparse_output=False)
  movie_encoder = OneHotEncoder(sparse_output=False)

  # Fit and transform User_ID and Movie_ID
  user_encoded = user_encoder.fit_transform(df[['User_ID']])
  movie_encoded = movie_encoder.fit_transform(df[['Movie_ID']])

  # Build the encoded frames on df's own index. With the default RangeIndex
  # the axis=1 concat below would align on labels and, for any df that is not
  # indexed 0..n-1 (e.g. after sampling or filtering), return misaligned rows
  # padded with NaN.
  user_df = pd.DataFrame(
      user_encoded,
      columns=[f'User_{i}' for i in range(user_encoded.shape[1])],
      index=df.index,
  )
  movie_df = pd.DataFrame(
      movie_encoded,
      columns=[f'Movie_{i}' for i in range(movie_encoded.shape[1])],
      index=df.index,
  )

  # Concatenate the one-hot encoded columns with the rest of the original DataFrame
  df_encoded = pd.concat([user_df, movie_df, df.drop(['User_ID', 'Movie_ID'], axis=1)], axis=1)

  return df_encoded

# Embeddings for Users and Movies

In [6]:
# Print the column labels of the training frame.
print(train.columns)

Index(['User_ID', 'Rating', 'Movie_ID', 'Year', 'runtimeMinutes', 'movie',
       'short', 'tvEpisode', 'tvMiniSeries', 'tvMovie', 'tvSeries', 'tvShort',
       'tvSpecial', 'video', 'Action', 'Adult', 'Adventure', 'Animation',
       'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'Film-Noir', 'Game-Show', 'History', 'Horror', 'Music',
       'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi',
       'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western'],
      dtype='object')


In [15]:
import tensorflow as tf
from tensorflow.keras.initializers import HeNormal, GlorotNormal
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Normalize user IDs and movie IDs: map every raw ID to a contiguous 0-based
# index (in order of first appearance) so it can address an embedding row.
user_ids = pd.Series(user_ids)  # Convert numpy array to pandas Series
movie_ids = pd.Series(movie_ids)  # Convert numpy array to pandas Series
user_id_mapping = {user_id: idx for idx, user_id in enumerate(user_ids.unique())}
movie_id_mapping = {movie_id: idx for idx, movie_id in enumerate(movie_ids.unique())}

# Re-index user and movie IDs
user_ids_reindexed = user_ids.map(user_id_mapping)
movie_ids_reindexed = movie_ids.map(movie_id_mapping)

# Normalize numeric features. The fitted scaler is kept under its own name so
# it is not overwritten by the rating scaler further down.
numeric_features = train[['Year', 'runtimeMinutes']]
feature_scaler = StandardScaler()
numeric_features_scaled = feature_scaler.fit_transform(numeric_features)

# Binary features (already 0/1 in the dataset)
# NOTE(review): 'tvShort' and 'Game-Show' appear in train.columns but are not
# listed here -- confirm whether the omission is intentional.
binary_features = train[['Adult', 'movie', 'short', 'tvEpisode', 'tvMiniSeries',
                         'tvMovie', 'tvSeries', 'tvSpecial', 'video', 'Action',
                         'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
                         'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
                         'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
                         'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport',
                         'Talk-Show', 'Thriller', 'War', 'Western']]

# Concatenate scaled numeric and binary features.
# The scaled array is wrapped with train's own index: pd.concat(axis=1) aligns
# on index labels, so a default RangeIndex next to the sampled frame's labels
# would yield the union of both indexes (extra rows, NaN-filled and then
# zeroed by fillna) instead of one feature row per rating.
numeric_features_df = pd.DataFrame(
    numeric_features_scaled,
    columns=['Year', 'runtimeMinutes'],
    index=train.index,
)
# fillna(0) replaces any remaining missing values with 0.
other_features = pd.concat([numeric_features_df, binary_features], axis=1).fillna(0)
assert len(other_features) == len(train), "feature rows must stay aligned with the ratings"
other_features_input = tf.keras.layers.Input(shape=(other_features.shape[1],), name='OtherFeatures')

# Normalize ratings to the [0, 1] range
rating_scaler = MinMaxScaler(feature_range=(0, 1))
ratings_normalized = rating_scaler.fit_transform(ratings.reshape(-1, 1))
# Backward-compatible alias: `scaler` previously ended this cell bound to the rating scaler.
scaler = rating_scaler

# Debugging Layer
class DebugLayer(tf.keras.layers.Layer):
    """Pass-through layer that runs a NaN/Inf check on whatever flows through it."""

    def call(self, inputs):
        # Run the numeric check on the incoming tensor, then hand it back unchanged.
        failure_message = "Found NaN or Inf in layer!"
        tf.debugging.check_numerics(inputs, message=failure_message)
        return inputs

# Inputs: a single value per sample for the user and for the movie.
user_input = tf.keras.layers.Input(shape=(1,), name='User')
movie_input = tf.keras.layers.Input(shape=(1,), name='Movie')

# 16-dimensional embedding tables, one row per entry in the ID mappings.
user_embedding = tf.keras.layers.Embedding(
    input_dim=len(user_id_mapping),
    output_dim=16,
    embeddings_initializer=GlorotNormal(),
)(user_input)
movie_embedding = tf.keras.layers.Embedding(
    input_dim=len(movie_id_mapping),
    output_dim=16,
    embeddings_initializer=GlorotNormal(),
)(movie_input)

# Flatten each embedding, then pass it through the NaN/Inf check.
user_flat = tf.keras.layers.Flatten()(user_embedding)
user_vec = DebugLayer()(user_flat)
movie_flat = tf.keras.layers.Flatten()(movie_embedding)
movie_vec = DebugLayer()(movie_flat)

# Join both embeddings with the side features and check the merged tensor too.
merged = tf.keras.layers.Concatenate()([user_vec, movie_vec, other_features_input])
concat = DebugLayer()(merged)

# Two hidden blocks (64 then 32 units), each: batch norm -> dense -> LeakyReLU.
x = concat
for units in (64, 32):
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(units)(x)
    x = tf.keras.layers.LeakyReLU(alpha=0.1)(x)

# Single-unit output with no activation.
output = tf.keras.layers.Dense(1)(x)

# Assemble the model and compile it with Adam, MSE loss and an RMSE metric.
model = tf.keras.models.Model(
    inputs=[user_input, movie_input, other_features_input],
    outputs=output,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, clipnorm=1.0)
model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

# Debugging callback
class CheckNaNCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as a batch reports a NaN loss."""

    def on_batch_end(self, batch, logs=None):
        """Check the batch loss and request a stop when it is NaN.

        Args:
            batch: Index of the batch that just finished.
            logs: Metric dict for the batch; may be None or lack a 'loss' entry.
        """
        # `logs` defaults to None, so subscripting it directly would raise
        # TypeError; a missing 'loss' key is treated as "nothing to check".
        loss = (logs or {}).get('loss')
        if loss is not None and np.any(np.isnan(loss)):
            print(f"NaN detected at batch {batch}")
            self.model.stop_training = True

# Train for 20 epochs in batches of 128, with validation_split=0.1 and the
# NaN-loss callback attached.
model_inputs = [user_ids_reindexed, movie_ids_reindexed, other_features]
nan_guard = CheckNaNCallback()
model.fit(
    model_inputs,
    ratings_normalized,
    epochs=20,
    batch_size=128,
    validation_split=0.1,
    callbacks=[nan_guard],
)

Epoch 1/20




[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - loss: 1.0919 - root_mean_squared_error: 1.0430 - val_loss: 0.3681 - val_root_mean_squared_error: 0.6067
Epoch 2/20
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.3910 - root_mean_squared_error: 0.6243 - val_loss: 0.3070 - val_root_mean_squared_error: 0.5541
Epoch 3/20
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1987 - root_mean_squared_error: 0.4456 - val_loss: 0.2709 - val_root_mean_squared_error: 0.5205
Epoch 4/20
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.1255 - root_mean_squared_error: 0.3542 - val_loss: 0.2511 - val_root_mean_squared_error: 0.5011
Epoch 5/20
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0845 - root_mean_squared_error: 0.2907 - val_loss: 0.2359 - val_root_mean_squared_error: 0.4857
Epoch 6/20
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x7cd51de70a90>