In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers as keras_layers, Model, losses, optimizers, metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from math import sqrt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load MovieLens 20M dataset
ratings_df = pd.read_csv(r'C:\Users\GARVIT\Downloads\archive\rating.csv')
movies_df = pd.read_csv(r'C:\Users\GARVIT\Downloads\archive\movie.csv')
tags_df = pd.read_csv(r'C:\Users\GARVIT\Downloads\archive\tag.csv')

# Merge ratings, movies, and tags dataframes
data_df = pd.merge(ratings_df, movies_df[['movieId', 'title', 'genres']], left_on='movieId', right_on='movieId')
data_df = pd.merge(data_df, tags_df[['userId', 'movieId', 'tag']], left_on=['userId', 'movieId'], right_on=['userId', 'movieId'], how='left')

# Preprocess data
user_ids = data_df['userId'].unique()
movie_ids = data_df['movieId'].unique()

user_id_map = {user_id: index for index, user_id in enumerate(user_ids)}
movie_id_map = {movie_id: index for index, movie_id in enumerate(movie_ids)}

data_df['userId'] = data_df['userId'].map(user_id_map)
data_df['movieId'] = data_df['movieId'].map(movie_id_map)

# Extract movie genres and tags
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(data_df['genres'].str.split('|'))

tags_df = data_df.groupby('movieId')['tag'].apply(lambda x: ' '.join(x.fillna('').astype(str))).reset_index()
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tag_features = tfidf_vectorizer.fit_transform(tags_df['tag'])

# Reduce dimensionality of genre features
svd = TruncatedSVD(n_components=10, random_state=42)
genre_features_reduced = svd.fit_transform(genre_features)

# Reduce dimensionality of tag features
svd = TruncatedSVD(n_components=10, random_state=42)
tag_features_reduced = svd.fit_transform(tag_features)

# Split data into train and test sets
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42)

In [32]:
# Generator function
def data_generator(df, genre_features, tag_features, batch_size=64):
    def generator():
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i:i+batch_size]

            X = [
                batch_df['userId'].values.reshape(-1, 1), 
                batch_df['movieId'].values.reshape(-1, 1), 
                genre_features[batch_df['movieId']],
                tag_features[batch_df['movieId']]
            ]
            y = batch_df['rating'].values.astype('float32')

            yield tuple(X), y

    return tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            (
                tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
                tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
                tf.TensorSpec(shape=(None, 10), dtype=tf.float32),
                tf.TensorSpec(shape=(None, 10), dtype=tf.float32)
            ),
            tf.TensorSpec(shape=(None,), dtype=tf.float32)
        )
    ).prefetch(tf.data.AUTOTUNE)


# Create NCF model
num_users = len(user_ids)
num_movies = len(movie_ids)
num_genres = genre_features_reduced.shape[1]
num_tags = tag_features_reduced.shape[1]

def create_ncf_model(num_users, num_movies, num_genres, num_tags, embed_size=64, layer_sizes=[64, 32, 16]):
    # Input layers
    user_input = keras_layers.Input(shape=(1,), name='user_input')
    movie_input = keras_layers.Input(shape=(1,), name='movie_input')
    genre_input = keras_layers.Input(shape=(num_genres,), name='genre_input')
    tag_input = keras_layers.Input(shape=(num_tags,), name='tag_input')
    
    # Embedding layers
    user_embedding = keras_layers.Embedding(input_dim=num_users, output_dim=embed_size)(user_input)
    movie_embedding = keras_layers.Embedding(input_dim=num_movies, output_dim=embed_size)(movie_input)
    
    # Flatten embedding layers
    user_flatten = keras_layers.Flatten()(user_embedding)
    movie_flatten = keras_layers.Flatten()(movie_embedding)
    
    # Concatenate embeddings, genre input, and tag input
    concat_features = keras_layers.Concatenate()([user_flatten, movie_flatten, genre_input, tag_input])
    
    # Fully connected layers
    for units in layer_sizes:
        concat_features = keras_layers.Dense(units, activation='relu')(concat_features)
    
    # Output layer
    output = keras_layers.Dense(1)(concat_features)
    
    # Create and compile model
    model = Model(inputs=[user_input, movie_input, genre_input, tag_input], outputs=output)
    model.compile(optimizer=optimizers.Adam(learning_rate=0.001), loss=losses.MeanSquaredError(), metrics=[metrics.RootMeanSquaredError()])
    
    return model


In [34]:
# Create and train NCF model using data generators
batch_size = 64
train_gen = data_generator(train_df, genre_features_reduced, tag_features_reduced, batch_size=batch_size)
steps_per_epoch = len(train_df) // batch_size

test_gen = data_generator(test_df, genre_features_reduced, tag_features_reduced, batch_size=batch_size)
validation_steps = len(test_df) // batch_size

# Create datasets using the data_generator function
train_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(train_df, genre_features_reduced, tag_features_reduced, batch_size=batch_size),
    output_signature=(
        (
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 10), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 10), dtype=tf.float32)
        ),
        tf.TensorSpec(shape=(None,), dtype=tf.float32)
    )
).prefetch(tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_df, genre_features_reduced, tag_features_reduced, batch_size=batch_size),
    output_signature=(
        (
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 10), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 10), dtype=tf.float32)
        ),
        tf.TensorSpec(shape=(None,), dtype=tf.float32)
    )
).prefetch(tf.data.AUTOTUNE)

# Create and train NCF model
ncf_model = create_ncf_model(num_users, num_movies, num_genres, num_tags)

ncf_model.fit(
    train_dataset,
    epochs=5,
    steps_per_epoch=steps_per_epoch,
    validation_data=test_dataset,
    validation_steps=validation_steps
)


# Predict ratings on test data
y_pred = ncf_model.predict(test_gen, steps=len(test_df) // batch_size).flatten()

# Calculate RMSE
rmse = sqrt(mean_squared_error(test_df['rating'].values, y_pred))
print(f'Test RMSE: {rmse}')

Epoch 1/5
[1m253320/253320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18311s[0m 72ms/step - loss: 0.7917 - root_mean_squared_error: 0.8877 - val_loss: 0.6853 - val_root_mean_squared_error: 0.8278
Epoch 2/5
[1m253320/253320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1us/step - loss: 0.1933 - root_mean_squared_error: 0.6218 - val_loss: 0.4234 - val_root_mean_squared_error: 0.9202
Epoch 3/5


  self.gen.throw(value)


[1m253320/253320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10051s[0m 40ms/step - loss: 0.6657 - root_mean_squared_error: 0.8159 - val_loss: 0.6612 - val_root_mean_squared_error: 0.8131
Epoch 4/5
[1m253320/253320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step - loss: 0.2381 - root_mean_squared_error: 0.6901 - val_loss: 0.3342 - val_root_mean_squared_error: 0.8176
Epoch 5/5
[1m253320/253320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17841s[0m 70ms/step - loss: 0.6246 - root_mean_squared_error: 0.7903 - val_loss: 0.6542 - val_root_mean_squared_error: 0.8088
[1m63330/63330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 1ms/step


ValueError: Found input variables with inconsistent numbers of samples: [4053125, 4053120]

In [37]:
# Predict ratings on test data
y_pred = ncf_model.predict(test_gen, steps=len(test_df) // batch_size).flatten()

# Calculate RMSE
rmse = sqrt(mean_squared_error(test_df['rating'].values[:len(y_pred)], y_pred))
print(f'Test RMSE: {rmse}')


[1m63330/63330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 1ms/step
Test RMSE: 0.8088421623179642
