In [None]:
import sys
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
import numpy as np


# add root folder to path
folder = "../../"
sys.path.append(folder)
from src.utils import load_data

# Load the three raw tables from the ml-1m data folder.
# NOTE(review): load_data is project-local; it is assumed to return pandas
# DataFrames in (users, ratings, movies) order -- confirm against src.utils.
users_df, ratings_df, movies_df = load_data('../../data/ml-1m')

# Assign the column names the rest of the notebook relies on (positional rename).
users_df.columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
ratings_df.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies_df.columns = ['MovieID', 'Title', 'Genres']

# Attach movie metadata to every rating (inner join on MovieID).
ratings_movies_df = ratings_df.merge(movies_df, on='MovieID')

# Attach user attributes, convert the epoch-second timestamps to datetimes and
# order each user's ratings chronologically so that sequences built later
# follow viewing order.
merged_df = (
    ratings_movies_df
    .merge(users_df, on='UserID')
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], unit='s'))
    .sort_values(by=['UserID', 'Timestamp'])
)

# Encode MovieID as contiguous integers 0..num_movies-1.
movie_encoder = LabelEncoder()
merged_df['MovieID_encoded'] = movie_encoder.fit_transform(merged_df['MovieID'])

num_movies = merged_df['MovieID_encoded'].nunique()

# Prepare sequences.
# pad_sequences fills with 0, which LabelEncoder also assigns to a real movie.
# To keep padding distinguishable, sequence tokens are shifted by one:
# token 0 = padding, tokens 1..num_movies = movies. The DataFrame column stays
# zero-based, so subtract 1 from a token before movie_encoder.inverse_transform.
sequence_tokens = merged_df['MovieID_encoded'] + 1
user_sequences = sequence_tokens.groupby(merged_df['UserID']).apply(list).tolist()

max_seq_length = 10
# Keeps each user's most recent max_seq_length movies (pad_sequences truncates
# from the front by default) and left-pads shorter histories with 0.
padded_sequences = pad_sequences(user_sequences, maxlen=max_seq_length, padding='pre')

# Build (prefix -> next movie) training pairs from every user row.
X, y = [], []
for seq in padded_sequences:
    for i in range(1, len(seq)):
        # Padding is on the left, so a padded last prefix item means the whole
        # prefix is padding (and the target may be too): nothing to learn from.
        if seq[i - 1] == 0:
            continue
        X.append(seq[:i])
        y.append(seq[i])

X = pad_sequences(X, maxlen=max_seq_length, padding='pre')
y = np.array(y)

# Split into training and testing sets.
# NOTE(review): this is a random split over prefixes, so prefixes of the same
# user land in both sets; a per-user or chronological split would give a
# stricter estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One extra slot for the padding token 0.
vocab_size = num_movies + 1

# Define the RNN model.
model = Sequential()
# mask_zero=True makes the recurrent layer skip the left-padding steps.
model.add(Embedding(input_dim=vocab_size, output_dim=50, mask_zero=True))
model.add(SimpleRNN(50))
# Output index k corresponds to sequence token k (index 0 is never a target).
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model; targets are integer class ids, hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the network for 50 epochs in mini-batches of 64; the held-out split is
# passed as validation data, so its metrics are reported after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=50,
    validation_data=(X_test, y_test),
)

# Final score on the held-out split. With the compiled metrics, evaluate()
# returns the loss followed by the accuracy.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


Epoch 1/50
680/680 ━━━━━━━━━━━━━━━━━━━━ 4s 5ms/step - accuracy: 0.0031 - loss: 7.7358 - val_accuracy: 0.0047 - val_loss: 7.3780
Epoch 2/50
680/680 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.0049 - loss: 7.2724 - val_accuracy: 0.0052 - val_loss: 7.2429
Epoch 3/50
680/680 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.0072 - loss: 7.0515 - val_accuracy: 0.0098 - val_loss: 7.0722
Epoch 4/50
680/680 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.0096 - loss: 6.8164 - val_accuracy: 0.0116 - val_loss: 6.9754
Epoch 5/50
680/680 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.0145 - loss: 6.6160 - val_accuracy: 0.0132 - val_loss: 6.9203
Epoch 6/50
680/680 ━━━━━━━━━━━━━━━━━━━━ 4s 5ms/step - accuracy: 0.0190 - loss: 6.4662 - val_accuracy: 0.0149 - val_loss: 6.8917
Epoch 7/50
[1m680/680[0m 