# Neural Collaborative Filtering

In [33]:
import urllib.request

import numpy as np
import pandas as pd
import kagglehub, os


from keras.models import Model
from keras.layers import Embedding, Flatten, Input, Dense, Concatenate, Dot

In [34]:
path = kagglehub.dataset_download("ruchi798/bookcrossing-dataset")
print("Path to dataset files:", path)

Path to dataset files: /Users/dani22/.cache/kagglehub/datasets/ruchi798/bookcrossing-dataset/versions/3


In [35]:
book_reviews_path = os.path.join(path, 'Book reviews', 'Book reviews')

book_ratings_path = os.path.join(book_reviews_path, 'BX-Book-Ratings.csv')
books_path = os.path.join(book_reviews_path, 'BX_Books.csv')
users_path = os.path.join(book_reviews_path, 'BX-Users.csv')

data_path = os.path.join(path, 'Books Data with Category Language and Summary', 'Preprocessed_data.csv')

In [36]:
book_ratings_df = pd.read_csv(book_ratings_path, sep=';', encoding='latin-1')
book_ratings_df

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [37]:
books_df = pd.read_csv(books_path, sep=';', encoding='latin-1')
books_df

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271375,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271377,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [38]:
users_df = pd.read_csv(users_path, sep=';', encoding='latin-1')
users_df

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278853,278854,"portland, oregon, usa",
278854,278855,"tacoma, washington, united kingdom",50.0
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",


In [39]:
assert users_df['User-ID'].duplicated().sum() == 0
assert users_df['User-ID'].is_monotonic_increasing

In [40]:
book_ratings_df['User-ID'] -= 1
users_df['User-ID'] -= 1

In [41]:
users_df.index.rename('User-ID', inplace=True)
users_df.drop(columns='User-ID', inplace=True)
users_df

Unnamed: 0_level_0,Location,Age
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"nyc, new york, usa",
1,"stockton, california, usa",18.0
2,"moscow, yukon territory, russia",
3,"porto, v.n.gaia, portugal",17.0
4,"farnborough, hants, united kingdom",
...,...,...
278853,"portland, oregon, usa",
278854,"tacoma, washington, united kingdom",50.0
278855,"brampton, ontario, canada",
278856,"knoxville, tennessee, usa",


In [42]:
ISBN_to_index = pd.Series(books_df.index, index=books_df['ISBN'])
ISBN_to_index

ISBN
0195153448         0
0002005018         1
0060973129         2
0374157065         3
0393045218         4
               ...  
0440400988    271374
0525447644    271375
006008667X    271376
0192126040    271377
0767409752    271378
Length: 271379, dtype: int64

In [43]:
nonexistent_bookratings = pd.isna(book_ratings_df.ISBN.map(ISBN_to_index))
book_ratings_df = book_ratings_df[~nonexistent_bookratings]

In [44]:
book_ratings_df['book_index'] = book_ratings_df.ISBN.map(ISBN_to_index)
book_ratings_df.drop(columns='ISBN', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_ratings_df['book_index'] = book_ratings_df.ISBN.map(ISBN_to_index)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_ratings_df.drop(columns='ISBN', inplace=True)


In [45]:
book_ratings_df

Unnamed: 0,User-ID,Book-Rating,book_index
0,276724,0,2966
1,276725,5,225829
2,276726,0,11054
3,276728,3,246854
4,276728,6,246855
...,...,...,...
1149774,276703,0,69546
1149775,276703,9,69547
1149776,276705,0,52543
1149777,276708,10,15979


In [46]:
NUM_USERS = users_df.shape[0]
NUM_ITEMS = books_df.shape[0]

NUM_USERS, NUM_ITEMS

(278858, 271379)

In [47]:
book_ratings_df

Unnamed: 0,User-ID,Book-Rating,book_index
0,276724,0,2966
1,276725,5,225829
2,276726,0,11054
3,276728,3,246854
4,276728,6,246855
...,...,...,...
1149774,276703,0,69546
1149775,276703,9,69547
1149776,276705,0,52543
1149777,276708,10,15979


In [49]:
from sklearn.model_selection import train_test_split

# Entradas y salidas
user_ids = book_ratings_df['User-ID'].values
book_ids = book_ratings_df['book_index'].values
ratings = book_ratings_df['Book-Rating'].values

# Combinar user y book en una sola matriz (n_samples, 2)
X = np.stack([user_ids, book_ids], axis=1)  # shape: (n, 2)
y = ratings  # shape: (n,)

# División de datos
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=33
)

# Separar de nuevo para alimentar el modelo
X_train = [X_train[:, 0], X_train[:, 1]]
X_test = [X_test[:, 0], X_test[:, 1]]


In [57]:
latent_dim = 5
epochs = 5

### Modelo básico usando GMF

In [58]:
user_input = Input(shape=[1], name='user_input')
user_embedding = Embedding(NUM_USERS, latent_dim, name='user_embedding')(user_input)
user_vec = Flatten()(user_embedding)

item_input = Input(shape=[1], name='book_input')
item_embedding = Embedding(NUM_ITEMS, latent_dim, name='book_embedding')(item_input)
item_vec = Flatten()(item_embedding)

output = Dot(axes=1)([user_vec, item_vec])

In [59]:
# Model

GMF = Model([user_input, item_input], output)
GMF.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
GMF.summary()

In [60]:
GMF.fit(X_train, y_train, epochs=epochs, verbose=1, validation_split=0.1)

Epoch 1/5
[1m23202/23202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 7ms/step - loss: 22.9666 - mae: 2.8472 - val_loss: 22.6146 - val_mae: 2.8461
Epoch 2/5
[1m23202/23202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 7ms/step - loss: 22.1724 - mae: 2.8413 - val_loss: 21.6355 - val_mae: 2.8721
Epoch 3/5
[1m23202/23202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 7ms/step - loss: 20.5351 - mae: 2.8069 - val_loss: 20.9175 - val_mae: 2.8824
Epoch 4/5
[1m23202/23202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 7ms/step - loss: 19.0072 - mae: 2.7375 - val_loss: 20.4629 - val_mae: 2.8904
Epoch 5/5
[1m23202/23202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 8ms/step - loss: 17.5809 - mae: 2.6506 - val_loss: 20.1790 - val_mae: 2.9003


<keras.src.callbacks.history.History at 0x16686b740>

In [None]:
# Podría correrlo con más epochs pero no va a mejorar tanto como si nos centramos en MLP

In [61]:
y_pred = GMF.predict(X_test)

[1m6445/6445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 268us/step


In [62]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.4f}")

MAE: 2.8973


## Multi Layer Perceptron (MLP)


### Primer Modelo

In [63]:
latent_dim = 10
epochs = 10

In [None]:
# Inputs
user_input = Input(shape=(1,))
item_input = Input(shape=(1,))

# Embeddings
user_embedding = Embedding(NUM_USERS, latent_dim)(user_input)
item_embedding = Embedding(NUM_ITEMS, latent_dim)(item_input)

# Flatten
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

# Concatenar
concat = Concatenate(axis=1)([user_vec, item_vec])

x = Dense(20, activation='relu')(concat)
# x = Dropout(0.3)(x) Podríamos usarlo para evitar overfitting
x = Dense(10, activation='relu')(x)
# x = Dropout(0.3)(x)
x = Dense(5, activation='relu')(x)


output = Dense(1, activation='linear')(x)  # Usamos 'linear' porque ratings no están entre 0-1

In [65]:
from keras.optimizers import Adam

MLP = Model(inputs=[user_input, item_input], outputs=output)
MLP.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mae'])
MLP.summary()
MLP.fit(X_train, y_train, epochs=epochs, batch_size=256, verbose=1, validation_split=0.1)

Epoch 1/10
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 16ms/step - loss: 13.4535 - mae: 2.8959 - val_loss: 11.3192 - val_mae: 2.7245
Epoch 2/10
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 16ms/step - loss: 9.4011 - mae: 2.3231 - val_loss: 11.8361 - val_mae: 2.6943
Epoch 3/10
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 16ms/step - loss: 7.7849 - mae: 1.9710 - val_loss: 12.6182 - val_mae: 2.6911
Epoch 4/10
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 16ms/step - loss: 6.6723 - mae: 1.7140 - val_loss: 12.9815 - val_mae: 2.7246
Epoch 5/10
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 17ms/step - loss: 6.0087 - mae: 1.5679 - val_loss: 13.5795 - val_mae: 2.7700
Epoch 6/10
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 16ms/step - loss: 5.5080 - mae: 1.4610 - val_loss: 13.7840 - val_mae: 2.7539
Epoch 7/10
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x167ef94f0>

In [66]:
y_pred = MLP.predict(X_test)

[1m6445/6445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 297us/step


In [None]:
from sklearn.metrics import mean_absolute_error
print("MAE:", mean_absolute_error(y_test, y_pred))

MAE: 2.821961730230036


### Segundo Modelo

In [68]:
latent_dim = 5
epochs = 5

In [72]:
from tensorflow.keras.layers import Dropout

user_input2 = Input(shape=(1,))
item_input2 = Input(shape=(1,))

# Embeddings
user_embedding2 = Embedding(NUM_USERS, latent_dim)(user_input2)
item_embedding2 = Embedding(NUM_ITEMS, latent_dim)(item_input2)

# Flatten
user_vec2 = Flatten()(user_embedding2)
item_vec2 = Flatten()(item_embedding2)

# Concatenar
concat2 = Concatenate(axis=1)([user_vec2, item_vec2])

x2 = Dense(32, activation='relu')(concat2)
x2 = Dropout(0.3)(x2) # Evitar overfitting
x2 = Dense(16, activation='relu')(x2)
x2 = Dropout(0.3)(x2)
x2 = Dense(8, activation='relu')(x2)

output2 = Dense(1, activation='linear')(x2)  # Usamos 'linear' porque ratings no están entre 0-1

In [73]:
MLP = Model(inputs=[user_input2, item_input2], outputs=output2)
MLP.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mae'])
MLP.summary()
MLP.fit(X_train, y_train, epochs=epochs, batch_size=256, verbose=1, validation_split=0.1)

Epoch 1/5
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 8ms/step - loss: 13.6090 - mae: 2.9667 - val_loss: 11.3624 - val_mae: 2.7866
Epoch 2/5
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 8ms/step - loss: 9.9388 - mae: 2.4454 - val_loss: 11.7762 - val_mae: 2.7934
Epoch 3/5
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 8ms/step - loss: 8.3689 - mae: 2.1128 - val_loss: 12.4287 - val_mae: 2.8130
Epoch 4/5
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 8ms/step - loss: 7.4320 - mae: 1.9031 - val_loss: 12.8670 - val_mae: 2.8345
Epoch 5/5
[1m2901/2901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 8ms/step - loss: 6.7302 - mae: 1.7411 - val_loss: 13.3054 - val_mae: 2.8671


<keras.src.callbacks.history.History at 0x3525dcc50>

In [74]:
y_pred = MLP.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))

[1m6445/6445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 308us/step
MAE: 2.853543027529882


### Tercer Modelo

In [75]:
# Usamos GMF + MLP

In [86]:
latent_dim = 7
epochs = 4

In [87]:
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2

# Normalizamos las puntuaciones entre 0 y 1
y_train_norm = y_train / 10.0
y_test_norm = y_test / 10.0

user_input = Input(shape=(1,))
item_input = Input(shape=(1,))

# Embeddings
user_embedding = Embedding(NUM_USERS, latent_dim)(user_input)
item_embedding = Embedding(NUM_ITEMS, latent_dim)(item_input)

user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

# GMF Branch
gmf_vec = Dot(axes=1)([user_vec, item_vec])

# MLP Branch (concatenación + capas densas)
mlp_input = Concatenate()([user_vec, item_vec])
mlp_dense = Dense(64, activation='relu', kernel_regularizer=l2(0.001))(mlp_input)
mlp_dense = BatchNormalization()(mlp_dense)
mlp_dense = Dropout(0.4)(mlp_dense)
mlp_dense = Dense(32, activation='relu', kernel_regularizer=l2(0.001))(mlp_dense)
mlp_dense = BatchNormalization()(mlp_dense)

# Fusionar GMF + MLP
fusion = Concatenate()([gmf_vec, mlp_dense])
output = Dense(1, activation='sigmoid')(fusion)

# Modelo final
model = Model(inputs=[user_input, item_input], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

In [88]:
model.fit([X_train[0], X_train[1]], y_train_norm, epochs=epochs, verbose=1)

Epoch 1/4
[1m25780/25780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 12ms/step - loss: 0.1421 - mae: 0.3047
Epoch 2/4
[1m25780/25780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 12ms/step - loss: 0.1054 - mae: 0.2541
Epoch 3/4
[1m25780/25780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m305s[0m 12ms/step - loss: 0.0941 - mae: 0.2287
Epoch 4/4
[1m25780/25780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 12ms/step - loss: 0.0826 - mae: 0.2021


<keras.src.callbacks.history.History at 0x386e87380>

In [89]:
y_pred_norm = model.predict([X_test[0], X_test[1]])
y_pred = y_pred_norm.flatten() * 10  # Reescalar las predicciones
y_test_orig = y_test_norm * 10       # Reescalar el test real

[1m6445/6445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 335us/step


In [90]:
mae = mean_absolute_error(y_test_orig, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")

Mean Absolute Error: 2.6603
