In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

In [2]:
# Cargar los datasets
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Mostrar las primeras filas del dataset para entender la estructura
train_df.head()

Unnamed: 0,book_title,book_price,rating,reviewer,reviewer_rating,is_verified,genre_ Action,genre_ Activities,genre_ Adult,genre_ Adventure,...,were,what.1,when,which,who,will.1,with.1,would,you.1,your.1
0,Just Because,0.071516,-0.554055,A. Slater,5,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.222732,0.0,0.0,0.0,0.0
1,Just Because,0.071516,-0.554055,A. Slater,5,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.222732,0.0,0.0,0.0,0.0
2,Just Because,0.071516,-0.554055,A. Slater,5,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.222732,0.0,0.0,0.0,0.0
3,"Goodnight, Goodnight Construction Site (Board ...",-1.13658,1.358517,Margaret Zahalka,5,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Goodnight, Goodnight Construction Site (Board ...",-1.13658,1.358517,Margaret Zahalka,5,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Crear la matriz usuario-libro a partir de train_df
user_book_matrix = train_df.pivot_table(index='reviewer', columns='book_title', values='reviewer_rating')

# Calcular el promedio de calificaciones por cada libro
book_rating_means = user_book_matrix.mean()

# Llenar los valores faltantes con el promedio de las calificaciones del libro
user_book_matrix_filled = user_book_matrix.apply(lambda row: row.fillna(book_rating_means))

# Obtener las columnas que representan los géneros
genre_columns = [col for col in train_df.columns if col.startswith('genre_')]

# Crear una matriz que combine los géneros de los libros
book_genres_matrix = train_df.groupby('book_title')[genre_columns].first()

# Concatenar la matriz de géneros con la matriz de calificaciones de libros
user_book_genre_matrix_filled = pd.concat([user_book_matrix_filled, book_genres_matrix], axis=1)

# Revisar las primeras filas de la nueva matriz
user_book_genre_matrix_filled.head()

Unnamed: 0,"A Court of Mist and Fury (A Court of Thorns and Roses, 2)","A Court of Thorns and Roses (A Court of Thorns and Roses, 1)",A Court of Thorns and Roses Paperback Box Set (5 books),"A Court of Wings and Ruin (A Court of Thorns and Roses, 3)",A Little Life,All the Light We Cannot See: A Novel,Atomic Habits: An Easy & Proven Way to Build Good Habits & Break Bad Ones,"Brown Bear, Brown Bear, What Do You See?",Chicka Chicka Boom Boom (Board Book),Demon Copperhead: A Pulitzer Prize Winner,...,genre_Nonfiction,genre_Personal Finance,genre_Picture Book,genre_Picture Books,genre_Romance,genre_Self Help,genre_Self-improvement,genre_Spiritual Warfare,genre_Spirituality,genre_Thriller
A H Kobayashi,,,,,,,,,,,...,,,,,,,,,,
A. K. P.,,,,,,,,,,,...,,,,,,,,,,
A. Slater,,,,,,,,,,,...,,,,,,,,,,
A.B.318,,,,,,,,,,,...,,,,,,,,,,
A.S.,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# Asegurarse de que no haya NaN ni infinitos en la matriz
user_book_genre_matrix_filled = user_book_genre_matrix_filled.replace([np.inf, -np.inf], np.nan)  # Reemplazar infs por NaN
user_book_genre_matrix_filled = user_book_genre_matrix_filled.fillna(0)  # Reemplazar NaNs por 0

# Asegurarse de que los valores son de tipo float
user_book_genre_matrix_filled = user_book_genre_matrix_filled.astype(float)

# Aplicar SVD
U, sigma, Vt = svds(user_book_genre_matrix_filled.values, k=50)

# Convertir sigma a una matriz diagonal
sigma = np.diag(sigma)

# Reconstruir la matriz de calificaciones
predicted_ratings_matrix = np.dot(np.dot(U, sigma), Vt)

# Convertir la matriz reconstruida en un DataFrame
predicted_ratings_df = pd.DataFrame(predicted_ratings_matrix, 
                                    index=user_book_genre_matrix_filled.index, 
                                    columns=user_book_genre_matrix_filled.columns)

# Mostrar las primeras filas de las predicciones
predicted_ratings_df.head()

Unnamed: 0,"A Court of Mist and Fury (A Court of Thorns and Roses, 2)","A Court of Thorns and Roses (A Court of Thorns and Roses, 1)",A Court of Thorns and Roses Paperback Box Set (5 books),"A Court of Wings and Ruin (A Court of Thorns and Roses, 3)",A Little Life,All the Light We Cannot See: A Novel,Atomic Habits: An Easy & Proven Way to Build Good Habits & Break Bad Ones,"Brown Bear, Brown Bear, What Do You See?",Chicka Chicka Boom Boom (Board Book),Demon Copperhead: A Pulitzer Prize Winner,...,genre_Nonfiction,genre_Personal Finance,genre_Picture Book,genre_Picture Books,genre_Romance,genre_Self Help,genre_Self-improvement,genre_Spiritual Warfare,genre_Spirituality,genre_Thriller
A H Kobayashi,1.399516e-15,2.822698e-16,-1.270756e-15,1.020734e-15,6.16828e-17,1.161061e-30,3.458629e-17,1.310615e-15,-2.895927e-16,-1.651603e-31,...,7.536254e-31,5.742365e-32,-1.011808e-31,-1.944065e-31,-3.811088e-32,6.072956e-32,3.103294e-32,-1.148311e-31,-6.972519e-32,3.8331230000000003e-31
A. K. P.,0.0743492,-0.06367046,7.244335e-17,-0.09607008,-0.02400424,-1.959493e-16,5.168824e-16,-0.3117077,-0.07519223,-1.9708020000000003e-17,...,6.988332000000001e-17,-1.126544e-17,2.490298e-18,-5.793116e-19,4.949401e-18,-1.6103710000000003e-17,-2.934973e-18,1.218727e-17,1.705795e-18,-1.955307e-17
A. Slater,7.553439e-18,-1.005169e-16,-5.244143e-15,2.017932e-16,-3.907669e-17,-9.056838e-18,-2.364022e-15,1.072187e-15,3.681951e-16,1.050619e-17,...,5.859407e-18,-1.0525110000000001e-17,-1.3516590000000001e-17,2.069138e-17,1.5644e-17,-7.458719000000001e-17,-1.461191e-17,1.431483e-17,3.3001930000000004e-17,2.474723e-17
A.B.318,-0.001048089,-0.08738537,-2.305491e-16,-0.07781044,0.007557846,6.989007e-16,1.556079e-16,0.1214294,-0.08251057,5.862595e-16,...,-1.683413e-16,2.615857e-17,1.18841e-18,-8.551917000000001e-18,-1.111034e-17,8.669059e-17,1.4780110000000002e-17,-2.573022e-17,-3.223377e-17,-2.217165e-16
A.S.,-0.08940232,0.290463,-1.51953e-15,0.004253371,-0.2500913,-2.810551e-16,1.330744e-15,-0.002066735,-0.1103981,-6.429607e-16,...,-4.140033e-16,3.4459790000000004e-17,-4.7998620000000005e-17,1.133589e-16,1.4019840000000002e-17,1.0520160000000001e-17,-6.892431e-18,-4.545966e-17,-2.672037e-17,-2.981505e-16


In [5]:
# Seleccionar solo las columnas de los libros en la matriz de predicciones (excluyendo los géneros)
book_columns = user_book_matrix_filled.columns  # Columnas correspondientes a los libros
train_predicted_books = predicted_ratings_df[book_columns].reindex(user_book_matrix_filled.index).fillna(0).values

# Calcular RMSE
def rmse(predicted, actual):
    # Filtrar los valores que no son 0 en la matriz real
    predicted_flat = predicted[actual != 0]
    actual_flat = actual[actual != 0]
    return np.sqrt(mean_squared_error(actual_flat, predicted_flat))

# Obtener las calificaciones reales y predichas del conjunto de entrenamiento
train_actual = user_book_matrix_filled.values

# Calcular el RMSE en el conjunto de entrenamiento (solo libros)
print(f"RMSE del modelo en el conjunto de entrenamiento: {rmse(train_predicted_books, train_actual)}")

ValueError: Input contains NaN.

In [None]:
# Obtener las calificaciones reales y predichas del conjunto de prueba
test_user_book_matrix = test_df.pivot_table(index='reviewer', columns='book_title', values='reviewer_rating')
test_user_book_matrix_filled = test_user_book_matrix.fillna(0)
test_predicted = predicted_ratings_df.reindex(test_user_book_matrix_filled.index).fillna(0)[test_user_book_matrix_filled.columns].fillna(0).values

# Calcular el RMSE en el conjunto de prueba
test_actual = test_user_book_matrix_filled.values
print(f"RMSE del modelo en el conjunto de prueba: {rmse(test_predicted, test_actual)}")

In [None]:
# Histograma para comparar las calificaciones reales y las predichas en el conjunto de prueba
plt.figure(figsize=(10,6))
plt.hist(test_actual[test_actual != 0].flatten(), bins=50, alpha=0.5, label='Calificaciones Reales', color='blue')
plt.hist(test_predicted[test_actual != 0].flatten(), bins=50, alpha=0.5, label='Calificaciones Predichas', color='red')
plt.title('Distribución de Calificaciones: Reales vs Predichas (Conjunto de Prueba)')
plt.xlabel('Calificación')
plt.ylabel('Frecuencia')
plt.legend()
plt.show()

In [None]:
# Función para obtener recomendaciones para un usuario
def get_top_recommendations(user_id, original_ratings_matrix, predicted_ratings_matrix, num_recommendations=5):
    # Obtener los libros que el usuario ya ha calificado
    user_rated_books = original_ratings_matrix.loc[user_id].dropna().index.tolist()
    
    # Obtener las predicciones para ese usuario
    user_predictions = predicted_ratings_matrix.loc[user_id].sort_values(ascending=False)
    
    # Filtrar los libros que el usuario no ha calificado
    recommendations = user_predictions[~user_predictions.index.isin(user_rated_books)].head(num_recommendations)
    
    return recommendations

# Ejemplo de uso: Obtener recomendaciones para un usuario específico
example_user = "A H Kobayashi"
top_recommendations = get_top_recommendations(example_user, user_book_matrix, predicted_ratings_df)

# Mostrar las recomendaciones
print("Libros recomendados para A H Kobayashi:")
print(top_recommendations)