# Sistema Basados en Contenido

In [2]:
#importar los modulos o librerias
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm 

In [3]:
# constantes
PATH = 'data.csv'

#### Importar Datos

In [4]:
#Se cargan los datos, con la funcion shape 
#encontrar la dimension de un array
df = pd.read_csv(PATH)
df.shape

(100000, 10)

In [5]:
#Se presentan los primeros 5 valores del dataframe
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
0,655,52,4,11482,300,4,8,2012,94,7
1,2713,90,3,6479,469,1,8,2012,33,5
2,409,17,2,25472,435,1,12,2001,196,4
3,1150,234,10,23950,529,2,23,2019,79,2
4,2424,390,5,13046,395,2,20,2010,200,4


### Recomendación de libros

In [6]:
def normalize(data):
    '''
    Esta función normalizará los datos de entrada para que estén entre 0 y 1
    
    parametros:
        data (List) : La lista de valores a normalizar
    
    returns:
        Los datos de entrada normalizados entre 0 y 1
    '''
    min_val = min(data)
    if min_val < 0:
        data = [x + abs(min_val) for x in data]
    max_val = max(data)
    return [x/max_val for x in data]

In [7]:
# normaliza las columnas num_pages, ratings, price columns
df['num_pages_norm'] = normalize(df['num_pages'].values)
df['book_rating_norm'] = normalize(df['book_rating'].values)
df['book_price_norm'] = normalize(df['book_price'].values)

In [8]:
#Esta función codificará en caliente la columna especificada y la volverá a agregar
#en el marco de datos de entrada
def ohe(df, enc_col):
    '''
    params:
        df (DataFrame) : El marco de datos al que desea que se agreguen los resultados
        enc_col (String) : La columna que quieres OHE
    
    returns:
        Las columnas OHE agregadas al marco de datos de entrada
    '''
    
    ohe_df = pd.get_dummies(df[enc_col])
    ohe_df.reset_index(drop = True, inplace = True)
    return pd.concat([df, ohe_df], axis = 1)

In [9]:
# OHE en publicación_año y género
df = ohe(df = df, enc_col = 'publish_year')
df = ohe(df = df, enc_col = 'book_genre')
df = ohe(df = df, enc_col = 'text_lang')

In [10]:
# descartar columnas redundantes
cols = ['publish_year', 'book_genre', 'num_pages', 'book_rating', 'book_price', 'text_lang']
df.drop(columns = cols, inplace = True)
df.set_index('book_id', inplace = True)

In [11]:
class CBRecommend():
    def __init__(self, df):
        self.df = df
        
    def cosine_sim(self, v1,v2):
        '''
        Esta función calculará la similitud del coseno entre dos vectores.
        '''
        return dot(v1,v2)/(norm(v1)*norm(v2))
    
    def recommend(self, book_id, n_rec):
        """
        df (dataframe): El dataframe
        song_id (string): Representa el nombre de la canción
        n_rec (int): cantidad de rec que el usuario quiere
        """
        
        # calcular la similitud del vector book_id de entrada con todos los demás vectores
        inputVec = self.df.loc[book_id].values
        self.df['sim']= self.df.apply(lambda x: self.cosine_sim(inputVec,x.values), axis=1)
        
        # devuelve los primeros n libros especificados por el usuario
        return self.df.nlargest(columns='sim',n=n_rec)

In [12]:
#Se ejecuta una muestra con un ejemplo
t = df.sample(1000).copy()
cbr = CBRecommend(df = t)

In [13]:
#Se muestra el dataframe de los libros con los datos correspondientes
cbr.df.head()

Unnamed: 0_level_0,author_id,reader_id,publisher_id,num_pages_norm,book_rating_norm,book_price_norm,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
408,23,19099,29,0.814286,0.8,0.97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
540,54,28105,28,0.684286,0.1,0.805,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2559,312,17633,49,0.854286,0.5,0.28,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1638,332,16706,6,0.387143,0.5,0.935,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2547,9,8749,9,0.191429,0.2,0.845,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0


In [14]:
#Se mueestran los n libros especificados por el usuario
print(cbr.recommend(book_id = t.index[0], n_rec = 5))

         author_id  reader_id  publisher_id  num_pages_norm  ...  5  6  7  sim
book_id                                                      ...              
408             23      19099            29        0.814286  ...  0  0  0  1.0
1439            15      13605            22        0.178571  ...  0  1  0  1.0
2608            28      26143            36        0.910000  ...  1  0  0  1.0
2199            28      24322            32        0.985714  ...  0  0  0  1.0
1091            32      26564            48        0.707143  ...  1  0  0  1.0

[5 rows x 46 columns]
