In [101]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.feature_extraction.text import CountVectorizer
import ast
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/NumPy deprecation and performance
# warnings raised by the cells below — consider filtering specific categories.
warnings.filterwarnings('ignore') 

# Theme names mapped to an integer.
# Only the keys are used in this part of the notebook: they are passed as the
# tag whitelist (`filter_list`) to `tags_to_cols`.
# NOTE(review): the integer values are presumably the source site's theme
# identifiers — TODO confirm; nothing visible here reads them.
THEMES = {
 'amitie': 373,
 'amour': 77,
 'autobiographie': 45,
 'aventure': 33,
 'bande-dessinee': 18,
 'biographie': 31,
 'cinema': 89,
 'classique': 28,
 'comedie-romantique': 3788,
 'comics': 142,
 'drogue': 863,
 'dystopie': 879,
 'emotion': 4100,
 'enquetes': 3988,
 'entretiens': 708,
 'essai': 13,
 'famille': 290,
 'fantastique': 7,
 'fantasy': 4,
 'geographie': 187,
 'guerre': 91,
 'humour': 15,
 'humour-noir': 621,
 'jeunesse': 14,
 'journalisme': 975,
 'litterature-americaine': 9,
 'litterature-asiatique': 1064,
 'litterature-francaise': 3,
 'manga': 12,
 'musique': 44,
 'nouvelles': 23,
 'peur': 430,
 'poesie': 25,
 'politique': 54,
 'psychologie': 65,
 'racisme': 906,
 'recit-de-voyage': 448,
 'religion': 26,
 'reportage': 3488,
 'reseaux-sociaux': 26109,
 'roman': 1,
 'roman-fantastique': 912,
 'roman-noir': 136,
 'romans-policiers-et-polars': 63883,
 'science-fiction': 6,
 'sentiments': 1770,
 'serie': 788,
 'theatre': 21,
 'thriller': 11,
 'thriller-psychologique': 1073,
 'tragedie': 601,
 'western': 533
}


# Merging files and datasets

In [102]:
def merge_data_files(books_file_list:list, books_meta_file=None):
    """Concatenate JSON-lines comment files and optionally join book metadata.

    Parameters
    ----------
    books_file_list : list
        Paths of JSON-lines files, each read with ``pd.read_json(lines=True)``
        and stacked row-wise in the given order.
    books_meta_file : path, optional
        JSON-lines file holding one record per book. When given, the stacked
        frame is inner-joined to it on ``book_id``. When ``None`` the stacked
        frame is returned as is (no column dropped, no join).

    Returns
    -------
    pandas.DataFrame
        The stacked comment rows, joined to the metadata when a metadata file
        is supplied. Rows whose ``book_id`` is absent from the metadata are
        dropped by the inner join.

    Raises
    ------
    ValueError
        If ``books_file_list`` is empty.
    """
    if not books_file_list:
        raise ValueError("books_file_list must contain at least one file")

    # Read every file first and concatenate once, instead of re-copying a
    # growing frame on each iteration.
    frames = [pd.read_json(filename, lines=True) for filename in books_file_list]
    df_books = pd.concat(frames, ignore_index=True)

    if books_meta_file is None:
        # Nothing to join against: hand back the stacked comment rows.
        return df_books

    # Drop the comment files' own 'tags' column before the join — presumably so
    # that the metadata file's 'tags' column is the one kept (TODO confirm).
    df_books = df_books.drop(['tags'], axis=1)

    # Join to the book metadata, minus the columns the comment files already carry.
    df_meta = pd.read_json(books_meta_file, lines=True)
    df_meta = df_meta.drop(
        ['book_nb_comm', 'title', 'name', 'surname', 'img_url', 'book_date'],
        axis=1,
    )
    df_books = df_books.merge(df_meta, on='book_id', how='inner')

    # TODO: profile gender per user_id
    return df_books

In [4]:
# Stack the two comment files and inner-join them with the book metadata file.
df_comm = merge_data_files(['../output/books-julien.json','../output/books-rebecca.json'],'../output/books-meta-data.json')

In [29]:
# Columns that describe a book (as opposed to a single comment); together they
# form the grouping key used to collapse the comment-level frame.
columns_book = ['book_id', 'book_url', 'book_nb_comm', 'title', 'name', 'surname',
       'tags', 'img_url', 'book_rating_count', 'book_rating_value',
       'book_author_url', 'book_editor', 'book_pages']


def reduce_comm_to_books(df):
    """Collapse a comment-level frame to one row per distinct book.

    Rows are grouped on every column listed in ``columns_book``; the result
    holds exactly those columns, one row per distinct combination, sorted by
    the grouping keys. ``df`` itself is not modified.

    NOTE(review): ``groupby`` drops rows where any key column is NaN (pandas
    default ``dropna=True``), so a book with, say, a missing editor disappears
    from the result — confirm this is intended.
    """
    # groupby never mutates its input, so no defensive copy is needed, and
    # size() avoids counting every non-key column only to discard the counts.
    per_book = df.groupby(columns_book, as_index=False).size()
    return per_book.loc[:, columns_book]

In [42]:
# Collapse the comment-level frame to one row per book, then display it.
df_books = reduce_comm_to_books(df_comm)
df_books

Unnamed: 0,book_id,book_url,book_nb_comm,title,name,surname,tags,img_url,book_rating_count,book_rating_value,book_author_url,book_editor,book_pages
0,1497,https://www.babelio.com/livres/Kerouac-Sur-la-...,261,Sur la route,Jack,Kerouac,"[['roman', 22], ['classique', 18], ['autobiogr...",/couv/CVT_Sur-la-route_7854.jpg,3444,3.73,/auteur/Jack-Kerouac/1924,Gallimard,436
1,1499,https://www.babelio.com/livres/Calvino-Le-Baro...,122,Le Baron perché,Italo,Calvino,"[['roman', 23], ['fantastique', 15], ['roman h...",/couv/CVT_5953_1791546.jpg,1681,4.01,/auteur/Italo-Calvino/1927,Seuil,0
2,1508,https://www.babelio.com/livres/Guibert-Le-Phot...,57,"Le Photographe, tome 1",Emmanuel,Guibert,"[['journalisme', 15], ['documentaire', 15], ['...",/couv/CVT_11171_1289752.jpg,453,4.39,/auteur/Emmanuel-Guibert/1939,Dupuis,80
3,1523,https://www.babelio.com/livres/Zweig-Fouche/1523,75,Fouché,Stefan,Zweig,"[['roman', 15], ['historique', 15], ['essai', ...",/couv/CVT_Fouche_6481.jpeg,646,4.31,/auteur/Stefan-Zweig/1963,Le Livre de Poche,284
4,1526,https://www.babelio.com/livres/Zweig-Amok-ou-L...,157,Amok ou Le fou de Malaisie,Stefan,Zweig,"[['roman', 23], ['fantastique', 15], ['nouvell...",https://images-na.ssl-images-amazon.com/images...,1712,3.98,/auteur/Stefan-Zweig/1963,Le Livre de Poche,190
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4561,1462096,https://www.babelio.com/livres/-Les-chevaliers...,38,"Les chevaliers d'Emeraude, tome 5 : L'île des ...",Anne,Robillard,"[['aventure', 19], ['saga', 17], ['roman', 19]...",https://m.media-amazon.com/images/I/51+3H3yolR...,1110,3.82,/auteur/Anne-Robillard/7268,Michel Lafon,416
4562,1462133,https://www.babelio.com/livres/-Vampire-Academ...,98,"Vampire Academy, tome 2 : Morsure de glace",Richelle,Mead,"[['serie', 14], ['saga', 16], ['aventure', 15]...",/couv/CVT_Vampire-Academy--Morsure-de-glace_32...,656,4.26,/auteur/Richelle-Mead/65475,Castelmore,416
4563,1466000,https://www.babelio.com/livres/-Le-Nom-de-la-R...,338,Le Nom de la Rose,Umberto,Eco,"[['roman', 24], ['historique', 17], ['roman hi...",https://images-na.ssl-images-amazon.com/images...,4311,4.31,/auteur/Umberto-Eco/2529,Grasset,549
4564,1467600,https://www.babelio.com/livres/-A-la-recherche...,37,"A la recherche du temps perdu, tome 4 : Sodome...",Marcel,Proust,"[['roman', 24], ['classique', 22], ['autobiogr...",/couv/CVT_A-la-recherche-du-temps-perdu-tome-4...,521,4.33,/auteur/Marcel-Proust/2103,Gallimard,890


We now have two datasets at two different aggregation levels: `df_comm` (comment level) and `df_books` (book level).

# Preprocessing df_books

In [87]:
def tags_to_cols(df, col_name, filter_list=None, filter_force_min=24):
    """Expand a column of ``[name, value]`` tag pairs into ``tag_<name>`` columns.

    A tag is kept when its stripped name is in ``filter_list`` OR its value is
    at least ``filter_force_min``; this discards rare, unimportant tags. When
    ``filter_list`` is ``None`` every tag is kept and ``filter_force_min`` is
    not consulted.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Column holding, per row, the tag pairs — either as a string parsed with
        ``ast.literal_eval`` or as an already-parsed sequence. Missing cells
        (``None`` / NaN) are treated as "no tags".
    filter_list : iterable of str, optional
        Tag names that are always kept (compared to the stripped, otherwise
        unmodified tag name).
    filter_force_min : number
        Minimum value for a tag that is not in ``filter_list`` to be kept.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one ``tag_<name>`` column per kept tag (name
        stripped, spaces replaced by ``_``, lower-cased), in order of first
        appearance, holding the tag value where the row has that tag and NaN
        elsewhere. A ``tag_*`` column that already exists in ``df`` is updated
        in place: cells set by this call are overwritten, the others are kept.
    """
    allowed = None if filter_list is None else set(filter_list)

    # tag column -> {row position: value}. Rows are addressed by position so
    # that frames with repeated index labels do not cross-assign tags, and the
    # dict's insertion order fixes the final column order.
    cells = {}
    for pos, raw in enumerate(df[col_name]):
        if isinstance(raw, str):
            tags = ast.literal_eval(raw)
        elif raw is None or (isinstance(raw, float) and raw != raw):
            continue  # missing cell: no tags for this row
        else:
            tags = raw

        for tag in tags:
            name, value = tag[0].strip(), tag[1]
            if allowed is not None and name not in allowed and value < filter_force_min:
                continue
            column = 'tag_' + name.replace(' ', '_').lower()
            cells.setdefault(column, {})[pos] = value

    base = df.reset_index(drop=True)
    # Build all tag columns at once: inserting them one by one fragments the
    # frame and is quadratic in practice.
    tag_frame = pd.DataFrame(
        {column: pd.Series(values) for column, values in cells.items()},
        index=base.index,
    )

    # Columns already present (e.g. the function is re-run on its own output):
    # overwrite only the cells set now, keep everything else.
    clashes = [column for column in tag_frame.columns if column in base.columns]
    for column in clashes:
        fresh = tag_frame[column]
        base[column] = fresh.where(fresh.notna(), base[column])
    tag_frame = tag_frame.drop(columns=clashes)

    result = pd.concat([base, tag_frame], axis=1)
    result.index = df.index
    return result

In [103]:
# Expand the 'tags' column into tag_* columns: keep a tag when its name is a
# THEMES key or its value is at least 24.
df_books = tags_to_cols(df_books, col_name='tags', filter_list=list(THEMES.keys()), filter_force_min=24)

In [104]:
# Display the expanded frame; a tag_* cell is NaN when the book lacks that tag.
df_books

Unnamed: 0,book_id,book_url,book_nb_comm,title,name,surname,tags,img_url,book_rating_count,book_rating_value,...,tag_asexualité,tag_aromantisme,tag_licorne,tag_alopécie,tag_héritage,tag_mystère,tag_destin,tag_gangs,tag_littérature_algérienne,tag_identité
0,1497,https://www.babelio.com/livres/Kerouac-Sur-la-...,261,Sur la route,Jack,Kerouac,"[['roman', 22], ['classique', 18], ['autobiogr...",/couv/CVT_Sur-la-route_7854.jpg,3444,3.73,...,,,,,,,,,,
1,1499,https://www.babelio.com/livres/Calvino-Le-Baro...,122,Le Baron perché,Italo,Calvino,"[['roman', 23], ['fantastique', 15], ['roman h...",/couv/CVT_5953_1791546.jpg,1681,4.01,...,,,,,,,,,,
2,1508,https://www.babelio.com/livres/Guibert-Le-Phot...,57,"Le Photographe, tome 1",Emmanuel,Guibert,"[['journalisme', 15], ['documentaire', 15], ['...",/couv/CVT_11171_1289752.jpg,453,4.39,...,,,,,,,,,,
3,1523,https://www.babelio.com/livres/Zweig-Fouche/1523,75,Fouché,Stefan,Zweig,"[['roman', 15], ['historique', 15], ['essai', ...",/couv/CVT_Fouche_6481.jpeg,646,4.31,...,,,,,,,,,,
4,1526,https://www.babelio.com/livres/Zweig-Amok-ou-L...,157,Amok ou Le fou de Malaisie,Stefan,Zweig,"[['roman', 23], ['fantastique', 15], ['nouvell...",https://images-na.ssl-images-amazon.com/images...,1712,3.98,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4561,1462096,https://www.babelio.com/livres/-Les-chevaliers...,38,"Les chevaliers d'Emeraude, tome 5 : L'île des ...",Anne,Robillard,"[['aventure', 19], ['saga', 17], ['roman', 19]...",https://m.media-amazon.com/images/I/51+3H3yolR...,1110,3.82,...,,,,,,,,,,
4562,1462133,https://www.babelio.com/livres/-Vampire-Academ...,98,"Vampire Academy, tome 2 : Morsure de glace",Richelle,Mead,"[['serie', 14], ['saga', 16], ['aventure', 15]...",/couv/CVT_Vampire-Academy--Morsure-de-glace_32...,656,4.26,...,,,,,,,,,,
4563,1466000,https://www.babelio.com/livres/-Le-Nom-de-la-R...,338,Le Nom de la Rose,Umberto,Eco,"[['roman', 24], ['historique', 17], ['roman hi...",https://images-na.ssl-images-amazon.com/images...,4311,4.31,...,,,,,,,,,,
4564,1467600,https://www.babelio.com/livres/-A-la-recherche...,37,"A la recherche du temps perdu, tome 4 : Sodome...",Marcel,Proust,"[['roman', 24], ['classique', 22], ['autobiogr...",/couv/CVT_A-la-recherche-du-temps-perdu-tome-4...,521,4.33,...,,,,,,,,,,


# Simple Model

In [109]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class BookReco:
    """Similarity-based book recommender mixing vector and scalar features.

    Usage: register features with :meth:`add_vector` / :meth:`add_scalar`,
    call :meth:`fit` with a frame holding at least ``book_id``, ``title`` and
    the registered columns, then :meth:`predict` for a book and
    :meth:`format_prediction` to get the top matches.

    Rows of the fitted frame are always addressed by position, so the frame's
    index labels are irrelevant.
    """

    def __init__(self):
        self.data = None            # frame given to fit()
        self.vectors = []           # dicts: col_name (prefix), weight, cosine_similar (after fit)
        self.scalars = []           # dicts: col_name, weight, scaled (after fit)
        self.predict_scores = None  # result of the latest predict()
        self._query_pos = None      # row position of the latest predict() query

    def add_vector(self, col_prefix, weight=1):
        """Register every column whose name starts with ``col_prefix`` as one vector feature."""
        self.vectors.append({'col_name': col_prefix, 'weight': weight})

    def add_scalar(self, col_name, weight=1):
        """Register a single numeric column as a scalar feature."""
        self.scalars.append({'col_name': col_name, 'weight': weight})

    def __get_cosine_similarity(self, prefix):
        """Return the row-by-row cosine similarity over the columns starting with ``prefix``.

        Missing values are filled with 0 and each column is min-max scaled
        before the similarity is computed.
        """
        # Literal prefix match: building a regex from the prefix would
        # misbehave as soon as it contains a regex metacharacter.
        mask = [str(column).startswith(prefix) for column in self.data.columns]
        if not any(mask):
            raise ValueError(f"no column starts with {prefix!r}")
        features = self.data.loc[:, mask].fillna(0)
        scaled = MinMaxScaler().fit_transform(features)
        return cosine_similarity(scaled)

    def _position_of(self, book_id):
        """Return the row position (not the index label) of ``book_id`` in the fitted frame.

        Raises ``RuntimeError`` before ``fit`` and ``IndexError`` when the id is unknown.
        """
        if self.data is None:
            raise RuntimeError("fit() must be called before predict()")
        hits = np.flatnonzero(self.data['book_id'].to_numpy() == book_id)
        if hits.size == 0:
            raise IndexError(f"book_id {book_id!r} not found in the fitted data")
        return int(hits[0])

    def fit(self, data):
        """Precompute the similarity matrix of each vector feature and scale each scalar feature.

        ``data`` is stored by reference, not copied.
        """
        self.data = data
        self._query_pos = None

        for i, vector in enumerate(self.vectors):
            similarity = self.__get_cosine_similarity(vector['col_name'])
            self.vectors[i] = {**vector, 'cosine_similar': similarity}

        for i, scalar in enumerate(self.scalars):
            column = self.data.loc[:, [scalar['col_name']]]
            scaled = MinMaxScaler().fit_transform(column)
            self.scalars[i] = {**scalar, 'scaled': scaled.reshape(-1)}

    def predict(self, book_id):
        """Score every fitted book against ``book_id``.

        The score is the weighted sum of the query book's similarity row for
        each vector feature and of each scaled scalar feature, min-max scaled
        at the end. Returns an array of shape ``(n_books, 1)``, also stored in
        ``self.predict_scores``.
        """
        # The similarity matrices are positional, so look the book up by
        # position; using the index label only works for a default RangeIndex.
        position = self._position_of(book_id)
        self._query_pos = position

        parts = [vector['cosine_similar'][position] * vector['weight'] for vector in self.vectors]
        parts += [scalar['scaled'] * scalar['weight'] for scalar in self.scalars]
        if not parts:
            raise ValueError("no feature registered: call add_vector()/add_scalar() before fit()")

        total = np.sum(parts, axis=0)

        # Normalise the summed score.
        self.predict_scores = MinMaxScaler().fit_transform(total.reshape(-1, 1))
        return self.predict_scores

    def format_prediction(self, scores, max_books, exclude_index=None):
        """Return the ``max_books`` best-scored books as ``(position, book_id, title, score)``.

        ``scores`` holds one score per fitted row (typically the output of
        :meth:`predict`); each returned ``score`` is ``scores[position]``
        unchanged. The query book is left out: its row position is
        ``exclude_index`` when given, otherwise the one recorded by the latest
        :meth:`predict`. Only when neither is known is the top-ranked row
        dropped instead, on the assumption that it is the query book.
        """
        if len(scores) == 0:
            return []

        keys = np.asarray(scores, dtype=float).reshape(len(scores), -1)[:, 0]
        # Stable sort on the negated keys: descending, ties in original order.
        order = np.argsort(-keys, kind='stable')

        if exclude_index is None:
            exclude_index = getattr(self, '_query_pos', None)
        if exclude_index is None:
            order = order[1:]
        else:
            # The query book is not guaranteed to rank first (scalar features
            # can push another book above it), so remove it explicitly.
            order = order[order != exclude_index]

        book_ids = self.data['book_id']
        titles = self.data['title']
        return [
            (int(i), book_ids.iloc[int(i)], titles.iloc[int(i)], scores[int(i)])
            for i in order[:max_books]
        ]

In [110]:
# Configure the recommender. Each feature's score is multiplied by its weight
# before summing: tag similarity (10), then rating (5), then comment count (1).
br = BookReco()
br.add_vector('tag_', weight=10)
br.add_scalar('book_rating_value', weight=5)
br.add_scalar('book_nb_comm', weight=1)
br.fit(df_books)

In [111]:
# Score every book against the query book, then list the five best matches as
# (row position, book_id, title, score).
scores = br.predict(1499)  # book_id 1499: "Le Baron perché"
br.format_prediction(scores, max_books=5)

[(15, 1689, "Si par une nuit d'hiver un voyageur", array([0.89015174])),
 (20, 1725, 'Le désert des Tartares', array([0.86871343])),
 (4536, 1436182, 'Le portrait de Dorian Gray', array([0.83112893])),
 (770, 36712, 'Le Petit Prince', array([0.78418679])),
 (491, 10782, 'Le K', array([0.77664584]))]