In [423]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.feature_extraction.text import CountVectorizer
import ast
import warnings

warnings.filterwarnings('ignore') 

pd.set_option("display.max_columns", 100) # ‘None’ value means unlimited.

THEMES = {
 'amitie': 373,
 'amour': 77,
 'autobiographie': 45,
 'aventure': 33,
 'bande-dessinee': 18,
 'biographie': 31,
 'cinema': 89,
 'classique': 28,
 'comedie-romantique': 3788,
 'comics': 142,
 'drogue': 863,
 'dystopie': 879,
 'emotion': 4100,
 'enquetes': 3988,
 'entretiens': 708,
 'essai': 13,
 'famille': 290,
 'fantastique': 7,
 'fantasy': 4,
 'geographie': 187,
 'guerre': 91,
 'humour': 15,
 'humour-noir': 621,
 'jeunesse': 14,
 'journalisme': 975,
 'litterature-americaine': 9,
 'litterature-asiatique': 1064,
 'litterature-francaise': 3,
 'manga': 12,
 'musique': 44,
 'nouvelles': 23,
 'peur': 430,
 'poesie': 25,
 'politique': 54,
 'psychologie': 65,
 'racisme': 906,
 'recit-de-voyage': 448,
 'religion': 26,
 'reportage': 3488,
 'reseaux-sociaux': 26109,
 'roman': 1,
 'roman-fantastique': 912,
 'roman-noir': 136,
 'romans-policiers-et-polars': 63883,
 'science-fiction': 6,
 'sentiments': 1770,
 'serie': 788,
 'theatre': 21,
 'thriller': 11,
 'thriller-psychologique': 1073,
 'tragedie': 601,
 'western': 533
}



# Merging files and datasets

In [435]:
def merge_data_files(books_file_list:list, books_meta_file=None, books_senti_file=None, books_users_file=None):
    df_books = None

    # fusion of mains books data files with comments
    for filename in books_file_list:
        df_books_temp = pd.read_json(filename, lines=True)
        if df_books is None:
            df_books = df_books_temp
        else: df_books = pd.concat([df_books, df_books_temp])

    df_books = df_books.drop(['tags'],axis=1)

    # genre du profil per user_id
    if books_users_file is not None:
        df_users = pd.read_json(books_users_file, lines=True)
        df_books = df_books.merge(df_users, on='user_id', how='left')
    df_books = df_books.fillna('')

    # join to the meta data books file
    if books_meta_file is not None:
        df_meta = pd.read_json(books_meta_file, lines=True)
        df_meta = df_meta.drop(['book_nb_comm', 'title', 'name', 'surname','img_url','book_date'],axis=1)
        df_books = df_books.merge(df_meta, on='book_id', how='inner')

    # join sentiments file
    if books_senti_file is not None:
        df_senti = pd.read_json(books_senti_file, lines=True)
        df_senti = df_senti.drop(['title'],axis=1)
        df_books = df_books.merge(df_senti, on='book_id', how='left')
    df_books = df_books.fillna(0)

    return df_books

In [437]:
df_comm = merge_data_files(
    books_file_list = ['../output/books-julien.json','../output/books-rebecca.json'],
    books_meta_file = '../output/books-meta-data.json',
    books_senti_file = '../output/vecteurs_sentiments.json',
    books_users_file = '../output/users-data.json'
    )

In [438]:
# df_comm['year'] = df_comm['book_date'].str.extract(r'\b(\d{4})\b').replace('1900','').replace('3889',)
# df_comm['year'].unique()

In [453]:
df_comm

Unnamed: 0,book_id,book_nb_comm,title,name,surname,img_url,comm_id,user_id,note,date,appreciations,commentaire,gender,book_url,tags,book_rating_count,book_rating_value,book_author_url,book_editor,book_pages,sen_colère,sen_confiance,sen_honte,sen_joie,sen_peur,sen_réflexion,sen_surprise,sen_tristesse
0,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2080694,706958,4.0,30 novembre 2019,85,Roadtrip désertique dans l'Ouest Américain en ...,,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.292288,0.18784,0.0,0.334643,0.098929,0.137829,0.092294,0.14299
1,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,1271915,319761,5.0,03 mars 2017,68,"De temps en temps, j'aime bien revenir vers l'...",F,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.292288,0.18784,0.0,0.334643,0.098929,0.137829,0.092294,0.14299
2,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,1093854,128814,5.0,27 juin 2016,67,"Depuis peu j'ai découvert grâce à "" Cardabelle...",M,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.292288,0.18784,0.0,0.334643,0.098929,0.137829,0.092294,0.14299
3,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2711430,227017,4.5,22 juillet 2021,64,"Ils sont quatre. \nUne blonde , jeunette, venu...",M,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.292288,0.18784,0.0,0.334643,0.098929,0.137829,0.092294,0.14299
4,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2685514,138438,4.5,26 juin 2021,64,Avez-vous entendu cette info effrayante ? « La...,F,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.292288,0.18784,0.0,0.334643,0.098929,0.137829,0.092294,0.14299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314004,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,1057334,166603,5.0,09 mai 2016,2,"J'avais connu Nicolas avec son livre ""Comme un...",F,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000
314005,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,939776,262506,5.0,07 décembre 2015,2,J'avais lu beaucoup de bien de ce livre sur Am...,,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000
314006,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,3027068,306182,2.5,05 mai 2022,1,"A Duncan's Creek, un des plus petits villages ...",M,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000
314007,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,2027517,14577,,02 octobre 2019,1,C'est pas écrit c'est tapé !,F,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000


In [439]:
# group by per book
columns_senti = [col for col in df_comm.columns if col.startswith('sen_')]
columns_book = ['book_id', 'book_url', 'book_nb_comm', 'title', 'name', 'surname',
       'tags', 'img_url', 'book_rating_count', 'book_rating_value',
       'book_author_url', 'book_editor', 'book_pages', *columns_senti]

def reduce_comm_to_books(df):
    return df.copy().groupby(columns_book, as_index=False).count().loc[:,columns_book]

In [440]:
df_books = reduce_comm_to_books(df_comm)

We have now 2 dataset : df_comm and df_books with 2 differents aggregate levels

# Preprocessing df_books

In [441]:
# on garde que les tags avec le nom dans filter_list ou si la valeur est supérieur à filter_force_min. ca permet de rétirer les tags rares et peu importants
def tags_to_cols(df, col_name, filter_list=None, filter_force_min=24):
    df1 = df.copy()

    for index,row in df.iterrows():
        tags_as_string = row[col_name]
        tags = ast.literal_eval(tags_as_string)

        for tag in tags:
            if filter_list is not None and not tag[0].strip() in filter_list and tag[1] < filter_force_min:
                continue
            tag_name = 'tag_'+tag[0].strip().replace(' ','_').lower()
            df1.loc[df1.index == index, tag_name] = tag[1]

    df1 = df1.fillna(0)
    return df1

In [442]:
df_books = tags_to_cols(df_books, col_name='tags', filter_list=list(THEMES.keys()), filter_force_min=24)

In [448]:
df_books.to_json('../output/final/data-books.json',lines=True,orient='records')
df_comm.to_json('../output/final/data-comm.json',lines=True,orient='records')

# Simple Model

In [501]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class BookReco:
    def __init__(self):
        self.data = None
        self.vectors = []
        self.scalars = []
        self.predict_scores = None
        self.weights = {}

    def add_vector(self, col_prefix, weight=1):
        self.weights = {**self.weights, col_prefix: weight}
        self.vectors.append({'col_name': col_prefix})

    def add_scalar(self, col_name, weight=1):
        self.weights = {**self.weights, col_name: weight}
        self.scalars.append({'col_name': col_name})

    def __get_cosine_similarity(self, prefix):
        data_temp = self.data.filter(regex=f'^{prefix}',axis=1)
        data_temp = data_temp.fillna(0)
        vec = MinMaxScaler().fit_transform(data_temp)
        return cosine_similarity(vec)

    def fit(self, data):
        self.data = data

        for i,vector in enumerate(self.vectors):
            feats_cs = self.__get_cosine_similarity(vector['col_name'])
            self.vectors[i] = {**vector, 'cosine_similar': feats_cs}

        for i,scalar in enumerate(self.scalars):
            X = self.data.loc[:,[scalar['col_name']]]
            X = MinMaxScaler().fit_transform(X)
            self.scalars[i] = {**scalar, 'scaled': X.reshape(-1)}

    # def __get_cosine_similarity_final(self,scores):
    #     X = [list(score) for score in scores]
    #     X = MinMaxScaler().fit_transform(X)
    #     cs = cosine_similarity(np.array(X).T)
    #     return MinMaxScaler().fit_transform(cs)

    def set_weight(self, weights:dict):
        self.weights = weights

    def predict(self, book_id):
        scores = []

        try:
            index_book = self.data.query('book_id == @book_id').index.values.astype(int)[0]
        except:
            print(f"Can't find book_id: {book_id} in the dataset")
            return None
        
        # get all scores, apply weight
        weight_sum = 0 # to normalize at the end, like a mean
        for vector in self.vectors:
            score = vector['cosine_similar'][index_book]
            weight = self.weights[vector['col_name']]
            weight_sum += weight
            scores.append(score*weight)
        
        for scalar in self.scalars:
            weight = self.weights[vector['col_name']]
            weight_sum += weight
            scores.append(scalar['scaled']*weight)

        # sum of all scores
        self.predict_scores = None

        for score in scores:
            if self.predict_scores is None:
                self.predict_scores = np.array(score)
            else: self.predict_scores += np.array(score)
        
        # todo : mettre scores dans un dataframe des scores avec une Serie avec index book_id + faire l'ordre inverse, selection des n premier et minmaxscaler (bof la fin)

        # normalisation of sum of all scores
        #return MinMaxScaler().fit_transform(self.predict_scores.reshape(-1,1))
        return self.predict_scores / weight_sum

    def format_prediction(self, scores, max_books):
        scores = [(i,bi) for i,bi in enumerate(scores)]
        sorted_scores = sorted(scores, key=lambda x:x[1], reverse=True)
        sorted_scores = sorted_scores[1:max_books+1]
        return [(i, self.data.iloc[i,:]['book_id'],self.data.iloc[i,:]['title'],s) for i,s in sorted_scores]
    
    def format_tojson(self, scores, max_books):

        scores = [(i,bi) for i,bi in enumerate(scores)]
        sorted_scores = sorted(scores, key=lambda x:x[1], reverse=True)
        sorted_scores = sorted_scores[1:max_books+1]

        output = []
        num = 0
        for i,s in sorted_scores:
            book = self.data.iloc[i,:]
            output.append({
                'title' : book['title'],
                'url' : book['book_url'],
                'image' : book['img_url'],
                'author' : book['surname']+' '+book['name'],
                'author_url' : book['book_author_url'],
                'score' : str(s),
            })
            if num == max_books:
                break
            num += 1
        return output


In [502]:
br = BookReco()
br.add_vector('tag_', weight=2)
br.add_vector('sen_', weight=10)
br.add_scalar('book_rating_value', weight=0)
br.add_scalar('book_nb_comm', weight=0)
br.fit(df_books)

In [500]:
scores = br.predict(1829) # De Cape et de Crocs, tome 2 : Pavillon noir !
br.set_weight({'tag_': 0.5, 'sen_': 10, 'book_rating_value': 0, 'book_nb_comm': 0})
br.format_tojson(scores, max_books=5)

{'tag_': 2, 'sen_': 10, 'book_rating_value': 0, 'book_nb_comm': 0}


[{'title': 'Harry Potter, tome 7 : Harry Potter et les reliques de la mort',
  'url': 'https://www.babelio.com/livres/Rowling-Harry-Potter-tome-7--Harry-Potter-et-les-reliques/2109',
  'image': '/couv/CVT_10230_671213.jpg',
  'author': 'Rowling J. K.',
  'author_url': '/auteur/J-K-Rowling/54013'},
 {'title': 'Jane Eyre',
  'url': 'https://www.babelio.com/livres/Bront-Jane-Eyre/5235',
  'image': 'https://images-na.ssl-images-amazon.com/images/I/419DmjFmswL._SX210_.jpg',
  'author': 'Brontë Charlotte',
  'author_url': '/auteur/Charlotte-Bront/5020'},
 {'title': 'De Cape et de Crocs, tome 2 : Pavillon noir !',
  'url': 'https://www.babelio.com/livres/Ayroles-De-Cape-et-de-Crocs-tome-2--Pavillon-noir-/1829',
  'image': '/couv/CVT_CVT_De-Cape-et-de-Crocs-tome-2--Pavillon-noir-_636.jpg',
  'author': 'Ayroles Alain',
  'author_url': '/auteur/Alain-Ayroles/2287'},
 {'title': 'La Horde du Contrevent',
  'url': 'https://www.babelio.com/livres/Damasio-La-Horde-du-Contrevent/5420',
  'image': '/co

In [508]:
import pickle
import dill

def package_model():
    import numpy as np
    with open('data/model-reco.obj', "wb") as f:
        dill.dump(br, f)

# with open('data/model-reco.obj', 'wb') as f:
#     pickle.dump(br, f, pickle.HIGHEST_PROTOCOL)