In [217]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.feature_extraction.text import CountVectorizer
import ast
import warnings

warnings.filterwarnings('ignore') 

# Theme slug -> integer value.
# In the visible notebook only the KEYS are used: they are passed as the
# `filter_list` whitelist to tags_to_cols(). The integer values are not read
# anywhere in this section.
# NOTE(review): the values look like site-specific theme identifiers — confirm.
THEMES = {
 'amitie': 373,
 'amour': 77,
 'autobiographie': 45,
 'aventure': 33,
 'bande-dessinee': 18,
 'biographie': 31,
 'cinema': 89,
 'classique': 28,
 'comedie-romantique': 3788,
 'comics': 142,
 'drogue': 863,
 'dystopie': 879,
 'emotion': 4100,
 'enquetes': 3988,
 'entretiens': 708,
 'essai': 13,
 'famille': 290,
 'fantastique': 7,
 'fantasy': 4,
 'geographie': 187,
 'guerre': 91,
 'humour': 15,
 'humour-noir': 621,
 'jeunesse': 14,
 'journalisme': 975,
 'litterature-americaine': 9,
 'litterature-asiatique': 1064,
 'litterature-francaise': 3,
 'manga': 12,
 'musique': 44,
 'nouvelles': 23,
 'peur': 430,
 'poesie': 25,
 'politique': 54,
 'psychologie': 65,
 'racisme': 906,
 'recit-de-voyage': 448,
 'religion': 26,
 'reportage': 3488,
 'reseaux-sociaux': 26109,
 'roman': 1,
 'roman-fantastique': 912,
 'roman-noir': 136,
 'romans-policiers-et-polars': 63883,
 'science-fiction': 6,
 'sentiments': 1770,
 'serie': 788,
 'theatre': 21,
 'thriller': 11,
 'thriller-psychologique': 1073,
 'tragedie': 601,
 'western': 533
}


# Merging files and datasets

In [218]:
def merge_data_files(books_file_list:list, books_meta_file=None, books_senti_file=None):
    """Load the book/comment files and join them with optional side files.

    Parameters
    ----------
    books_file_list : list
        Paths of JSON-lines files; they are stacked row-wise. Each file must
        contain a ``tags`` column, which is dropped after stacking.
    books_meta_file : str, optional
        Path of a JSON-lines metadata file. Its ``book_nb_comm``, ``title``,
        ``name``, ``surname``, ``img_url`` and ``book_date`` columns are dropped
        before it is inner-joined on ``book_id``.
    books_senti_file : str, optional
        Path of a JSON-lines file whose ``title`` column is dropped before it
        is inner-joined on ``book_id``.

    Returns
    -------
    pandas.DataFrame
        The merged frame with every missing value replaced by 0.

    Raises
    ------
    ValueError
        If ``books_file_list`` is empty.
    """
    if not books_file_list:
        raise ValueError("books_file_list must contain at least one file path")

    # Read every file first and concatenate once: concatenating inside the loop
    # re-copies the accumulated frame on each iteration.
    frames = [pd.read_json(filename, lines=True) for filename in books_file_list]
    # ignore_index avoids duplicated index labels when several files are stacked.
    df_books = pd.concat(frames, ignore_index=True)

    df_books = df_books.drop(['tags'], axis=1)

    # join to the meta data books file
    if books_meta_file is not None:
        df_meta = pd.read_json(books_meta_file, lines=True)
        df_meta = df_meta.drop(['book_nb_comm', 'title', 'name', 'surname', 'img_url', 'book_date'], axis=1)
        df_books = df_books.merge(df_meta, on='book_id', how='inner')

    # join to the second side file (called "senti" by the caller)
    if books_senti_file is not None:
        df_senti = pd.read_json(books_senti_file, lines=True)
        df_senti = df_senti.drop(['title'], axis=1)
        df_books = df_books.merge(df_senti, on='book_id', how='inner')

    # todo : genre du profil per user_id
    return df_books.fillna(0)

In [219]:
# Build df_comm: stack the two book/comment files, then inner-join the metadata
# file and the 'vecteurs_sentiments' file on book_id (see merge_data_files).
# Paths are relative to the notebook's working directory.
df_comm = merge_data_files(
    books_file_list = ['../output/books-julien.json','../output/books-rebecca.json'],
    books_meta_file = '../output/books-meta-data.json',
    books_senti_file = '../output/vecteurs_sentiments.json')

In [220]:
# Group-by keys used to collapse df_comm to one row per book.
# columns_senti: every df_comm column whose name starts with 'sen_'.
# columns_book: the fixed book-level columns followed by all 'sen_' columns.
columns_senti = [col for col in df_comm.columns if col.startswith('sen_')]
columns_book = ['book_id', 'book_url', 'book_nb_comm', 'title', 'name', 'surname',
       'tags', 'img_url', 'book_rating_count', 'book_rating_value',
       'book_author_url', 'book_editor', 'book_pages', *columns_senti]

def reduce_comm_to_books(df, columns=None):
    """Collapse a frame to one row per distinct combination of key columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : list of str, optional
        Key columns to group by and to keep in the result. When omitted, the
        module-level ``columns_book`` list is used.

    Returns
    -------
    pandas.DataFrame
        One row per group, containing only the key columns, with a fresh
        default index (``as_index=False``).
    """
    if columns is None:
        columns = columns_book
    key_columns = list(columns)
    # groupby does not mutate its input, so no defensive copy is required.
    # count() is only used to materialise one row per group; the counts
    # themselves are discarded by the column selection.
    return df.groupby(key_columns, as_index=False).count().loc[:, key_columns]

In [221]:
df_books = reduce_comm_to_books(df_comm)

We now have two datasets, `df_comm` and `df_books`, at two different aggregation levels (comment level and book level).

# Preprocessing df_books

In [222]:
# Keep only the tags whose name is in filter_list, or whose value is at least
# filter_force_min. This removes rare, unimportant tags.
def tags_to_cols(df, col_name, filter_list=None, filter_force_min=24):
    """Expand a column of ``[name, weight]`` tag pairs into one ``tag_*`` column per tag.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Column holding the tags, either as the string repr of a list of
        ``[name, weight]`` pairs or as an already-parsed list/tuple. Any other
        cell value (for example the 0 left by a previous ``fillna(0)``) is
        treated as "no tags".
    filter_list : iterable of str, optional
        Tag names that are always kept. When None, every tag is kept.
    filter_force_min : number
        A tag that is not in ``filter_list`` is still kept when its weight is
        greater than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the same index plus one float column per kept tag,
        named ``'tag_' + name`` (stripped, spaces replaced by ``_``,
        lower-cased). Every missing value in the whole frame is replaced by 0.
    """
    allowed = None if filter_list is None else set(filter_list)

    # One {column_name: weight} dict per input row, in row order.
    records = []
    for raw_tags in df[col_name]:
        if isinstance(raw_tags, str):
            tags = ast.literal_eval(raw_tags)
        elif isinstance(raw_tags, (list, tuple)):
            tags = raw_tags
        else:
            tags = []

        record = {}
        for tag in tags:
            name = tag[0].strip()
            if allowed is not None and name not in allowed and tag[1] < filter_force_min:
                continue
            record['tag_' + name.replace(' ', '_').lower()] = tag[1]
        records.append(record)

    # Build all tag columns at once instead of inserting them cell by cell.
    # Weights are expected to be numeric.
    tag_frame = pd.DataFrame(records).astype(float)

    # Work positionally so that duplicated index labels cannot mix rows up;
    # the caller's index is restored at the end.
    result = df.reset_index(drop=True)

    # A tag column that already exists is updated in place: only the rows that
    # carry the tag are overwritten.
    overlapping = [col for col in tag_frame.columns if col in result.columns]
    for col in overlapping:
        result[col] = tag_frame[col].where(tag_frame[col].notna(), result[col])

    result = pd.concat([result, tag_frame.drop(columns=overlapping)], axis=1)
    result.index = df.index
    return result.fillna(0)

In [223]:
df_books = tags_to_cols(df_books, col_name='tags', filter_list=list(THEMES.keys()), filter_force_min=24)

In [224]:
df_books

Unnamed: 0,book_id,book_url,book_nb_comm,title,name,surname,tags,img_url,book_rating_count,book_rating_value,...,tag_roman_graphique,tag_lgbtq+,tag_légendes,tag_space-opera,tag_résilience,tag_vaudou,tag_vieillesse,tag_sicile_(italie),tag_héritage,tag_identité
0,1578,https://www.babelio.com/livres/Loisel-La-Quete...,42,"La Quête de l'oiseau du temps, tome 1 : La con...",Régis,Loisel,"[['aventure', 17], ['serie', 15], ['saga', 15]...",https://images-na.ssl-images-amazon.com/images...,628,4.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1700,https://www.babelio.com/livres/Beaumarchais-Le...,85,Le Mariage de Figaro,,Beaumarchais,"[['satire', 15], ['comédie', 19], ['théâtre cl...",https://images-na.ssl-images-amazon.com/images...,1823,3.65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1725,https://www.babelio.com/livres/Buzzati-Le-dese...,205,Le désert des Tartares,Dino,Buzzati,"[['roman', 24], ['fantastique', 17], ['classiq...",https://m.media-amazon.com/images/I/41CPam0DPt...,2435,4.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1829,https://www.babelio.com/livres/Ayroles-De-Cape...,34,"De Cape et de Crocs, tome 2 : Pavillon noir !",Alain,Ayroles,"[['aventure', 19], [""de capes et d'épées"", 18]...",/couv/CVT_CVT_De-Cape-et-de-Crocs-tome-2--Pavi...,429,4.30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2109,https://www.babelio.com/livres/Rowling-Harry-P...,601,"Harry Potter, tome 7 : Harry Potter et les rel...",J. K.,Rowling,"[['aventure', 18], ['saga', 17], ['roman', 21]...",/couv/CVT_10230_671213.jpg,15883,4.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903,1452627,https://www.babelio.com/livres/-Jamais-plus/14...,399,Jamais plus,Colleen,Hoover,"[['journal intime', 14], ['roman', 19], ['jeun...",https://images-eu.ssl-images-amazon.com/images...,2113,4.38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
904,1453053,https://www.babelio.com/livres/-Grace-and-Fury...,64,"Grace and Fury, tome 1 : Fleurs de fer",Tracy,Banghart,"[['saga', 14], ['aventure', 19], ['roman', 14]...",https://m.media-amazon.com/images/I/516CV-xP6p...,104,4.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
905,1453136,https://www.babelio.com/livres/-Vers-les-etoil...,64,Vers les étoiles,Mary Robinette,Kowal,"[['uchronie', 21], ['roman', 15], ['dystopie',...",https://m.media-amazon.com/images/I/41JI86XiUj...,168,3.83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
906,1455462,https://www.babelio.com/livres/-Les-gardiens-d...,62,"Les gardiens d'Ülgeriin, tome 1 : Rose éternelle",Ophélie,Duchemin,"[['saga', 14], ['fantasy', 26], ['fantastique'...",/couv/CVT_Rose-eternelle_6800.jpg,106,4.35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Simple Model

In [331]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class BookReco:
    """Content-based book recommender.

    The score of every book relative to a query book is the weighted sum of:

    * one cosine-similarity row per registered *vector* (a group of columns
      sharing a name prefix, min-max scaled before the similarity is computed);
    * one min-max scaled column per registered *scalar*.

    The summed score is min-max normalised by ``predict``.

    Attributes are plain instance fields:
    ``data`` (frame given to ``fit``), ``vectors`` / ``scalars`` (registered
    features), ``predict_scores`` (result of the last successful ``predict``)
    and ``predict_index`` (row position of the book queried by that call).
    """

    def __init__(self):
        self.data = None
        self.vectors = []
        self.scalars = []
        self.predict_scores = None
        # Row position (not index label) of the last successfully queried book.
        self.predict_index = None

    def add_vector(self, col_prefix, weight=1):
        """Register the group of columns whose names start with ``col_prefix``."""
        self.vectors.append({'col_name': col_prefix, 'weight': weight})

    def add_scalar(self, col_name, weight=1):
        """Register a single numeric column as a scalar feature."""
        self.scalars.append({'col_name': col_name, 'weight': weight})

    def __get_cosine_similarity(self, prefix):
        """Return the row-by-row cosine similarity of the ``prefix`` columns.

        ``prefix`` is inserted as-is into a regular expression anchored at the
        start of the column name.
        """
        data_temp = self.data.filter(regex=f'^{prefix}', axis=1).fillna(0)
        vec = MinMaxScaler().fit_transform(data_temp)
        return cosine_similarity(vec)

    def fit(self, data):
        """Pre-compute the similarity matrices and scaled scalars for ``data``.

        ``data`` must contain a ``book_id`` column, a ``title`` column and every
        column referenced by the registered vectors and scalars.
        """
        self.data = data

        for i, vector in enumerate(self.vectors):
            feats_cs = self.__get_cosine_similarity(vector['col_name'])
            self.vectors[i] = {**vector, 'cosine_similar': feats_cs}

        for i, scalar in enumerate(self.scalars):
            X = self.data.loc[:, [scalar['col_name']]]
            X = MinMaxScaler().fit_transform(X)
            self.scalars[i] = {**scalar, 'scaled': X.reshape(-1)}

    def predict(self, book_id):
        """Score every book of the fitted data against ``book_id``.

        Returns
        -------
        numpy.ndarray or None
            Array of shape ``(n_books, 1)`` with scores min-max normalised,
            or None (after printing a message) when ``book_id`` is not in the
            data.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If no vector and no scalar has been registered.
        """
        if self.data is None:
            raise RuntimeError("BookReco.predict() called before fit()")

        # The similarity matrices are positional, so look up the row POSITION
        # of the book rather than its index label.
        positions = np.flatnonzero(self.data['book_id'].to_numpy() == book_id)
        if positions.size == 0:
            print(f"Can't find book_id: {book_id} in the dataset")
            return None
        index_book = int(positions[0])

        # get all scores, apply weight
        scores = [vector['cosine_similar'][index_book] * vector['weight'] for vector in self.vectors]
        scores += [scalar['scaled'] * scalar['weight'] for scalar in self.scalars]
        if not scores:
            raise ValueError("no vector or scalar registered: nothing to score")

        # sum of all scores (np.array copies, so the cached matrices are untouched)
        total = np.array(scores[0])
        for score in scores[1:]:
            total = total + np.array(score)

        # normalisation of sum of all scores
        self.predict_scores = MinMaxScaler().fit_transform(total.reshape(-1, 1))
        self.predict_index = index_book
        return self.predict_scores

    def format_prediction(self, scores, max_books):
        """Return the ``max_books`` best-scored books as readable tuples.

        Parameters
        ----------
        scores : array-like
            Scores as returned by ``predict`` (any shape that flattens to one
            value per row of the fitted data).
        max_books : int
            Maximum number of recommendations to return.

        Returns
        -------
        list of tuple
            ``(row_position, book_id, title, score)`` sorted by decreasing
            score. The book queried by the last successful ``predict`` call is
            excluded; when no such call was made, the top-ranked row is dropped
            instead.
        """
        flat = np.asarray(scores, dtype=float).reshape(-1)
        # sorted() is stable, so equal scores keep their original row order.
        ranked = sorted(range(len(flat)), key=lambda i: flat[i], reverse=True)

        if self.predict_index is not None and self.predict_index < len(flat):
            # Exclude the queried book itself, wherever it ranks.
            ranked = [i for i in ranked if i != self.predict_index]
        else:
            ranked = ranked[1:]
        ranked = ranked[:max_books]

        return [
            (i, self.data.iloc[i]['book_id'], self.data.iloc[i]['title'], float(flat[i]))
            for i in ranked
        ]

In [337]:
# Configure the recommender: two column groups compared by cosine similarity
# ('tag_*' weighted 10, 'sen_*' weighted 2) and two scalar columns.
# The scalars are registered with weight=0: predict() multiplies each scaled
# column by its weight, so they currently contribute nothing to the score.
br = BookReco()
br.add_vector('tag_', weight=10)
br.add_vector('sen_', weight=2)
br.add_scalar('book_rating_value', weight=0)
br.add_scalar('book_nb_comm', weight=0)
br.fit(df_books)

In [338]:
# Score every book against book_id 1829, then show the 5 best matches as
# (row position, book_id, title, score) tuples.
scores = br.predict(1829) # De Cape et de Crocs, tome 2 : Pavillon noir !
br.format_prediction(scores, max_books=5)

[(449, 790196, 'Sykes', 0.9999977721706684),
 (605,
  1022303,
  'Blackwing, tome 1 : La marque du corbeau',
  0.9999954102836703),
 (191,
  142810,
  "Freaks' Squeele, Tome 1 : Etrange université",
  0.9999920404444045),
 (47,
  5621,
  'Les Légendaires, tome 1 : La Pierre de Jovenia',
  0.9999905235773728),
 (30, 4380, 'XIII, Tome 5 : Rouge Total', 0.9999874753615673)]