In [390]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.feature_extraction.text import CountVectorizer
import ast

THEMES = {
 'amitie': 373,
 'amour': 77,
 'autobiographie': 45,
 'aventure': 33,
 'bande-dessinee': 18,
 'biographie': 31,
 'cinema': 89,
 'classique': 28,
 'comedie-romantique': 3788,
 'comics': 142,
 'drogue': 863,
 'dystopie': 879,
 'emotion': 4100,
 'enquetes': 3988,
 'entretiens': 708,
 'essai': 13,
 'famille': 290,
 'fantastique': 7,
 'fantasy': 4,
 'geographie': 187,
 'guerre': 91,
 'humour': 15,
 'humour-noir': 621,
 'jeunesse': 14,
 'journalisme': 975,
 'litterature-americaine': 9,
 'litterature-asiatique': 1064,
 'litterature-francaise': 3,
 'manga': 12,
 'musique': 44,
 'nouvelles': 23,
 'peur': 430,
 'poesie': 25,
 'politique': 54,
 'psychologie': 65,
 'racisme': 906,
 'recit-de-voyage': 448,
 'religion': 26,
 'reportage': 3488,
 'reseaux-sociaux': 26109,
 'roman': 1,
 'roman-fantastique': 912,
 'roman-noir': 136,
 'romans-policiers-et-polars': 63883,
 'science-fiction': 6,
 'sentiments': 1770,
 'serie': 788,
 'theatre': 21,
 'thriller': 11,
 'thriller-psychologique': 1073,
 'tragedie': 601,
 'western': 533
}

# todo : https://www.doczamora.com/content-based-recommender-system-for-movies-with-tensorflow

In [109]:
# df = pd.read_json('/Users/juliendefaut/Dev/Books-recommendations/output/list-books.json', lines=True)
# df = df.drop_duplicates(subset = "book_id", keep = 'first')

# df.to_json("/Users/juliendefaut/Dev/Books-recommendations/output/list-books-noduplicate.json", orient='records', lines=True)


In [350]:
df = pd.read_json('../output/books-julien.json', lines=True)
df_meta = pd.read_json('../output/books-meta-data.json', lines=True)

In [387]:
def merge_data_files(books_file_list:list, books_meta_file=None):
    df_books = None

    # fusion of mains books data files with comments
    for filename in books_file_list:
        df_books_temp = pd.read_json(filename, lines=True)
        if df_books is None:
            df_books = df_books_temp
        else: df_books = pd.concat([df_books, df_books_temp])

    df_books = df_books.drop(['tags'],axis=1)

    # join to the meta data books file
    df_meta = pd.read_json(books_meta_file, lines=True)
    df_meta = df_meta.drop(['book_nb_comm', 'title', 'name', 'surname','img_url','book_date'],axis=1)
    df_books = df_books.merge(df_meta, on='book_id', how='inner')

    # todo : genre du profil per user_id
    return df_books

In [396]:
df_comm = merge_data_files(['../output/books-julien.json','../output/books-rebecca.json'],'../output/books-meta-data.json')

In [391]:
def tags_to_cols(df, col_name, themes):
    df1 = df.copy()
    for index,row in df.iterrows():
        tags_as_string = row[col_name]
        tags = ast.literal_eval(tags_as_string)

        for tag in tags:
            if not tag in list(themes.keys()):
                continue
            tag_name = 'tag_'+tag[0].strip().replace(' ','_').lower()
            df1.loc[df1.index == index, tag_name] = tag[1]
    
    return df1

In [394]:
# group by per book
def reduce_comm_to_books(df):
    return df.copy().groupby('book_id').count()

In [397]:
df_books = reduce_comm_to_books(df_comm)
df_books

Unnamed: 0_level_0,book_nb_comm,title,name,surname,img_url,comm_id,user_id,note,date,appreciations,commentaire,book_url,tags,book_rating_count,book_rating_value,book_author_url,book_editor,book_pages
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1497,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70
1499,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70
1508,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57
1523,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70
1526,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1462096,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38
1462133,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70
1466000,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70
1467600,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37


In [368]:
df_final.shape

(317434, 13)

In [361]:
vals = df['user_id'].value_counts()
(vals > 10).sum()

3457

In [None]:
vals[]

In [355]:
import plotly.express as px
px.histogram(x=df['user_id'].value_counts())

In [357]:
px.box(x=df['user_id'].value_counts())

In [351]:
df.head()

Unnamed: 0,book_id,book_nb_comm,title,name,surname,tags,img_url,comm_id,user_id,note,date,appreciations,commentaire
0,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,"roman,romans policiers et polars,thriller,colo...",/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2080694,706958,4.0,30 novembre 2019,85,Roadtrip désertique dans l'Ouest Américain en ...
1,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,"roman,romans policiers et polars,thriller,colo...",/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,1271915,319761,5.0,03 mars 2017,68,"De temps en temps, j'aime bien revenir vers l'..."
2,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,"roman,romans policiers et polars,thriller,colo...",/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,1093854,128814,5.0,27 juin 2016,67,"Depuis peu j'ai découvert grâce à "" Cardabelle..."
3,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,"roman,romans policiers et polars,thriller,colo...",/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2711430,227017,4.5,22 juillet 2021,64,"Ils sont quatre. \nUne blonde , jeunette, venu..."
4,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,"roman,romans policiers et polars,thriller,colo...",/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2685514,138438,4.5,26 juin 2021,64,Avez-vous entendu cette info effrayante ? « La...


In [346]:
df = df.drop_duplicates(subset='book_id')

In [9]:
df.head()

Unnamed: 0,book_id,book_nb_comm,title,name,surname,tags,img_url,comm_id,user_id,note,date,appreciations,commentaire
0,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,"roman,romans policiers et polars,thriller,colo...",/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2080694,706958,4.0,30 novembre 2019,85,Roadtrip désertique dans l'Ouest Américain en ...
10,34397,37,Aucune bête aussi féroce,Edward,Bunker,"roman,autobiographie,romans policiers et polar...",/couv/CVT_cvt_Aucune-bete-aussi-feroce_5042.jpg,637101,16287,4.0,02 septembre 2014,65,"Le changement, c'est maintenant !\nC'est certa..."
107,927980,64,Celle qui voulait conduire le tram,Catherine,Cuenca,"roman,roman historique,historique,littérature ...",https://images-eu.ssl-images-amazon.com/images...,2019582,574706,4.5,22 septembre 2019,42,Un bon petit livre sur la situation des femmes...
117,869119,75,Les belles vies,Benoît,Minville,"roman d'apprentissage,roman,littérature jeunes...",/couv/CVT_Les-Belles-Vies_5033.jpg,1938765,5865,4.0,16 juin 2019,37,Vasco et Djib' habitent la banlieue parisienne...
241,1355375,38,Le Cercueil de Job,Lance,Weller,"roman étranger,roman,roman historique,histoire...",/couv/CVT_Le-cercueil-de-Job_9332.jpg,2775696,489894,5.0,24 septembre 2021,129,Rentrée littéraire 2021 # 28\nJe referme ce li...


In [347]:
df['combined'] = df['title'] + ' ' + df['name']+ ' ' + df['surname']
df['tag2'] = df['tags'].str.replace(","," ")

In [348]:
df = df.reset_index()

In [137]:
df.head()

Unnamed: 0,index,book_id,book_nb_comm,title,name,surname,tags,img_url,comm_id,user_id,note,date,appreciations,commentaire,combined,tag2
0,0,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,"roman,romans policiers et polars,thriller,colo...",/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2080694,706958,4.0,30 novembre 2019,85,Roadtrip désertique dans l'Ouest Américain en ...,"Le gang de la clef à molette (Ne meurs pas, ô ...",roman romans policiers et polars thriller colo...
1,10,34397,37,Aucune bête aussi féroce,Edward,Bunker,"roman,autobiographie,romans policiers et polar...",/couv/CVT_cvt_Aucune-bete-aussi-feroce_5042.jpg,637101,16287,4.0,02 septembre 2014,65,"Le changement, c'est maintenant !\nC'est certa...",Aucune bête aussi féroce Edward Bunker,roman autobiographie romans policiers et polar...
2,107,927980,64,Celle qui voulait conduire le tram,Catherine,Cuenca,"roman,roman historique,historique,littérature ...",https://images-eu.ssl-images-amazon.com/images...,2019582,574706,4.5,22 septembre 2019,42,Un bon petit livre sur la situation des femmes...,Celle qui voulait conduire le tram Catherine C...,roman roman historique historique littérature ...
3,117,869119,75,Les belles vies,Benoît,Minville,"roman d'apprentissage,roman,littérature jeunes...",/couv/CVT_Les-Belles-Vies_5033.jpg,1938765,5865,4.0,16 juin 2019,37,Vasco et Djib' habitent la banlieue parisienne...,Les belles vies Benoît Minville,roman d'apprentissage roman littérature jeunes...
4,241,1355375,38,Le Cercueil de Job,Lance,Weller,"roman étranger,roman,roman historique,histoire...",/couv/CVT_Le-cercueil-de-Job_9332.jpg,2775696,489894,5.0,24 septembre 2021,129,Rentrée littéraire 2021 # 28\nJe referme ce li...,Le Cercueil de Job Lance Weller,roman étranger roman roman historique histoire...


In [13]:
df.query('book_id == 885171')

Unnamed: 0,index,book_id,book_nb_comm,title,name,surname,tags,img_url,comm_id,user_id,note,date,appreciations,commentaire,combined,tag2
759,42647,885171,57,"La part des ombres, tome 1",Gabriel,Katz,"saga,roman,fantastique,fantasy,littérature jeu...",https://images-eu.ssl-images-amazon.com/images...,1600968,445500,4.0,26 avril 2018,73,Le lecteur a plaisir à retrouver dans ce roman...,"La part des ombres, tome 1 Gabriel Katz",saga roman fantastique fantasy littérature jeu...


In [349]:
df['note'] = pd.to_numeric(df['note'])

In [344]:
df.shape

(2802, 16)

In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2802 entries, 0 to 2801
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          2802 non-null   int64  
 1   book_id        2802 non-null   int64  
 2   book_nb_comm   2802 non-null   int64  
 3   title          2802 non-null   object 
 4   name           2802 non-null   object 
 5   surname        2802 non-null   object 
 6   tags           2802 non-null   object 
 7   img_url        2802 non-null   object 
 8   comm_id        2802 non-null   int64  
 9   user_id        2802 non-null   int64  
 10  note           2759 non-null   float64
 11  date           2802 non-null   object 
 12  appreciations  2802 non-null   int64  
 13  commentaire    2802 non-null   object 
 14  combined       2802 non-null   object 
 15  tag2           2802 non-null   object 
dtypes: float64(1), int64(6), object(9)
memory usage: 350.4+ KB


In [334]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class BookReco:
    def __init__(self):
        self.data = None
        self.features = []
        self.scalars = []
        self.predict_scores = None

    def add_feature(self, col_name, weight=1, separator=' '):
        cv = CountVectorizer(tokenizer=lambda x: x.split(separator))
        self.features.append({'col_name': col_name, 'weight': weight, 'vector': cv})

    def add_scalar(self, col_name, weight=1):
        self.scalars.append({'col_name': col_name, 'weight': weight})

    def fit(self, data):
        self.data = data

        for i,feature in enumerate(self.features):
            cm = feature['vector'].fit_transform(self.data[feature['col_name']])
            cs = cosine_similarity(cm)
            self.features[i] = {**feature, 'cosine_similar': cs}

        for i,scalar in enumerate(self.scalars):
            X = self.data.loc[:,[scalar['col_name']]]
            X = MinMaxScaler().fit_transform(X)
            self.scalars[i] = {**scalar, 'scaled': X.reshape(-1)}

    def predict(self, book_id):
        scores = []
        index_book = self.data.query('book_id == @book_id').index.values.astype(int)[0]
        
        # get all scores, apply weight
        for feature in self.features:
            score = feature['cosine_similar'][index_book]
            scores.append(score*feature['weight'])
        
        for scalar in self.scalars:
            scores.append(scalar['scaled']*scalar['weight'])

        # sum of all scores
        self.predict_scores = None
        for score in scores:
            if self.predict_scores is None:
                self.predict_scores = np.array(score)
            else: self.predict_scores += np.array(score)

        # normalisation of sum of all scores
        self.predict_scores = MinMaxScaler().fit_transform(self.predict_scores.reshape(-1,1))
        return self.predict_scores

    def format_prediction(self, scores, max_books):
        scores = [(i,bi) for i,bi in enumerate(scores)]
        sorted_scores = sorted(scores, key=lambda x:x[1], reverse=True)
        sorted_scores = sorted_scores[1:max_books+1]
        return [(i, self.data.iloc[i,:]['book_id'],self.data.iloc[i,:]['title'],s) for i,s in sorted_scores]

In [342]:
br = BookReco()
br.add_feature('tags', weight=3, separator=',')
br.add_feature('sentiments', weight=10, separator=',')
br.add_feature('users', weight=0, separator=',')
br.add_scalar('note', weight=2)
br.add_scalar('appreciations', weight=2)
br.add_scalar('book_nb_comm', weight=0.1)
br.fit(df)

In [343]:
scores = br.predict(885171) # La part des ombres, tome 1
br.format_prediction(scores, max_books=5)

[(2123, 5608, 'Des fleurs pour Algernon', array([0.43884351])),
 (2103, 7668, "L'ami retrouvé", array([0.41056662])),
 (2305, 675578, 'Vernon Subutex, tome 1', array([0.35406303])),
 (1404, 8529, 'Croc-Blanc', array([0.34502479])),
 (1413, 343802, 'Juste une ombre', array([0.30120482]))]

In [108]:
br.predict(1316140)

[[(1366, 1201119, 'Wyld, tome 2 : Rose de Sang'),
  (35, 23516, 'Tara Duncan, Tome 3 : Le Sceptre Maudit'),
  (140, 243528, "A comme Association, Tome 5 : Là où les mots n'existent pas"),
  (550, 761910, "Aeternia, Tome 2 : L'envers du monde"),
  (1020, 7718, 'Le livre des étoiles, tome 1 : Qadehar le Sorcier')]]

In [58]:
cv = CountVectorizer(tokenizer=lambda x: x.split(','))
cm = cv.fit_transform(df['note'])

In [59]:
cm

<2802x11 sparse matrix of type '<class 'numpy.int64'>'
	with 2802 stored elements in Compressed Sparse Row format>

In [60]:
cv.vocabulary_

{'4.0': 8,
 '4.5': 9,
 '5.0': 10,
 '3.5': 7,
 '3.0': 6,
 '': 0,
 '1.0': 2,
 '2.5': 5,
 '0.5': 1,
 '2.0': 4,
 '1.5': 3}

In [45]:
cm

<2802x2800 sparse matrix of type '<class 'numpy.int64'>'
	with 2357 stored elements in Compressed Sparse Row format>

In [61]:
cs = cosine_similarity(cm)

In [62]:
cs

array([[1., 1., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [63]:
def reco_book(book_id, max_book=5):
    index_book = df.query('book_id == @book_id').index.values.astype(int)[0]
    cs_book = cs[index_book]
    scores = [(i,bi) for i,bi in enumerate(cs_book)]
    sorted_scores = sorted(scores, key=lambda x:x[1], reverse=True)
    sorted_scores = sorted_scores[1:max_book+1]
    reco = [(i, df.iloc[i,:]['book_id'],df.iloc[i,:]['title']) for i,s in sorted_scores]
    return reco


In [107]:
reco_book(927980) # Celle qui voulait conduire le tram

[(10, 877839, 'Et tu trouveras le trésor qui dort en toi'),
 (16, 946624, 'Effroyable porcelaine'),
 (24, 1212813, 'Eleonor & Grey'),
 (63, 486263, 'Cruelles'),
 (64, 1193525, 'Aldobrando')]

In [105]:
reco_book(885171) # La part des ombres, tome 1

[(1, 34397, 'Aucune bête aussi féroce'),
 (3, 869119, 'Les belles vies'),
 (8, 466046, 'Bacha posh'),
 (13, 1022159, 'Le jøurnal de ma disparitiøn'),
 (18, 614802, 'Les Grands')]