In [423]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.feature_extraction.text import CountVectorizer
import ast
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and performance
# warnings raised by the libraries used below are hidden as well.
warnings.filterwarnings('ignore') 

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 100) # passing None instead would mean unlimited.

# Theme (tag) names mapped to an integer.
# In the code visible in this notebook only the keys are read: they are passed
# as the tag allow-list to tags_to_cols().
# NOTE(review): the integer values are presumably identifiers of the themes on
# the source site - confirm before relying on them.
THEMES = {
 'amitie': 373,
 'amour': 77,
 'autobiographie': 45,
 'aventure': 33,
 'bande-dessinee': 18,
 'biographie': 31,
 'cinema': 89,
 'classique': 28,
 'comedie-romantique': 3788,
 'comics': 142,
 'drogue': 863,
 'dystopie': 879,
 'emotion': 4100,
 'enquetes': 3988,
 'entretiens': 708,
 'essai': 13,
 'famille': 290,
 'fantastique': 7,
 'fantasy': 4,
 'geographie': 187,
 'guerre': 91,
 'humour': 15,
 'humour-noir': 621,
 'jeunesse': 14,
 'journalisme': 975,
 'litterature-americaine': 9,
 'litterature-asiatique': 1064,
 'litterature-francaise': 3,
 'manga': 12,
 'musique': 44,
 'nouvelles': 23,
 'peur': 430,
 'poesie': 25,
 'politique': 54,
 'psychologie': 65,
 'racisme': 906,
 'recit-de-voyage': 448,
 'religion': 26,
 'reportage': 3488,
 'reseaux-sociaux': 26109,
 'roman': 1,
 'roman-fantastique': 912,
 'roman-noir': 136,
 'romans-policiers-et-polars': 63883,
 'science-fiction': 6,
 'sentiments': 1770,
 'serie': 788,
 'theatre': 21,
 'thriller': 11,
 'thriller-psychologique': 1073,
 'tragedie': 601,
 'western': 533
}



# Merging files and datasets

In [435]:
def merge_data_files(books_file_list:list, books_meta_file=None, books_senti_file=None, books_users_file=None):
    """Load the comment files and join the optional user, meta and sentiment files.

    Every file is read as JSON Lines (``pd.read_json(..., lines=True)``).

    Args:
        books_file_list: Paths of the main comment files; they are stacked
            row-wise. Must contain at least one path. Each file must have a
            ``tags`` column, which is dropped before any join.
        books_meta_file: Optional per-book meta data, inner-joined on
            ``book_id`` after its ``book_nb_comm``, ``title``, ``name``,
            ``surname``, ``img_url`` and ``book_date`` columns are dropped.
            Books missing from this file are therefore removed.
        books_senti_file: Optional per-book sentiment file, left-joined on
            ``book_id`` after its ``title`` column is dropped.
        books_users_file: Optional per-user file, left-joined on ``user_id``.

    Returns:
        The merged DataFrame. Values still missing after the user join are
        filled with ``''``; values still missing at the end are filled with 0.

    Raises:
        ValueError: If ``books_file_list`` is empty.
    """
    # Read everything first and concatenate once instead of growing the frame
    # inside the loop (which re-copies all previous rows at every step).
    frames = [pd.read_json(filename, lines=True) for filename in books_file_list]
    if not frames:
        raise ValueError("books_file_list must contain at least one file path")
    df_books = pd.concat(frames)

    df_books = df_books.drop(['tags'],axis=1)

    # Per-user columns, keyed by user_id.
    if books_users_file is not None:
        df_users = pd.read_json(books_users_file, lines=True)
        df_books = df_books.merge(df_users, on='user_id', how='left')
    # Missing values at this stage are replaced by an empty string.
    df_books = df_books.fillna('')

    # Join the per-book meta data; the dropped columns already exist in the
    # comment files, so keeping them would create duplicated columns.
    if books_meta_file is not None:
        df_meta = pd.read_json(books_meta_file, lines=True)
        df_meta = df_meta.drop(['book_nb_comm', 'title', 'name', 'surname','img_url','book_date'],axis=1)
        df_books = df_books.merge(df_meta, on='book_id', how='inner')

    # Join the per-book sentiment file.
    if books_senti_file is not None:
        df_senti = pd.read_json(books_senti_file, lines=True)
        df_senti = df_senti.drop(['title'],axis=1)
        df_books = df_books.merge(df_senti, on='book_id', how='left')
    # Books without a sentiment row (or any other remaining gap) get 0.
    df_books = df_books.fillna(0)

    return df_books

In [437]:
# Build the comment-level dataset from the two comment files, joined with the
# meta data, sentiment and user files. Paths are relative to the notebook's
# working directory.
df_comm = merge_data_files(
    books_file_list = ['../output/books-julien.json','../output/books-rebecca.json'],
    books_meta_file = '../output/books-meta-data.json',
    books_senti_file = '../output/vecteurs_sentiments.json',
    books_users_file = '../output/users-data.json'
    )

In [438]:
# df_comm['year'] = df_comm['book_date'].str.extract(r'\b(\d{4})\b').replace('1900','').replace('3889',)
# df_comm['year'].unique()

In [512]:
# NOTE(review): this cell reads `df_books`, which is only assigned further down
# (df_books = reduce_comm_to_books(df_comm)). Run top-to-bottom on a fresh
# kernel it raises NameError; it has to be executed after that later cell.
# Case-insensitive search of 'vautours' in the titles.
df_found = df_books.loc[df_books['title'].str.contains('vautours', case=False)].reset_index()
# Take the title of the first match; .loc[0, ...] raises KeyError when nothing matched.
title = df_found.loc[0,'title']
title

'Seuls les vautours'

In [439]:
# group by per book
columns_senti = [col for col in df_comm.columns if col.startswith('sen_')]
columns_book = ['book_id', 'book_url', 'book_nb_comm', 'title', 'name', 'surname',
       'tags', 'img_url', 'book_rating_count', 'book_rating_value',
       'book_author_url', 'book_editor', 'book_pages', *columns_senti]

def reduce_comm_to_books(df):
    """Collapse ``df`` to one row per distinct combination of ``columns_book``.

    The rows are grouped on the module-level ``columns_book`` list and only
    those key columns are returned; the per-group counts computed along the
    way are discarded.
    """
    grouped = df.copy().groupby(columns_book, as_index=False)
    counted = grouped.count()
    # Keep the grouping keys only.
    return counted.loc[:, columns_book]

In [440]:
# Book-level dataset: one row per distinct combination of the columns_book values found in df_comm.
df_books = reduce_comm_to_books(df_comm)

We now have two datasets, df_comm and df_books, at two different aggregation levels.

# Preprocessing df_books

In [441]:
# Keep a tag only when its name is in filter_list or its value is at least
# filter_force_min; this removes rare, unimportant tags.
def tags_to_cols(df, col_name, filter_list=None, filter_force_min=24):
    """Expand the stringified tag list of each row into ``tag_<name>`` columns.

    Args:
        df: Input DataFrame; it is not modified.
        col_name: Column holding, for each row, the string representation of a
            sequence of ``(tag name, value)`` pairs (parsed with
            ``ast.literal_eval``).
        filter_list: Optional collection of tag names that are always kept.
            When ``None`` every tag is kept.
        filter_force_min: A tag whose name is not in ``filter_list`` is kept
            only if its value is at least this threshold.

    Returns:
        A copy of ``df`` with one ``tag_<name>`` column per kept tag (name
        stripped, spaces replaced by ``_``, lower-cased) holding the tag value.
        Every missing value in the returned frame is replaced by 0.
    """
    df1 = df.copy()
    allowed = None if filter_list is None else set(filter_list)

    # column name -> {row position: value}. Values are stored by row *position*
    # so that rows sharing an index label never receive each other's tags.
    # Dicts keep insertion order, so new columns appear in first-seen order.
    collected = {}
    for position, tags_as_string in enumerate(df[col_name]):
        for tag in ast.literal_eval(tags_as_string):
            label = tag[0].strip()
            if allowed is not None and label not in allowed and tag[1] < filter_force_min:
                continue
            tag_name = 'tag_' + label.replace(' ', '_').lower()
            collected.setdefault(tag_name, {})[position] = tag[1]

    row_positions = range(len(df1))
    for tag_name, value_by_position in collected.items():
        if tag_name in df1.columns:
            # The column already exists (e.g. the cell was re-run on its own
            # output): overwrite only the rows that actually carry the tag.
            rows = list(value_by_position)
            df1.iloc[rows, df1.columns.get_loc(tag_name)] = list(value_by_position.values())
        else:
            # Rows without the tag are NaN here and become 0 below.
            df1[tag_name] = pd.Series(value_by_position).reindex(row_positions).to_numpy()

    return df1.fillna(0)

In [442]:
df_books = tags_to_cols(df_books, col_name='tags', filter_list=list(THEMES.keys()), filter_force_min=24)

In [448]:
# Persist both aggregation levels as JSON Lines (one record per line).
df_books.to_json('../output/final/data-books.json',lines=True,orient='records')
df_comm.to_json('../output/final/data-comm.json',lines=True,orient='records')

In [516]:
# Load the similarity matrix from CSV.
# NOTE(review): presumably a Jaccard similarity matrix between books (file and
# variable names suggest it) - confirm how jacsim.csv was produced.
df_jaccard = pd.read_csv('../output/jacsim.csv')
# The first CSV column has no header, so pandas names it 'Unnamed: 0'; rename it to book_id.
df_jaccard = df_jaccard.rename(columns = {'Unnamed: 0':'book_id'})

In [517]:
# Inspect the matrix. In the output recorded below only the diagonal and the
# cells under it hold values; the cells above the diagonal are NaN.
df_jaccard

Unnamed: 0,book_id,1379642,466996,123961,338279,1300939,1079315,832220,56385,676627,698482,1127856,650958,1390851,830641,717829,5742,1186100,1224013,395220,903004,494737,1261456,40426,1000388,1358497,944573,766685,965802,17544,1185527,934827,1231250,1035039,574645,431764,1295258,1310212,124380,987659,335559,6999,14022,799846,220453,45277,132908,1299126,797175,1385832,...,901109,7643,826777,808398,159116,556212,1192722,809728,1068833,1252905,766702,6532,1277512,201751,1241904,1158955,4397,23208,1270461,1316806,10374,2287,991495,1322555,1003722,1720,1282794,1353562,127445,886136,1308933,4696,916012,950395,534643,501077,1232499,6066,983150,357202,20799,812075,895066,22977,40020,1000176,794245,225627,1379634,81141
0,1379642,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,466996,0.791045,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,123961,0.742857,0.928571,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,338279,0.690141,0.859649,0.862069,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1300939,0.728571,0.877193,0.847458,0.844828,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1000176,0.728571,0.910714,0.879310,0.877193,0.862069,0.731343,0.883333,0.864407,0.830508,0.877193,0.822581,0.796875,0.877193,0.864407,0.852459,0.910714,0.704225,0.460177,0.838710,0.892857,0.827586,0.862069,0.896552,0.881356,0.833333,0.757576,0.847458,0.864407,0.803279,0.816667,0.913793,0.604938,0.722222,0.910714,0.866667,0.510417,0.504950,0.803279,0.838710,0.945455,0.833333,0.910714,0.864407,0.875000,0.875000,0.844828,0.671233,0.758621,0.695652,...,0.510000,0.910714,0.850000,0.896552,0.850000,0.862069,0.807018,0.896552,0.810345,0.847458,0.847458,0.912281,0.850000,0.857143,0.635135,0.879310,0.784615,0.815385,0.838710,0.657895,0.875000,0.877193,0.815385,0.770492,0.812500,0.866667,0.701493,0.428571,0.892857,0.877193,0.710145,0.833333,0.769231,0.809524,0.819672,0.753846,0.731343,0.821429,0.761194,0.864407,0.833333,0.881356,0.850000,0.862069,0.866667,1.000000,,,,
1496,794245,0.764706,0.962963,0.928571,0.859649,0.844828,0.796875,0.836066,0.912281,0.910714,0.962963,0.866667,0.838710,0.962963,0.912281,0.866667,0.962963,0.764706,0.477477,0.852459,0.944444,0.909091,0.844828,0.912281,0.896552,0.912281,0.769231,0.894737,0.879310,0.879310,0.894737,0.896552,0.675325,0.732394,0.927273,0.881356,0.531915,0.525253,0.847458,0.852459,0.927273,0.912281,0.962963,0.912281,0.962264,0.962264,0.927273,0.704225,0.803571,0.757576,...,0.530612,0.962963,0.896552,0.912281,0.896552,0.910714,0.854545,0.912281,0.890909,0.862069,0.928571,0.928571,0.896552,0.943396,0.690141,0.928571,0.825397,0.800000,0.852459,0.712329,0.962264,0.962963,0.800000,0.813559,0.825397,0.881356,0.765625,0.456897,0.981132,0.962963,0.746269,0.847458,0.809524,0.822581,0.864407,0.822581,0.796875,0.836364,0.828125,0.912281,0.879310,0.896552,0.896552,0.910714,0.850000,0.877193,1.000000,,,
1497,225627,0.700000,0.875000,0.844828,0.875000,0.766667,0.753846,0.819672,0.894737,0.892857,0.875000,0.850000,0.822581,0.875000,0.894737,0.850000,0.875000,0.700000,0.429825,0.866667,0.925926,0.925926,0.827586,0.862069,0.847458,0.830508,0.809524,0.844828,0.928571,0.894737,0.813559,0.816667,0.620253,0.671233,0.875000,0.833333,0.505263,0.485149,0.830508,0.777778,0.875000,0.894737,0.875000,0.830508,0.872727,0.907407,0.842105,0.714286,0.785714,0.769231,...,0.520408,0.875000,0.816667,0.862069,0.879310,0.827586,0.836364,0.830508,0.839286,0.877193,0.910714,0.877193,0.816667,0.888889,0.676056,0.945455,0.809524,0.731343,0.806452,0.698630,0.907407,0.909091,0.731343,0.766667,0.838710,0.896552,0.696970,0.411765,0.890909,0.875000,0.705882,0.800000,0.738462,0.806452,0.879310,0.750000,0.727273,0.785714,0.784615,0.830508,0.894737,0.816667,0.912281,0.859649,0.803279,0.827586,0.875000,1.000,,
1498,1379634,0.732394,0.879310,0.850000,0.786885,0.803279,0.735294,0.769231,0.836066,0.803279,0.879310,0.796875,0.772727,0.912281,0.836066,0.796875,0.879310,0.757143,0.452174,0.841270,0.862069,0.830508,0.833333,0.836066,0.822581,0.866667,0.710145,0.850000,0.806452,0.806452,0.819672,0.822581,0.609756,0.657895,0.847458,0.838710,0.500000,0.524752,0.806452,0.812500,0.847458,0.806452,0.879310,0.866667,0.877193,0.877193,0.879310,0.722222,0.733333,0.700000,...,0.514851,0.879310,0.852459,0.866667,0.852459,0.833333,0.810345,0.836066,0.844828,0.819672,0.850000,0.850000,0.852459,0.859649,0.662162,0.850000,0.761194,0.739130,0.812500,0.706667,0.877193,0.879310,0.818182,0.746032,0.761194,0.809524,0.731343,0.457627,0.894737,0.879310,0.690141,0.777778,0.772727,0.784615,0.793651,0.812500,0.735294,0.793103,0.791045,0.836066,0.836066,0.852459,0.852459,0.864407,0.809524,0.803279,0.879310,0.800,1.00000,


# Simple Model

In [518]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class BookReco:
    """Book recommender that averages several weighted similarity signals.

    Two kinds of signals can be registered before calling ``fit``:

    * vectors: every column whose name starts with a prefix is min-max scaled
      and a row-to-row cosine similarity matrix is computed;
    * scalars: a single column, min-max scaled, used as a per-book score that
      does not depend on the queried book.

    ``predict`` returns, for one book, the weighted mean of those signals for
    every row of the fitted data.

    NOTE(review): ``add_jaccard`` stores a DataFrame and a ``'jaccard'``
    weight, but ``predict`` reads neither; the Jaccard data currently has no
    effect on the scores.
    """

    def __init__(self):
        self.data = None            # DataFrame given to fit()
        self.vectors = []           # [{'col_name': prefix, 'cosine_similar': matrix (after fit)}]
        self.scalars = []           # [{'col_name': column, 'scaled': 1-D array (after fit)}]
        self.predict_scores = None  # un-normalised weighted sum of the last predict()
        self.weights = {}           # signal name -> weight
        self.df_jaccard = None      # stored by add_jaccard(); not used by predict()

    def add_vector(self, col_prefix, weight=1):
        """Register the group of columns starting with ``col_prefix`` as a vector signal."""
        self.weights = {**self.weights, col_prefix: weight}
        self.vectors.append({'col_name': col_prefix})

    def add_jaccard(self, df, weight=1):
        """Store ``df`` and its weight under the ``'jaccard'`` key (not used by ``predict``)."""
        self.weights = {**self.weights, 'jaccard': weight}
        self.df_jaccard = df

    def add_scalar(self, col_name, weight=1):
        """Register the single column ``col_name`` as a scalar signal."""
        self.weights = {**self.weights, col_name: weight}
        self.scalars.append({'col_name': col_name})

    def __get_cosine_similarity(self, prefix):
        """Return the row-to-row cosine similarity of the columns starting with ``prefix``."""
        data_temp = self.data.filter(regex=f'^{prefix}',axis=1)
        data_temp = data_temp.fillna(0)
        vec = MinMaxScaler().fit_transform(data_temp)
        return cosine_similarity(vec)

    def fit(self, data):
        """Compute the similarity matrix of every vector and the scaled values of every scalar.

        Args:
            data: DataFrame with a ``book_id`` column, the columns matched by
                the registered prefixes and the registered scalar columns.
        """
        self.data = data

        for i, vector in enumerate(self.vectors):
            feats_cs = self.__get_cosine_similarity(vector['col_name'])
            self.vectors[i] = {**vector, 'cosine_similar': feats_cs}

        for i, scalar in enumerate(self.scalars):
            X = self.data.loc[:, [scalar['col_name']]]
            X = MinMaxScaler().fit_transform(X)
            self.scalars[i] = {**scalar, 'scaled': X.reshape(-1)}

    def set_weight(self, weights:dict):
        """Replace the whole weight mapping; it needs one entry per registered signal."""
        self.weights = weights

    def predict(self, book_id):
        """Score every fitted row against the book identified by ``book_id``.

        Returns:
            A 1-D array aligned with the rows of the fitted data holding the
            weighted mean of all registered signals, or ``None`` (after
            printing a message) when the model is not fitted or ``book_id`` is
            not in the data.

        Raises:
            ValueError: If no vector or scalar is registered, or if the
                weights of the registered signals sum to zero.
            KeyError: If a registered signal has no entry in the weights.
        """
        if self.data is None:
            print("BookReco.predict called before fit()")
            return None

        # Row *position* of the book: the similarity matrices and the
        # format_* helpers are positional, so an index label must not be used.
        matches = np.flatnonzero((self.data['book_id'] == book_id).to_numpy())
        if matches.size == 0:
            print(f"Can't find book_id: {book_id} in the dataset")
            return None
        index_book = int(matches[0])

        contributions = []
        weight_sum = 0  # used to turn the weighted sum into a weighted mean
        for vector in self.vectors:
            weight = self.weights[vector['col_name']]
            weight_sum += weight
            contributions.append(np.asarray(vector['cosine_similar'][index_book]) * weight)

        for scalar in self.scalars:
            # Each scalar uses its own weight.
            weight = self.weights[scalar['col_name']]
            weight_sum += weight
            contributions.append(np.asarray(scalar['scaled']) * weight)

        if not contributions:
            raise ValueError("register at least one vector or scalar before calling predict()")
        if weight_sum == 0:
            raise ValueError("the weights of the registered signals sum to zero")

        total = contributions[0]
        for contribution in contributions[1:]:
            total = total + contribution
        self.predict_scores = total

        # TODO: return the scores as a Series indexed by book_id, sorted in
        # descending order and limited to the n best rows.
        return self.predict_scores / weight_sum

    def _top_matches(self, scores, max_books):
        """Return ``[(row position, score)]`` for the ``max_books`` best rows after the first.

        The highest-scoring row is skipped on the assumption that it is the
        queried book itself.
        NOTE(review): that assumption can fail when scalar signals are used,
        because they are not maximal for the queried book.
        """
        if scores is None:
            return []
        ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
        return ranked[1:max_books + 1]

    def format_prediction(self, scores, max_books):
        """Return ``(row position, book_id, title, score)`` tuples for the best matches."""
        results = []
        for i, s in self._top_matches(scores, max_books):
            book = self.data.iloc[i, :]
            results.append((i, book['book_id'], book['title'], s))
        return results

    def format_tojson(self, scores, max_books):
        """Return the best matches as a list of JSON-serialisable dicts."""
        output = []
        for i, s in self._top_matches(scores, max_books):
            book = self.data.iloc[i, :]
            output.append({
                'title' : book['title'],
                'url' : book['book_url'],
                'image' : book['img_url'],
                'author' : book['surname']+' '+book['name'],
                'author_url' : book['book_author_url'],
                'score' : str(s),
            })
        return output


In [521]:
# Configure the recommender: only the 'sen_' column group is registered as a
# vector signal here.
br = BookReco()
# br.add_vector('tag_', weight=2)
br.add_vector('sen_', weight=10)
# NOTE(review): add_jaccard only stores the frame and its weight;
# BookReco.predict does not read them, so this line does not change the scores.
br.add_jaccard(df_jaccard, weight=1)
# br.add_scalar('book_rating_value', weight=0)
# br.add_scalar('book_nb_comm', weight=0)
br.fit(df_books)

In [522]:
scores = br.predict(1829) # De Cape et de Crocs, tome 2 : Pavillon noir !
# NOTE(review): the weights are replaced *after* predict() has run, so they do
# not affect `scores`; format_tojson does not read the weights either. Move
# this call before predict() if the new weights are meant to apply.
br.set_weight({'tag_': 0.5, 'sen_': 10, 'book_rating_value': 0, 'book_nb_comm': 0})
# Five best matches as JSON-ready dicts.
br.format_tojson(scores, max_books=5)

[{'title': 'La nuit du cœur',
  'url': 'https://www.babelio.com/livres/Bobin-La-nuit-du-cur/1265292',
  'image': 'https://images-eu.ssl-images-amazon.com/images/I/31BXkkX7onL._SX195_.jpg',
  'author': 'Bobin Christian',
  'author_url': '/auteur/Christian-Bobin/2713',
  'score': '0.9978742836066334'},
 {'title': 'Astérix, tome 21 : Le cadeau de César',
  'url': 'https://www.babelio.com/livres/Goscinny-Asterix-tome-21--Le-cadeau-de-Cesar/618790',
  'image': 'https://images-na.ssl-images-amazon.com/images/I/51NuvzuPcjL._SX210_.jpg',
  'author': 'Goscinny René',
  'author_url': '/auteur/Rene-Goscinny/2587',
  'score': '0.988568772354145'},
 {'title': 'Le Mariage de Figaro',
  'url': 'https://www.babelio.com/livres/Beaumarchais-Le-Mariage-de-Figaro/1700',
  'image': 'https://images-na.ssl-images-amazon.com/images/I/51BBECGFERL._SX210_.jpg',
  'author': 'Beaumarchais ',
  'author_url': '/auteur/-Beaumarchais/2165',
  'score': '0.9774130527279642'},
 {'title': 'Le Petit Chaperon rouge',
  'ur

In [508]:
import pickle
import dill

# Serialise the fitted recommender object (it holds the fitted DataFrame and
# the similarity matrices) with dill.
# NOTE(review): the target is 'data/' relative to the working directory, unlike
# the '../output/' paths used above - confirm this is intended.
# NOTE: dill/pickle files execute code on load; only load files from a trusted source.
with open('data/model-reco.obj', "wb") as f:
    dill.dump(br, f)

# with open('data/model-reco.obj', 'wb') as f:
#     pickle.dump(br, f, pickle.HIGHEST_PROTOCOL)