In [215]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
import ast
import warnings

warnings.filterwarnings('ignore') 

pd.set_option("display.max_columns", 100) # ‘None’ value means unlimited.

THEMES = {
 'amitie': 373,
 'amour': 77,
 'autobiographie': 45,
 'aventure': 33,
 'bande-dessinee': 18,
 'biographie': 31,
 'cinema': 89,
 'classique': 28,
 'comedie-romantique': 3788,
 'comics': 142,
 'drogue': 863,
 'dystopie': 879,
 'emotion': 4100,
 'enquetes': 3988,
 'entretiens': 708,
 'essai': 13,
 'famille': 290,
 'fantastique': 7,
 'fantasy': 4,
 'geographie': 187,
 'guerre': 91,
 'humour': 15,
 'humour-noir': 621,
 'jeunesse': 14,
 'journalisme': 975,
 'litterature-americaine': 9,
 'litterature-asiatique': 1064,
 'litterature-francaise': 3,
 'manga': 12,
 'musique': 44,
 'nouvelles': 23,
 'peur': 430,
 'poesie': 25,
 'politique': 54,
 'psychologie': 65,
 'racisme': 906,
 'recit-de-voyage': 448,
 'religion': 26,
 'reportage': 3488,
 'reseaux-sociaux': 26109,
 'roman': 1,
 'roman-fantastique': 912,
 'roman-noir': 136,
 'romans-policiers-et-polars': 63883,
 'science-fiction': 6,
 'sentiments': 1770,
 'serie': 788,
 'theatre': 21,
 'thriller': 11,
 'thriller-psychologique': 1073,
 'tragedie': 601,
 'western': 533
}



# Merging files and datasets

In [216]:
def merge_data_files(books_file_list:list, books_meta_file=None, books_senti_file=None, books_users_file=None):
    df_books = None

    # fusion of mains books data files with comments
    for filename in books_file_list:
        df_books_temp = pd.read_json(filename, lines=True)
        if df_books is None:
            df_books = df_books_temp
        else: df_books = pd.concat([df_books, df_books_temp])

    df_books = df_books.drop(['tags'],axis=1)

    # genre du profil per user_id
    if books_users_file is not None:
        df_users = pd.read_json(books_users_file, lines=True)
        df_books = df_books.merge(df_users, on='user_id', how='left')
    df_books = df_books.fillna('')

    # join to the meta data books file
    if books_meta_file is not None:
        df_meta = pd.read_json(books_meta_file, lines=True)
        df_meta = df_meta.drop(['book_nb_comm', 'title', 'name', 'surname','img_url','book_date'],axis=1)
        df_books = df_books.merge(df_meta, on='book_id', how='inner')

    # join sentiments file
    if books_senti_file is not None:
        df_senti = pd.read_json(books_senti_file, lines=True)
        df_senti = df_senti.drop(['title'],axis=1)
        df_books = df_books.merge(df_senti, on='book_id', how='left')
    df_books = df_books.fillna(0)

    return df_books

In [217]:
# df_comm = merge_data_files(
#     books_file_list = ['../output/books-julien.json','../output/books-rebecca.json'],
#     books_meta_file = '../output/books-meta-data.json',
#     books_senti_list = ['../analyse/df_nlp0_1000.csv','../analyse/df_nlp1000_2000.csv','../analyse/df_nlp2000_3000.csv',
#     '../analyse/df_nlp3000_4000.csv','../analyse/df_nlp3000_4000.csv','../analyse/df_nlp4000_4566.csv'],
#     books_users_file = '../output/users-data.json'
#     )
df_comm = merge_data_files(
    books_file_list = ['../output/books-julien.json','../output/books-rebecca.json'],
    books_meta_file = '../output/books-meta-data.json',
    books_senti_file = '../output/vecteurs_sentiments_final2.json',
    books_users_file = '../output/users-data.json'
    )

In [218]:
# df_comm['year'] = df_comm['book_date'].str.extract(r'\b(\d{4})\b').replace('1900','').replace('3889',)
# df_comm['year'].unique()
df_comm

Unnamed: 0,book_id,book_nb_comm,title,name,surname,img_url,comm_id,user_id,note,date,appreciations,commentaire,gender,book_url,tags,book_rating_count,book_rating_value,book_author_url,book_editor,book_pages,sen_colère,sen_confiance,sen_honte,sen_joie,sen_peur,sen_réflexion,sen_surprise,sen_tristesse
0,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2080694,706958,4.0,30 novembre 2019,85,Roadtrip désertique dans l'Ouest Américain en ...,,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873
1,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,1271915,319761,5.0,03 mars 2017,68,"De temps en temps, j'aime bien revenir vers l'...",F,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873
2,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,1093854,128814,5.0,27 juin 2016,67,"Depuis peu j'ai découvert grâce à "" Cardabelle...",M,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873
3,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2711430,227017,4.5,22 juillet 2021,64,"Ils sont quatre. \nUne blonde , jeunette, venu...",M,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873
4,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2685514,138438,4.5,26 juin 2021,64,Avez-vous entendu cette info effrayante ? « La...,F,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314004,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,1057334,166603,5.0,09 mai 2016,2,"J'avais connu Nicolas avec son livre ""Comme un...",F,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.077515,0.096419,0.023236,0.217427,0.152701,0.071533,0.183483,0.177687
314005,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,939776,262506,5.0,07 décembre 2015,2,J'avais lu beaucoup de bien de ce livre sur Am...,,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.077515,0.096419,0.023236,0.217427,0.152701,0.071533,0.183483,0.177687
314006,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,3027068,306182,2.5,05 mai 2022,1,"A Duncan's Creek, un des plus petits villages ...",M,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.077515,0.096419,0.023236,0.217427,0.152701,0.071533,0.183483,0.177687
314007,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,2027517,14577,,02 octobre 2019,1,C'est pas écrit c'est tapé !,F,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.077515,0.096419,0.023236,0.217427,0.152701,0.071533,0.183483,0.177687


In [219]:
# group by per book
columns_senti = [col for col in df_comm.columns if col.startswith('sen_')]
columns_book = ['book_id', 'book_url', 'book_nb_comm', 'title', 'name', 'surname',
       'tags', 'img_url', 'book_rating_count', 'book_rating_value',
       'book_author_url', 'book_editor', 'book_pages', *columns_senti]

def reduce_comm_to_books(df):
    return df.copy().groupby(columns_book, as_index=False).count().loc[:,columns_book]

In [220]:
df_books = reduce_comm_to_books(df_comm)

We have now 2 dataset : df_comm and df_books with 2 differents aggregate levels

# Preprocessing df_books

In [221]:
# on garde que les tags avec le nom dans filter_list ou si la valeur est supérieur à filter_force_min. ca permet de rétirer les tags rares et peu importants
def tags_to_cols(df, col_name, filter_list=None, filter_force_min=24):
    df1 = df.copy()

    for index,row in df.iterrows():
        tags_as_string = row[col_name]
        tags = ast.literal_eval(tags_as_string)

        for tag in tags:
            if filter_list is not None and not tag[0].strip() in filter_list and tag[1] < filter_force_min:
                continue
            tag_name = 'tag_'+tag[0].strip().replace(' ','_').lower()
            df1.loc[df1.index == index, tag_name] = tag[1]

    df1 = df1.fillna(0)
    return df1

In [222]:
df_books = tags_to_cols(df_books, col_name='tags', filter_list=list(THEMES.keys()), filter_force_min=24)

In [223]:
df_books.to_json('../output/final/data-books.json',lines=True,orient='records')
df_comm.to_json('../output/final/data-comm.json',lines=True,orient='records')
df_comm['commentaire'] = ''
df_comm.to_json('../output/final/data-comm-light.json',lines=True,orient='records')

In [224]:
df_comm

Unnamed: 0,book_id,book_nb_comm,title,name,surname,img_url,comm_id,user_id,note,date,appreciations,commentaire,gender,book_url,tags,book_rating_count,book_rating_value,book_author_url,book_editor,book_pages,sen_colère,sen_confiance,sen_honte,sen_joie,sen_peur,sen_réflexion,sen_surprise,sen_tristesse
0,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2080694,706958,4.0,30 novembre 2019,85,,,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873
1,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,1271915,319761,5.0,03 mars 2017,68,,F,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873
2,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,1093854,128814,5.0,27 juin 2016,67,,M,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873
3,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2711430,227017,4.5,22 juillet 2021,64,,M,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873
4,905033,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,2685514,138438,4.5,26 juin 2021,64,,F,https://www.babelio.com/livres/Abbey-Le-gang-d...,"[['roman', 22], ['romans policiers et polars',...",678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314004,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,1057334,166603,5.0,09 mai 2016,2,,F,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.077515,0.096419,0.023236,0.217427,0.152701,0.071533,0.183483,0.177687
314005,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,939776,262506,5.0,07 décembre 2015,2,,,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.077515,0.096419,0.023236,0.217427,0.152701,0.071533,0.183483,0.177687
314006,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,3027068,306182,2.5,05 mai 2022,1,,M,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.077515,0.096419,0.023236,0.217427,0.152701,0.071533,0.183483,0.177687
314007,609937,59,Seuls les vautours,Nicolas,Zeimet,/couv/CVT_CVT_Seuls-les-vautours_5777.jpg,2027517,14577,,02 octobre 2019,1,,F,https://www.babelio.com/livres/Zeimet-Seuls-le...,"[['roman', 18], ['suspense', 18], ['roman noir...",114,4.05,/auteur/Nicolas-Zeimet/313288,10-18,552,0.077515,0.096419,0.023236,0.217427,0.152701,0.071533,0.183483,0.177687


In [225]:
#df_jaccard = pd.read_csv('../output/jacsim.csv')
df_jaccard = pd.read_csv('../analyse/distance_jaccard.csv')
# df_jaccard = df_jaccard.rename(columns = {'Unnamed: 0':'book_id'})
# df_jaccard = df_jaccard.set_index('book_id')

# df1 = df_jaccard.query('book_id == 1390851').transpose().dropna().squeeze()
# df2 = df_jaccard.loc[:,'1390851'].dropna()

# pd.concat([df1,df2], axis=0)

In [231]:
df_jaccard

Unnamed: 0.1,Unnamed: 0,905033,34397,869119,1700,643614,1455462,877839,8240,1022159,1258576,1162306,946624,8929,614802,151916,4392,7470,608790,831363,1212813,58718,905055,504720,12753,741161,577391,534727,716654,155768,23516,1071543,534632,663462,153516,146383,857108,833098,77097,744251,1064062,850282,1329810,721421,557588,18080,79250,1031310,1396553,830643,...,3072,408821,994347,825688,488289,927362,527000,13140,22615,912572,850399,855482,1140111,106693,352809,1120091,532707,717851,1128082,1114735,3456,230307,7613,19326,792892,955937,10781,38158,572563,1116212,5103,6529,7167,874240,1131235,490122,117001,104388,865852,1358259,956505,529204,1329550,940817,646574,8697,611806,891791,889201,609937
0,905033,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,34397,0.862069,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,869119,0.850000,0.912281,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1700,0.844828,0.944444,0.894737,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,643614,0.680556,0.750000,0.718310,0.761194,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4561,8697,0.864407,0.894737,0.850000,0.877193,0.728571,0.761194,0.790323,0.915254,0.708333,0.877193,0.862069,0.781250,0.879310,0.892857,0.868852,0.862069,0.827586,0.857143,0.803279,0.456140,0.892857,0.816667,0.877193,0.862069,0.862069,0.827586,0.879310,0.830508,0.809524,0.800000,0.864407,0.890909,0.875000,0.818182,0.819672,0.841270,0.894737,0.875000,0.852459,0.819672,0.791045,0.847458,0.877193,0.866667,0.910714,0.803279,0.666667,0.676056,0.793103,...,0.877193,0.617284,0.723077,0.847458,0.836066,0.728571,0.857143,0.872727,0.813559,0.675676,0.762712,0.866667,0.774194,0.881356,0.816667,0.912281,0.890909,0.813559,0.718310,0.714286,0.830508,0.816667,0.790323,0.877193,0.857143,0.710145,0.852459,0.850000,0.946429,0.769231,0.662338,0.877193,0.872727,0.847458,0.614458,0.784615,0.830508,0.877193,0.769231,0.830508,0.847458,0.875000,0.859649,0.750000,0.816667,1.000000,,,,
4562,611806,0.864407,0.928571,0.881356,0.910714,0.753623,0.815385,0.790323,0.883333,0.708333,0.910714,0.862069,0.781250,0.946429,0.892857,0.838710,0.894737,0.827586,0.857143,0.774194,0.469027,0.927273,0.816667,0.877193,0.862069,0.862069,0.796610,0.847458,0.830508,0.868852,0.800000,0.896552,0.857143,0.875000,0.818182,0.850000,0.841270,0.862069,0.842105,0.822581,0.850000,0.791045,0.847458,0.945455,0.898305,0.910714,0.833333,0.689189,0.724638,0.793103,...,0.844828,0.658228,0.777778,0.879310,0.836066,0.753623,0.890909,0.872727,0.813559,0.722222,0.793103,0.866667,0.803279,0.881356,0.847458,0.847458,0.857143,0.877193,0.742857,0.764706,0.894737,0.879310,0.819672,0.910714,0.857143,0.735294,0.852459,0.850000,0.912281,0.825397,0.706667,0.877193,0.872727,0.847458,0.595238,0.841270,0.830508,0.877193,0.769231,0.862069,0.847458,0.875000,0.859649,0.803030,0.847458,0.864407,1.000000,,,
4563,891791,0.777778,0.774194,0.765625,0.758065,0.640000,0.690141,0.712121,0.769231,0.689189,0.786885,0.774194,0.681159,0.790323,0.800000,0.705882,0.746032,0.770492,0.796610,0.777778,0.411765,0.830508,0.761905,0.786885,0.774194,0.774194,0.770492,0.761905,0.746032,0.784615,0.741379,0.806452,0.766667,0.783333,0.789474,0.822581,0.761194,0.774194,0.754098,0.769231,0.765625,0.694444,0.761905,0.816667,0.781250,0.758065,0.777778,0.628205,0.680556,0.737705,...,0.758065,0.583333,0.727273,0.734375,0.753846,0.640000,0.827586,0.779661,0.730159,0.657895,0.709677,0.781250,0.696970,0.793651,0.761905,0.790323,0.796610,0.816667,0.675676,0.671233,0.746032,0.707692,0.738462,0.816667,0.796610,0.714286,0.742424,0.738462,0.790323,0.695652,0.625000,0.816667,0.779661,0.761905,0.581395,0.710145,0.774194,0.786885,0.695652,0.803279,0.761905,0.783333,0.770492,0.680556,0.761905,0.777778,0.777778,1.000000,,
4564,889201,0.819672,0.879310,0.866667,0.862069,0.718310,0.776119,0.806452,0.868852,0.675676,0.830508,0.816667,0.769231,0.896552,0.844828,0.825397,0.847458,0.783333,0.810345,0.819672,0.439655,0.844828,0.803279,0.830508,0.816667,0.816667,0.783333,0.864407,0.786885,0.796875,0.785714,0.881356,0.842105,0.827586,0.803571,0.777778,0.887097,0.847458,0.796610,0.868852,0.777778,0.753623,0.803279,0.862069,0.883333,0.894737,0.790323,0.636364,0.666667,0.779661,...,0.830508,0.609756,0.712121,0.864407,0.822581,0.742857,0.810345,0.824561,0.770492,0.712329,0.779661,0.822581,0.761905,0.866667,0.774194,0.864407,0.842105,0.800000,0.708333,0.704225,0.816667,0.803279,0.777778,0.830508,0.842105,0.700000,0.838710,0.806452,0.929825,0.757576,0.632911,0.830508,0.824561,0.803279,0.588235,0.772727,0.816667,0.830508,0.705882,0.816667,0.803279,0.859649,0.813559,0.764706,0.774194,0.881356,0.850000,0.765625,1.000000,


In [230]:
# df_books[df_books['title'].str.contains('gang', case=False)]

Unnamed: 0,book_id,book_url,book_nb_comm,title,name,surname,tags,img_url,book_rating_count,book_rating_value,book_author_url,book_editor,book_pages,sen_colère,sen_confiance,sen_honte,sen_joie,sen_peur,sen_réflexion,sen_surprise,sen_tristesse,tag_roman,tag_classique,tag_autobiographie,tag_drogue,tag_littérature_américaine,tag_fantastique,tag_jeunesse,tag_humour,tag_littérature_italienne,tag_journalisme,tag_bande_dessinée,tag_reportage,tag_guerre,tag_essai,tag_biographie,tag_politique,tag_nouvelles,tag_psychologie,tag_amour,tag_littérature_autrichienne,tag_aventure,tag_dystopie,tag_littérature_anglaise,tag_serie,tag_fantasy,tag_thriller,tag_romans_policiers_et_polars,tag_famille,tag_littérature_française,...,tag_cosy_mystery,tag_écrivain_femme,tag_côte_d'ivoire,tag_saga,tag_norvège,tag_indonésie,tag_campagne,tag_yoga,tag_années_60,tag_spiritisme,tag_confiance_en_soi,tag_confinement,tag_roman_d'apprentissage,tag_violon,tag_différence_d'âge,tag_détroits,tag_fin_de_vie,tag_musée,tag_naples,tag_alabama,tag_20ème_siècle,tag_années_20,tag_aborigènes,tag_lien_social,tag_pression_sociale,tag_téléphone_portable,tag_arabie_saoudite,tag_pulp,tag_parachute,tag_trisomie_21,tag_légendes,tag_graffiti,tag_mormons,tag_space-opera,tag_metamorphes,tag_romance_m-m,tag_résilience,tag_vaudou,tag_travail,tag_divorce,tag_asexualité,tag_aromantisme,tag_licorne,tag_alopécie,tag_héritage,tag_mystère,tag_destin,tag_gangs,tag_littérature_algérienne,tag_identité
2674,905033,https://www.babelio.com/livres/Abbey-Le-gang-d...,137,"Le gang de la clef à molette (Ne meurs pas, ô ...",Edward,Abbey,"[['roman', 22], ['romans policiers et polars',...",/couv/CVT_cvt_Le-gang-de-la-clef-a-molette_345...,678,3.96,/auteur/Edward-Abbey/23782,Gallmeister,491,0.211501,0.140185,0.009778,0.278578,0.077858,0.102662,0.069566,0.109873,22.0,0.0,0.0,0.0,26.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Simple Model

In [254]:
class BookReco:
    def __init__(self):
        self.data = None
        self.vectors = []
        self.scalars = []
        self.predict_scores = None
        self.weights = {}
        self.df_jaccard = None
        self.data_score = None

    def add_vector(self, col_prefix, weight=1):
        self.weights = {**self.weights, col_prefix: weight}
        self.vectors.append({'col_name': col_prefix})

    def add_jaccard(self, df, weight=1):
        self.weights = {**self.weights, 'jaccard': weight}
        self.df_jaccard = df

    def add_scalar(self, col_name, weight=1):
        self.weights = {**self.weights, col_name: weight}
        self.scalars.append({'col_name': col_name})

    def __get_cosine_similarity(self, prefix):
        data_temp = self.data.filter(regex=f'^{prefix}',axis=1)
        data_temp = data_temp.fillna(0)
        vec = MinMaxScaler().fit_transform(data_temp)
        return cosine_similarity(vec)

    def fit(self, data):
        self.data = data

        for i,vector in enumerate(self.vectors):
            feats_cs = self.__get_cosine_similarity(vector['col_name'])
            self.vectors[i] = {**vector, 'cosine_similar': feats_cs}

        for i,scalar in enumerate(self.scalars):
            X = self.data.loc[:,[scalar['col_name']]]
            X = MinMaxScaler().fit_transform(X)
            self.scalars[i] = {**scalar, 'scaled': X.reshape(-1)}

    def set_weight(self, weights:dict):
        self.weights = weights

    def __get_jaccard_score(self, book_id):
        df_jaccard = self.df_jaccard.copy()
        df_jaccard = df_jaccard.rename(columns = {'Unnamed: 0':'book_id'})
        df_jaccard = df_jaccard.set_index('book_id')

        try:
            df1 = df_jaccard.query('book_id == @book_id').transpose().dropna().squeeze()
            df2 = df_jaccard.loc[:, str(book_id)].dropna()
        except:
            print('Error Jaccard in __get_jaccard_score')
            return None

        if type(df1) is not pd.Series:
            return df2
        elif type(df2) is not pd.Series:
            return df1
        else:
            return pd.concat([df1,df2], axis=0)

    def predict(self, book_id):

        try:
            index_book = self.data.query('book_id == @book_id').index.values.astype(int)[0]
        except:
            print(f"Can't find book_id: {book_id} in the dataset")
            return None
        
        # init data_score to empty
        self.data_score = self.data[['book_id']]

        # get all scores, apply weight
        weight_sum = 0 # to normalize at the end, like a mean
        for vector in self.vectors:
            score = vector['cosine_similar'][index_book]
            weight = self.weights[vector['col_name']]
            weight_sum += weight

            self.data_score = pd.concat([self.data_score, pd.Series(score*weight, name=vector['col_name'])], axis=1)
        
        for scalar in self.scalars:
            weight = self.weights[scalar['col_name']]
            weight_sum += weight
            self.data_score = pd.concat([self.data_score, pd.Series(scalar['scaled']*weight, name=scalar['col_name'])], axis=1)

        self.data_score = self.data_score.set_index('book_id')

        
        if self.df_jaccard is not None:
            score = self.__get_jaccard_score(book_id) 

            if score is not None:
                score = score * self.weights['jaccard']
                weight_sum += self.weights['jaccard']
                self.data_score = self.data_score.merge(pd.Series(score, name='jaccard'), how='left', left_index=True, right_index=True)

        if weight_sum == 0:
            weight_sum == 1

        self.data_score =  self.data_score.sum(axis=1) / weight_sum

        data = self.data.merge(pd.Series(self.data_score, name='score'), how='left', left_on='book_id', right_index=True)

        return data.query('book_id != @book_id') # we pull off the asked book_id drom the scores

    def format_tojson(self, scores, max_books):

        output = []
        num = 0

        scores = scores.sort_values('score', ascending=False)

        for _, book in scores.iterrows():
            output.append({
                'title' : book['title'],
                'url' : book['book_url'],
                'image' : book['img_url'],
                'author' : book['surname']+' '+book['name'],
                'author_url' : book['book_author_url'],
                'book_rating_value' : book['book_rating_value'],
                'book_rating_count' : book['book_rating_count'],
                'score' : str(book['score']),
            })
            if num == max_books - 1:
                break
            num += 1
        return output


In [255]:
import modelreco

br = modelreco.BookReco()
br.add_vector('tag_', weight=0)
br.add_vector('sen_', weight=50)
br.add_jaccard(df_jaccard, weight=50)
br.add_scalar('book_rating_value', weight=10)
br.add_scalar('book_nb_comm', weight=0)
br.add_scalar('book_rating_count', weight=0)
br.fit(df_books)

# br.set_weight({'tag_': 0.5, 'sen_': 1, 'jaccard': 1, 'book_rating_value': 1, 'book_nb_comm': 0})
# scores = br.predict(123961) # De Cape et de Crocs, tome 2 : Pavillon noir !
# br.format_tojson(scores, max_books=5)

In [257]:
br.set_weight({'tag_': 0, 'sen_': 1, 'jaccard': 1, 'book_rating_value': 1, 'book_nb_comm': 0, 'book_rating_count': 0})
scores = br.predict(905033) # De Cape et de Crocs, tome 2 : Pavillon noir !
br.format_tojson(scores, max_books=5)

[{'title': 'Des souris et des hommes (Illustré)',
  'url': 'https://www.babelio.com/livres/Steinbeck-Des-souris-et-des-hommes-Illustre/1267072',
  'image': 'https://m.media-amazon.com/images/I/51kOxYxTISL._SX195_.jpg',
  'author': 'Steinbeck John',
  'author_url': '/auteur/John-Steinbeck/3357',
  'book_rating_value': 4.67,
  'book_rating_count': 231,
  'score': '0.8910962599660083'},
 {'title': 'Murena, tome 9 : Les épines',
  'url': 'https://www.babelio.com/livres/Dufaux-Murena-tome-9--Les-epines/476246',
  'image': '/couv/CVT_C_Murena-tome-9-Les-epines_6249.jpg',
  'author': 'Dufaux Jean',
  'author_url': '/auteur/Jean-Dufaux/2398',
  'book_rating_value': 4.32,
  'book_rating_count': 178,
  'score': '0.8801842098057918'},
 {'title': 'Lonesome Dove, tome 2',
  'url': 'https://www.babelio.com/livres/McMurtry-Lonesome-Dove-tome-2/938697',
  'image': '/couv/CVT_9782351785089.jpg',
  'author': 'McMurtry Larry',
  'author_url': '/auteur/Larry-McMurtry/18352',
  'book_rating_value': 4.64,
 

In [228]:
import pickle
import dill

with open('../output/final/model-reco.obj', "wb") as f:
    dill.dump(br, f)

# with open('data/model-reco.obj', 'wb') as f:
#     pickle.dump(br, f, pickle.HIGHEST_PROTOCOL)