## Import des librairies et des fichiers

In [3]:
import requests
import pandas as pd
import json
import numpy as np
import plotly.graph_objects as gp
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from functools import reduce
from datetime import date,timedelta


In [5]:
IN_SOURCE = "data-comm.json"
IN_LEXIQUE = "lexique_emotions.csv"

START = 0
STOP = 1000 # max = 4566

OUT_SAVED_DATAFRAME = f"df_nlp{START}_{STOP}.csv"
OUT_VECTEURS = "vecteurs_sentiments.json"
OUT_JACCARD = "distance_jaccard.csv"

In [None]:
# Import des résultats du ratissage de Babelio

df=pd.read_json(IN_SOURCE,lines=True)

In [None]:
df = df[df["book_id"].isin(df["book_id"].unique()[START:STOP])]

In [None]:
# Import du lexique des émotions et transformation en dictionnaire

emotions=pd.read_csv(IN_LEXIQUE,delimiter=";")
dict_emotions={m:s for m,s in zip(emotions["mot"].tolist(),emotions["sentiment"].tolist())}
dict_emotions

In [15]:
# Import et chargement de la librairie Spacy en français

import spacy
nlp = spacy.load("fr_core_news_sm")



## Prétraitement de texte

In [16]:
# Retrait des retour-chariot dans les commentaires

df["commentaire"]=df["commentaire"].apply(lambda x : x.replace("\n","").replace("\t",""))

In [17]:
#df["commentaire"]=df["commentaire"].apply(lambda x : " ".join(x))

In [18]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
# Retrait des caractères spéciaux et mise en minuscules des mots

df['com_process'] = df['commentaire'].str.replace(r"[!\"#$%&()*+,-./:;•♪♫<=>«»?@[\\\]^_`{|}~]+", " ")
df['com_process'] = df['com_process'].fillna('').apply(lambda x: x.lower())
df.head(5)

  df['com_process'] = df['commentaire'].str.replace(r"[!\"#$%&()*+,-./:;•♪♫<=>«»?@[\\\]^_`{|}~]+", " ")


Unnamed: 0,book_id,book_nb_comm,title,name,surname,img_url,comm_id,user_id,note,date,...,book_pages,sen_colère,sen_confiance,sen_honte,sen_joie,sen_peur,sen_réflexion,sen_surprise,sen_tristesse,com_process
68425,1276725,39,Friday Black,Nana Kwame,Adjei-Brenyah,https://m.media-amazon.com/images/I/41MRJp7nKA...,2559328,489894,4.5,13 mars 2021,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,j'ai lu ces douze nouvelles estomaquée par la ...
68426,1276725,39,Friday Black,Nana Kwame,Adjei-Brenyah,https://m.media-amazon.com/images/I/41MRJp7nKA...,2662054,357796,4.5,04 juin 2021,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,il est très rare que je commence la présentati...
68427,1276725,39,Friday Black,Nana Kwame,Adjei-Brenyah,https://m.media-amazon.com/images/I/41MRJp7nKA...,2488538,645491,4.5,15 janvier 2021,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nana kwame adjel brenyah signe avec son ensemb...
68428,1276725,39,Friday Black,Nana Kwame,Adjei-Brenyah,https://m.media-amazon.com/images/I/41MRJp7nKA...,2525945,353118,3.0,14 février 2021,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,si vous suivez un peu les réseaux sociaux ou l...
68429,1276725,39,Friday Black,Nana Kwame,Adjei-Brenyah,https://m.media-amazon.com/images/I/41MRJp7nKA...,2503612,196280,3.0,28 janvier 2021,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,il est des livres que tu ouvres et qui dès les...


In [21]:
df["note"]=df["note"].replace("",np.nan)

In [22]:
df["note"]=df["note"].astype("float")

In [23]:
df["note_p"]=df["note"]/5

In [24]:
# Import des mots-vides de la langue française

from spacy.lang.fr.stop_words import STOP_WORDS as mots_vides

In [26]:
# Ajout de certains mots récurrents aux mots-vides

nlp.Defaults.stop_words.add("d")
nlp.Defaults.stop_words.add("l")
nlp.Defaults.stop_words.add("n")
nlp.Defaults.stop_words.add("livre")
nlp.Defaults.stop_words.add("roman")
nlp.Defaults.stop_words.add("auteur")
nlp.Defaults.stop_words.add("personnage")
nlp.Defaults.stop_words.add("personnages")

In [27]:
# Ajout des noms et prénoms des auteurs et autrices aux mots-vides

for e in df["name"].unique():
    nlp.Defaults.stop_words.add(str.lower(e))

for e in df["surname"].unique():
    nlp.Defaults.stop_words.add(str.lower(e))

In [None]:
# Tokenization des commentaires en prétraitement

tokenized_doc = df['com_process'].fillna('').apply(lambda x: nlp(x))

In [None]:
# Retrait effectif des mots-vides

df["tokenized"] = tokenized_doc.apply(lambda x: [token for token in x if token.text not in mots_vides])

In [None]:
# Finalisation du prétraitement de texte des commentaires

df["com_final"] = df["tokenized"].apply(lambda x: ' '.join(str(v) for v in x))

In [None]:
# Isolement des mots liés aux sentiments répertoriés dans le dictionnaire des émotions

df["filtre_sentiments"] = tokenized_doc.apply(lambda x: [token for token in x if token.text not in mots_vides and token.text in dict_emotions.keys()])


In [None]:
# Identification des sentiments correspondants

df["sentimentest"] = tokenized_doc.apply(lambda x: [dict_emotions[token.text] for token in x if token.text not in mots_vides and token.text in dict_emotions.keys()])


In [None]:
# Finalisation du prétraitement focalisé sur les sentiments

df["sentiments_final"] = df["sentimentest"].apply(lambda x: ' '.join(str(v) for v in x))

In [None]:
df["tokenized_doc"]=tokenized_doc

In [None]:
# Enregistrement du dataframe au format csv pour utilisations futures

df.to_csv(OUT_SAVED_DATAFRAME)

## Création des vecteurs de sentiments

In [None]:
# Import de TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, 
    min_df=2, 
    max_features=1000, 
    #stop_words='english'
)

In [None]:
# Création du dataframe df_vect pour générer les vecteurs de sentiments à partir de leurs tf-idf par livre

liste_emotions=["joie","confiance","surprise","peur","tristesse","colère","honte","réflexion"]

dict_vect = {}

for bookid,title in zip(df["book_id"].unique(),df["title"].unique()):
    tfidf = tfidf_vectorizer.fit_transform(df[df["title"] == title]["sentiments_final"].values.astype("U"))
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
    df_tfidf = pd.DataFrame(tfidf.toarray(), columns=[x for x in tfidf_vectorizer.get_feature_names_out()]).reset_index()
    df_def = df_tfidf.describe().T.iloc[1:,:]["mean"]
    df_to_vec = pd.DataFrame({"book_id":[bookid]*len(liste_emotions),
                              "title":[title]*len(liste_emotions),
                              "emotion":liste_emotions,
                              "valeur":[df_def.loc[em] if em in df_def.index else 0 for em in liste_emotions]
                             })

    df_to_vec["poids"] = df_to_vec["valeur"]/df_to_vec["valeur"].sum()

    dict_vect[title] = df_to_vec

df_vect = pd.concat(dict_vect.values())

df_vect



Unnamed: 0,book_id,title,emotion,valeur,poids
0,1276725,Friday Black,joie,0.129979,0.095229
1,1276725,Friday Black,confiance,0.137345,0.100626
2,1276725,Friday Black,surprise,0.283353,0.207598
3,1276725,Friday Black,peur,0.108714,0.079649
4,1276725,Friday Black,tristesse,0.078123,0.057237
...,...,...,...,...,...
3,23651,"Le poids des secrets, tome 5 : Hotaru",peur,0.057313,0.056781
4,23651,"Le poids des secrets, tome 5 : Hotaru",tristesse,0.165382,0.163847
5,23651,"Le poids des secrets, tome 5 : Hotaru",colère,0.035192,0.034865
6,23651,"Le poids des secrets, tome 5 : Hotaru",honte,0.041037,0.040656


In [None]:
# À partir de df_vect, création du dataframe final des vecteurs de sentiments par livre

dft = pd.pivot_table(df_vect,index=["book_id","title"],columns=["emotion"],values=["poids"])
dft.columns = dft.columns.droplevel(0)
dft=dft.reset_index()
dft.columns = dft.columns[0:2].tolist()+["sen_"+name for name in dft.columns[2:len(dft.columns)]]
dft

Unnamed: 0,book_id,title,sen_colère,sen_confiance,sen_honte,sen_joie,sen_peur,sen_réflexion,sen_surprise,sen_tristesse
0,1526,Amok ou Le fou de Malaisie,0.155309,0.094753,0.072041,0.124737,0.108053,0.100327,0.188378,0.156403
1,1576,"Le Photographe, tome 2",0.091832,0.038548,0.000000,0.212076,0.075268,0.138705,0.173354,0.270216
2,1579,T'embrasser sous la neige,0.035184,0.231846,0.020667,0.279593,0.061139,0.058607,0.098503,0.214462
3,1627,Hiver à Sokcho,0.075806,0.092384,0.000000,0.247943,0.057484,0.074805,0.247009,0.204570
4,1790,Le Mur,0.057197,0.089888,0.000000,0.206190,0.106084,0.255463,0.111012,0.174166
...,...,...,...,...,...,...,...,...,...,...
994,1432929,"Largo Winch, tome 1 : L'héritier",0.052815,0.082936,0.000000,0.535717,0.000000,0.117491,0.112610,0.098431
995,1433127,"Qui ne se plante pas, ne pousse jamais",0.035518,0.263163,0.000000,0.342666,0.022544,0.118882,0.082660,0.134567
996,1433512,Le cas Chakkamuk,0.051469,0.136757,0.000000,0.374032,0.053511,0.117528,0.192078,0.074624
997,1439561,La Honte,0.110169,0.067594,0.388701,0.104287,0.026968,0.125965,0.060917,0.115399


In [None]:
# Chargement dans un fichier json

dft.to_json(OUT_VECTEURS,lines=True,orient="records")

In [None]:
aaa

## Distances Jaccard

In [None]:
group_id_list = [0, 1000, 2000, 3000, 4000, 4566]

csv_files = [f"df_nlp{group_id_list[i]}_{group_id_list[i+1]}.csv" for i in range(len(group_id_list)-1)]


In [None]:
ds1=pd.read_csv(csv_files[0])

ds2=pd.read_csv(csv_files[1])

ds3=pd.read_csv(csv_files[2])

ds4=pd.read_csv(csv_files[3])

ds5=pd.read_csv(csv_files[4])

In [None]:
df_j = pd.concat([ds1,ds2,ds3,ds4,ds5])

In [None]:
# Initiation d'un dataframe vide sous forme de matrice carré 1500 x 1500 pour le présent exemple où 1500 est le nombre de livres

mtx2 = pd.DataFrame(index=df_j["book_id"].unique(),columns=df_j["book_id"].unique())

In [None]:
# Création de la fonction jaccatd_similarity pour la calculer la distance Jaccard entre les commentaires de deux livres donnés

def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) - intersection
    return float(intersection) / union

In [None]:
# Remplissage du dataframe de manière à ce qu'il représente une matrice carrée triangulaire inférieure pour éviter les doublons dans les combinaisons et gagner du temps de calcul

for i,id in enumerate(df_j["book_id"].unique()):

    for idd in df_j["book_id"].unique()[i:len(df_j["book_id"].unique())+1]:

        #v1 = df_vect[df_vect["book_id"]==id]["poids"].tolist()
        #v2 = df_vect[df_vect["book_id"]==idd]["poids"].tolist()

        #simcos = 1 - spatial.distance.cosine(v1, v2)

        w1=" ".join(list(set(" ".join([x for x in df_j[df_j["book_id"]==id]["com_final"].tolist() if isinstance(x, str)]).split(" "))))
        w2=" ".join(list(set(" ".join([x for x in df_j[df_j["book_id"]==idd]["com_final"].tolist() if isinstance(x, str)]).split(" "))))

        jacsim = jaccard_similarity(w1,w2)

        #coef = simcos * jacsim

        mtx2.loc[idd,id]=jacsim

    print(i,"done")

0 done
1 done
2 done
3 done
4 done
5 done
6 done
7 done
8 done
9 done
10 done
11 done
12 done
13 done
14 done
15 done
16 done
17 done
18 done
19 done
20 done
21 done
22 done
23 done
24 done
25 done
26 done
27 done
28 done
29 done
30 done
31 done
32 done
33 done
34 done
35 done
36 done
37 done
38 done
39 done
40 done
41 done
42 done
43 done
44 done
45 done
46 done
47 done
48 done
49 done
50 done
51 done
52 done
53 done
54 done
55 done
56 done
57 done
58 done
59 done
60 done
61 done
62 done
63 done
64 done
65 done
66 done
67 done
68 done
69 done
70 done
71 done
72 done
73 done
74 done
75 done
76 done
77 done
78 done
79 done
80 done
81 done
82 done
83 done
84 done
85 done
86 done
87 done
88 done
89 done
90 done
91 done
92 done
93 done
94 done
95 done
96 done
97 done
98 done
99 done
100 done
101 done
102 done
103 done
104 done
105 done
106 done
107 done
108 done
109 done
110 done
111 done
112 done
113 done
114 done
115 done
116 done
117 done
118 done
119 done
120 done
121 done
122 done
123

In [None]:
# Initiation d'un dataframe vide sous forme de matrice carré 1500 x 1500 pour le présent exemple où 1500 est le nombre de livres

mtx2 = pd.DataFrame(index=df["book_id"].unique(),columns=df_jaccard["book_id"].unique())

In [None]:
# Création de la fonction jaccatd_similarity pour la calculer la distance Jaccard entre les commentaires de deux livres donnés

def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) - intersection
    return float(intersection) / union

In [None]:
# Remplissage du dataframe de manière à ce qu'il représente une matrice carrée triangulaire inférieure pour éviter les doublons dans les combinaisons et gagner du temps de calcul

for i,id in enumerate(df["book_id"].unique()):

    for idd in df["book_id"].unique()[i:len(df["book_id"].unique())+1]:

        #v1 = df_vect[df_vect["book_id"]==id]["poids"].tolist()
        #v2 = df_vect[df_vect["book_id"]==idd]["poids"].tolist()

        #simcos = 1 - spatial.distance.cosine(v1, v2)

        w1=" ".join(list(set(" ".join([x for x in df[df["book_id"]==id]["com_final"].tolist() if isinstance(x, str)]).split(" "))))
        w2=" ".join(list(set(" ".join([x for x in df[df["book_id"]==idd]["com_final"].tolist() if isinstance(x, str)]).split(" "))))

        jacsim = jaccard_similarity(w1,w2)

        #coef = simcos * jacsim

        mtx2.loc[idd,id]=jacsim

    print(i,"done")

0 done
1 done
2 done
3 done
4 done
5 done
6 done
7 done
8 done
9 done
10 done
11 done
12 done
13 done
14 done
15 done
16 done
17 done
18 done
19 done
20 done
21 done
22 done
23 done
24 done
25 done
26 done
27 done
28 done
29 done
30 done
31 done
32 done
33 done
34 done
35 done
36 done
37 done
38 done
39 done
40 done
41 done
42 done
43 done
44 done
45 done
46 done
47 done
48 done
49 done
50 done
51 done
52 done
53 done
54 done
55 done
56 done
57 done
58 done
59 done
60 done
61 done
62 done
63 done
64 done
65 done
66 done
67 done
68 done
69 done
70 done
71 done
72 done
73 done
74 done
75 done
76 done
77 done
78 done
79 done
80 done
81 done
82 done
83 done
84 done
85 done
86 done
87 done
88 done
89 done
90 done
91 done
92 done
93 done
94 done
95 done
96 done
97 done
98 done
99 done
100 done
101 done
102 done
103 done
104 done
105 done
106 done
107 done
108 done
109 done
110 done
111 done
112 done
113 done
114 done
115 done
116 done
117 done
118 done
119 done
120 done
121 done
122 done
123

In [None]:
# Affichage de la matrice

mtx2

Unnamed: 0,1276725,1339387,497362,502868,7147,18360,1299907,1266997,799662,864081,...,1421651,1094410,929078,1302616,38862,585264,842092,561336,23651,36720
1276725,1.0,,,,,,,,,,...,,,,,,,,,,
1339387,0.793651,1.0,,,,,,,,,...,,,,,,,,,,
497362,0.766667,0.813559,1.0,,,,,,,,...,,,,,,,,,,
502868,0.827586,0.877193,0.923077,1.0,,,,,,,...,,,,,,,,,,
7147,0.710145,0.75,0.75,0.806452,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585264,0.819672,0.866667,0.875,0.909091,0.772727,0.910714,0.761905,0.576087,0.894737,0.852459,...,0.626506,0.816667,0.883333,0.896552,0.894737,1.0,,,,
842092,0.774194,0.819672,0.890909,0.925926,0.757576,0.927273,0.774194,0.565217,0.910714,0.866667,...,0.614458,0.830508,0.866667,0.847458,0.813559,0.87931,1.0,,,
561336,0.8,0.847458,0.924528,0.961538,0.78125,0.962264,0.8,0.56044,0.944444,0.864407,...,0.609756,0.859649,0.864407,0.877193,0.842105,0.910714,0.927273,1.0,,
23651,0.770492,0.816667,0.888889,0.924528,0.753846,0.925926,0.770492,0.543478,0.909091,0.833333,...,0.590361,0.827586,0.833333,0.844828,0.842105,0.877193,0.892857,0.925926,1.0,


In [None]:
# Enregistrement de la matrice des distances Jaccard au format csv

mtx2.to_csv(OUT_JACCARD)