In [1]:
import gzip
import json
import re
import os
import sys
import pickle
import spacy
import matplotlib.pyplot as plt
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from itertools import chain

plt.style.use('ggplot')
plt.rcParams['font.family'] = 'sans-serif' 
plt.rcParams['font.serif'] = 'Ubuntu' 
plt.rcParams['font.monospace'] = 'Ubuntu Mono' 
plt.rcParams['font.size'] = 14 
plt.rcParams['axes.labelsize'] = 12 
plt.rcParams['axes.labelweight'] = 'bold' 
plt.rcParams['axes.titlesize'] = 12 
plt.rcParams['xtick.labelsize'] = 12 
plt.rcParams['ytick.labelsize'] = 12 
plt.rcParams['legend.fontsize'] = 12 
plt.rcParams['figure.titlesize'] = 12 
plt.rcParams['image.cmap'] = 'jet' 
plt.rcParams['image.interpolation'] = 'none' 
plt.rcParams['figure.figsize'] = (12, 10) 
plt.rcParams['axes.grid']=True
plt.rcParams['lines.linewidth'] = 2 
plt.rcParams['lines.markersize'] = 8
colors = ['xkcd:pale orange', 'xkcd:sea blue', 'xkcd:pale red', 'xkcd:sage green', 'xkcd:terra cotta', 'xkcd:dull purple', 'xkcd:teal', 'xkcd: goldenrod', 'xkcd:cadet blue',
'xkcd:scarlet']


## Loading the data

In [2]:
data = pd.read_pickle('5p_books.pickle')
data = data.sample(frac = 0.25, random_state=57, ignore_index=True)

In [3]:
data= data.loc[0:1000]

In [5]:
def text_process(text:str, nlp, lemmatize = False):
    """
    Use spacy lemmatizer to tokenize or lemmatize text. Remove stopwords, punctuation
    Input: text (string)
    Output: list of tokens """
    if lemmatize:
        return ' '.join([token.lemma_ for token in nlp(text.strip(), disable=["parser", "ner"]) if not token.is_stop and not token.is_punct ])
    else:
        
        text = ' '.join([token.text for token in nlp(text.strip()) 
                     if not token.like_email and not token.like_url and not token.is_space ])

        return text

def genre_extractor(d):
    """Extract keys from dict into a flatten list"""
    keys = [key.split(',') for key in d.keys()]
    keys = list(chain(*keys))
    keys = [key.strip() for key in keys]
    return keys



def unpack_list(l):
    """Unpack elements of a list and join them into a string"""
    return ', '.join([item for item in l])

# print(genre_extractor(df.genres.iloc[0]))
# print(unpack_list(df.series.iloc[0]))

In [6]:
# drop empty rows
data = data.dropna(how = 'all')
#Get list of genres
data['genres_key'] = data['genres'].apply(genre_extractor)
#Lemmatize, remove stopwords, punctuation
data['Clean_description'] = data.description#.apply(text_process, nlp = nlp, lemmatize = False)
# unpacking lists
data['genres_key'] = data.genres_key.apply(unpack_list)
data['series'] = data.series.apply(unpack_list)
data['similar_books'] = data.similar_books.apply(unpack_list)
data['authors']= data.authors.apply(unpack_list)


In [7]:
#Load the sentence transformer model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [8]:
columns =['Clean_description', 'authors','title_without_series', 'publisher','series', 'genres_key', 'is_ebook', 'similar_books', 'format', 'num_pages', 'publication_year']
data_ = data[['Clean_description', 'authors','title_without_series', 'publisher','series', 'genres_key', 'is_ebook', 'similar_books', 'format', 'num_pages', 'publication_year']]
for col in columns :
    print(type(data[col].iloc[0]))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [29]:
n = len(columns)
d = data.shape[0]
sims = np.zeros([n, d, d])

In [35]:
count = 0
for col in  columns:
      
    
    print(col)
#     if X is not str:
#         print('not str')
#         X = data[col].apply(lambda x : x[0] if len(X)>0 else '')
#         # for word in X : 
#         #     X2 = X2 + word[0]  
    #Convert description column in data_ DataFrame to numpy array
    X = np.array(data[col])
    # Convert the data to string type
    X = X.astype(str)
    # Get the embeddings for the text data
    text_data = X
    embeddings = model.encode(text_data, show_progress_bar=True)
#     if count == 1 :
#         X_st = embeddings
#     else:
#         X_st = np.column_stack((X_st, embeddings))
    sims[count, :, :] = cosine_similarity(embeddings)
    count += 1  
#     print(X_st.shape)

Clean_description


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

authors


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

title_without_series


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

publisher


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

series


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

genres_key


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

is_ebook


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

similar_books


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

format


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

num_pages


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

publication_year


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [57]:
coefficients = {
    'Clean_description':5,
    'authors':1,
    'title_without_series':1,
    'publisher':1,
    'series':1,
    'genres_key':0,
    'is_ebook':0.5,
    'similar_books':0.1,
    'format':0.3,
    'num_pages':1,
    'publication_year':1,
}
normalization = 0
for k, v in coefficients.items():
    normalization += v
print(normalization)
    

11.9


In [58]:
sims_total = np.zeros([d,d])
for i , col in zip(range(n), columns):
    sims_total += sims[i, : , :]*coefficients[col]

In [59]:
sims_total/normalization

array([[0.99999998, 0.49303222, 0.64209377, ..., 0.57587751, 0.53909279,
        0.57535585],
       [0.49303222, 0.99999986, 0.5470101 , ..., 0.46407578, 0.41969817,
        0.52544527],
       [0.64209377, 0.5470101 , 0.99999983, ..., 0.63169989, 0.5779965 ,
        0.61852838],
       ...,
       [0.57587751, 0.46407578, 0.63169989, ..., 0.99999999, 0.57883696,
        0.67467715],
       [0.53909279, 0.41969817, 0.5779965 , ..., 0.57883696, 1.00000011,
        0.58645966],
       [0.57535585, 0.52544527, 0.61852838, ..., 0.67467715, 0.58645966,
        0.99999993]])

In [60]:
cos_sim_data.loc[0]

0       1.000000
1       0.453980
2       0.712427
3      -0.008874
4       0.626640
          ...   
996     0.683792
997     0.630460
998     0.691473
999     0.585552
1000    0.655160
Name: 0, Length: 1001, dtype: float64

In [61]:
cos_sim_data = pd.DataFrame(sims_total/normalization)
def give_recommendations(index, features=[]):
    # Get the 5 most similar book indexes to the input index
    index_recomm = cos_sim_data.loc[index].sort_values(ascending=False).index.tolist()[1:6]
    # Get the titles of the 5 most similar books
    books_recomm = data['title_without_series'].loc[index_recomm].values
    # Store the titles and indexes of the recommended books in a dictionary
    result = {'Books': books_recomm, 'Index': index_recomm}

    # If 'recommendation' is in the list of features, print the read book and the recommended books
    if 'recommendation' in features:
        print('The read book is this one: %s \n' % (data['title_without_series'].loc[index]))
        for i, book in enumerate(books_recomm):
            print(f'The number {i + 1} recommended book is this one: {book} \n')

    # Loop over the features list and print the value of each feature for the read book and the recommended books
    if 'Clean_description' in features:
        print('The plot of the read book is this one:\n %s \n' % (data['Clean_description'].loc[index]))
        for i in range(len(books_recomm)):
            plot_q = data['Clean_description'].loc[index_recomm[i]]
            print(f'The plot of the number {i + 1} recommended book is this one:\n {plot_q} \n')

    # Loop over the features list and print the value of each feature for the read book and the recommended books
    for feature in ['authors', 'publisher', 'series', 'is_ebook', 'similar_books', 'genres_key', 'format', 'num_pages', 'publication_year']:
        if feature in features:
            print(f'The {feature} of the read book is this one:\n {data[feature].loc[index]} \n')
            for i in range(len(books_recomm)):
                print(f'The {feature} of the number {i + 1} recommended book is this one:\n {data[feature].loc[index_recomm[i]]} \n')
                
    # Return the result dictionary
    return result

In [62]:
recomm_list = []
# Loop through all items in X 
for i in range(data.shape[0]):
  # Get the recommendations for each item by calling give_recommendations() function
  recomm_i = give_recommendations(i)
  # Store the recommended books for each item in the list "recomm_list"
  recomm_list.append(recomm_i['Books'])
recomm_data = pd.DataFrame(recomm_list,columns=['First Recommendation','Second Recommendation','Third Recommendation','Fourth Recommendation','Fifth Recommendation'])
# Create a dataframe "recomm_data" from "recomm_list" with columns for each recommended book and "Read Book"
recomm_data['Read Book'] = data['title_without_series']
# Reorder the columns to show "Read Book" first, followed by the 5 recommended books.
recomm_data = recomm_data[['Read Book','First Recommendation','Second Recommendation','Third Recommendation','Fourth Recommendation','Fifth Recommendation']]

In [63]:
# Shuffle the data and return the first 5 rows as a sample
recomm_data.sample(frac=1).head()

Unnamed: 0,Read Book,First Recommendation,Second Recommendation,Third Recommendation,Fourth Recommendation,Fifth Recommendation
210,The Gomorrah Principle,Sleeper Cell,Spinner of Yarns,Journal of a Cavalry Bugler,Steel Hearts,This Is Life
523,"The Alleyman (No Man's World, #3)",Out of Nowhere,Wrath of the Lemming Men (Chronicles of Isamba...,"Shockwave (Urban Outlaws, #5)","Pubs, Pulpits & Prairie Fires",Clawed!: A Choose Your Own Ending Horror Adven...
518,A Corkscrew is Most Useful: The Travellers of ...,Contemporary Marketing 2011,How to Succeed in the Game of Life: 34 Intervi...,"Pubs, Pulpits & Prairie Fires",Alone Beneath the Heaven: A gripping saga of e...,"Lee de Forest: King of Radio, Television, and ..."
470,The Snow Queen (Tales of the Five Hundred King...,The Secret Life of Ancient Bristlecone Pines,"Enchantress, Sorceress, Madwoman: The True Sto...",THE PROMISE - THE ISLAND OF COMMITMENT (HUGO T...,Lady Bird: A Biography of Mrs. Johnson,Christmas Stranger
660,"Foretold (Daughters Of Saraqael, #3)",Getting It Right This Time,"Froi of the Exiles (Lumatere Chronicles, #2)",Spring Comes to Sanctuary (Welcome to Sanctuar...,Merle und die Fließende Königin (Merle-Trilogi...,"Motherlines (Holdfast Chronicles, #2)"
