In [1]:
import pandas as pd
import numpy as np

In [32]:
# A very useful guide for this project: 
# https://www.kdnuggets.com/building-a-recommendation-system-with-hugging-face-transformers

# Future goal: build a Streamlit web app that shows each recommended book's thumbnail next to its title

In [3]:
# Read the book table from ./data/cleaned_df.csv.
df = pd.read_csv(filepath_or_buffer='./data/cleaned_df.csv')

In [4]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002261982,0002261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
1,9780006380832,0006380832,Empires of the Monsoon,A History of the Indian Ocean and Its Invaders,Richard Hall,"Africa, East",http://books.google.com/books/content?id=MuPEQ...,Until Vasco da Gama discovered the sea-route t...,1998.0,4.41,608.0,65.0
2,9780006470229,000647022X,The Gap Into Madness,Chaos and Order,Stephen R. Donaldson,"Hyland, Morn (Fictitious character)",http://books.google.com/books/content?id=4oXav...,A new-cover reissue of the fourth book in the ...,1994.0,4.15,743.0,103.0
3,9780006499626,0006499627,Miss Marple,The Complete Short Stories,Agatha Christie,"Detective and mystery stories, English",http://books.google.com/books/content?id=a96qP...,"Miss Marple featured in 20 short stories, publ...",1997.0,4.2,359.0,6235.0
4,9780006551812,0006551815,'Tis,A Memoir,Frank McCourt,Ireland,http://books.google.com/books/content?id=Q3BhQ...,FROM THE PULIZER PRIZE-WINNING AUTHOR OF THE #...,2000.0,3.68,495.0,44179.0


In [5]:
# Text columns kept for the recommender; every other column of df is left out.
features = ['title', 'subtitle', 'categories', 'description', 'authors']
features_df = df.loc[:, features]

In [6]:
# Preview the first five rows of the selected text columns.
features_df.head(n=5)

Unnamed: 0,title,subtitle,categories,description,authors
0,Spider's Web,A Novel,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,Charles Osborne;Agatha Christie
1,Empires of the Monsoon,A History of the Indian Ocean and Its Invaders,"Africa, East",Until Vasco da Gama discovered the sea-route t...,Richard Hall
2,The Gap Into Madness,Chaos and Order,"Hyland, Morn (Fictitious character)",A new-cover reissue of the fourth book in the ...,Stephen R. Donaldson
3,Miss Marple,The Complete Short Stories,"Detective and mystery stories, English","Miss Marple featured in 20 short stories, publ...",Agatha Christie
4,'Tis,A Memoir,Ireland,FROM THE PULIZER PRIZE-WINNING AUTHOR OF THE #...,Frank McCourt


In [7]:
# 80% of the number of rows in features_df, truncated to an int.
training = int(len(features_df) * 0.8)

In [8]:
# Number of rows in features_df.
total = features_df.shape[0]

In [9]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

In [28]:
# Using https://www.kdnuggets.com/building-a-recommendation-system-with-hugging-face-transformers as a guide here

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (assumed to be laid out as batch x tokens x hidden).
        attention_mask: Mask with one entry per token; it is broadcast to the
            shape of the token embeddings and used as a 0/1 weight.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total, with
        the divisor clamped to at least 1e-9 so a fully masked row does not
        divide by zero.
    """
    hidden_states = model_output[0]
    # Stretch the mask across the hidden dimension so it can weight every value.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_counts

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


def get_embeddings(sentences, *, tokenizer=tokenizer, model=model):
    """Encode sentences into L2-normalised, mean-pooled embeddings.

    The tokenizer and model are bound as keyword-only defaults when this
    function is defined. The notebook later rebinds the global name ``model``
    to a different kind of object; without this binding, re-running a cell
    that calls ``get_embeddings`` after that point would use the wrong model.

    Args:
        sentences: Text input passed straight to the tokenizer (the notebook
            passes a list of strings).
        tokenizer: Tokenizer to use; defaults to the one loaded just above.
        model: Model to use; defaults to the one loaded just above.

    Returns:
        A tensor with one embedding per input, normalised to unit L2 norm
        along dimension 1.
    """
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Collapse the per-token embeddings into one vector per sentence.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return F.normalize(sentence_embeddings, p=2, dim=1)

In [30]:
# Demo of get_embeddings: not required for the final model, only here to show what the encoding step produces.
sentences = ['I like dogs', 'I want a funny book']
result = get_embeddings(sentences)
print('sentence embeddings: ', result, sep='\n')

sentence embeddings: 
tensor([[-5.8488e-02, -2.7953e-02,  6.8848e-02,  2.8501e-02, -6.7969e-02,
         -2.2579e-03,  7.2192e-02, -5.4668e-03,  1.0457e-01,  5.8393e-02,
          7.2705e-02, -6.5134e-02,  9.1378e-03,  2.3148e-02,  2.5676e-02,
          2.2404e-02, -1.9384e-02,  8.2517e-03, -3.3385e-02, -1.2674e-02,
         -1.7506e-01,  3.8567e-02,  1.6474e-02, -9.6142e-03, -1.2623e-01,
          4.4970e-02,  1.0760e-02, -7.9295e-02,  2.1948e-02, -2.5054e-03,
         -4.7563e-02,  2.0120e-02, -4.4740e-02,  1.9109e-02, -2.9493e-02,
         -4.5663e-03,  2.1108e-02, -5.0625e-03,  4.0325e-02,  3.6174e-02,
          7.4400e-03, -2.6705e-02,  7.7776e-02, -3.7348e-02, -7.9166e-02,
         -2.6028e-02, -6.1670e-02, -7.8588e-02,  8.8175e-02,  5.0165e-02,
          1.1242e-01,  5.4163e-02, -2.8548e-02,  1.5299e-03, -1.0703e-03,
         -3.3236e-02, -6.5248e-02,  6.4316e-02,  1.6829e-02, -8.3196e-02,
          6.3724e-02,  7.8348e-02,  3.5520e-02, -1.2284e-03,  5.2314e-02,
          1.5692

In [31]:
# documentation for the model used in the final book recommendation system: 
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2?language=python

from sentence_transformers import SentenceTransformer
# From here on `model` refers to the SentenceTransformer used for the recommender.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# One set of embeddings per text column, so a query can be matched against any of them.
# The generator is consumed immediately by the unpacking, in the order listed.
(
    title_embeddings,
    subtitle_embeddings,
    authors_embeddings,
    description_embeddings,
    categories_embeddings,
) = (
    model.encode(features_df[column].tolist())
    for column in ('title', 'subtitle', 'authors', 'description', 'categories')
)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
def get_recommendations(query, embeddings, df, top_n=5):
    """Return the rows of ``df`` whose embeddings are most similar to ``query``.

    Args:
        query: Free-text search string; encoded with the notebook-level ``model``.
        embeddings: Precomputed embeddings to compare the query against. The
            index of each embedding is used positionally, i.e. embedding ``i``
            maps to ``df.iloc[i]``.
        df: DataFrame the results are taken from.
        top_n: Maximum number of rows to return.

    Returns:
        Up to ``top_n`` rows of ``df``, ordered from highest to lowest cosine
        similarity. An empty selection of ``df`` when ``top_n`` is zero or
        negative.
    """
    # With top_n == 0 the slice ``[-0:]`` below is ``[0:]`` and would return
    # every row instead of none, so handle non-positive values up front.
    if top_n <= 0:
        return df.iloc[:0]

    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings)
    # argsort is ascending: keep the last top_n positions, then reverse them.
    top_indices = similarities[0].argsort()[-top_n:][::-1]
    return df.iloc[top_indices]

In [21]:
# Match the query against the title embeddings.
query = "Book about dragons"
recommendations = get_recommendations(query=query, embeddings=title_embeddings, df=features_df)
print(recommendations.loc[:, ['title', 'description']])

                       title  \
594      The Dragons of Eden   
162        Realms of Dragons   
189         American Dragons   
1413             Dragonology   
916   The Eyes of the Dragon   

                                            description  
594   The well-known astronomer and astrobiologist s...  
162   In the tradition of "The Wheel of Time, " this...  
189   Includes short stories, poems, and excerpts fr...  
1413  Presents an introduction to dragonology that i...  
916   After Flagg, the evil court magician, sees a m...  


In [20]:
# Match the query against the book-description embeddings.
query = "Adventure story"
recommendations = get_recommendations(query=query, embeddings=description_embeddings, df=features_df)
print(recommendations.loc[:, ['title', 'description', 'authors']])

                                                  title  \
1820  There's a (slight) Chance I Might be Going to ...   
733                                         Toys Go Out   
429                                 A Walk in the Woods   
5                              A Small Pinch of Weather   
248                                      Descent of Man   

                                            description               authors  
1820  The first novel from the "New York Times" best...         Laurie Notaro  
733   Six stories relate the adventures of three bes...         Emily Jenkins  
429   Traces the author's adventurous trek along the...           Bill Bryson  
5     A magical and fantastic collection of early st...            Joan Aiken  
248   A mad, hilarious collection of short stories, ...  T. Coraghessan Boyle  


In [18]:
# Full description text for the row at position 1820.
df['description'].iloc[1820]

'The first novel from the "New York Times" bestselling author of "The Idiot Girls\' Action-Adventure Club," this is a rollicking tale of small-town peculiarity, dark secrets, and one extraordinary beauty pageant.'

In [24]:
# Match the query against the book-description embeddings.
query = "I like stories about bunnies"
recommendations = get_recommendations(query=query, embeddings=description_embeddings, df=features_df)
print(recommendations.loc[:, ['title', 'description', 'authors']])

                                         title  \
500                James Herriot's Dog Stories   
751                         Shakespeare A to Z   
1863                         The Guy Not Taken   
531                           Jack of Kinrowan   
1817  The Secret Society of Demolition Writers   

                                            description  \
500   Complete collection of 50 dog stories by James...   
751   Brer Rabbit will never learn! He loves to play...   
1863  From the bestselling author of "Good in Bed" c...   
531   The two stories of Jacky Rowan are combined in...   
1817  Short stories capture the lives of such offbea...   

                              authors  
500                     James Herriot  
751   Charles Boyce;David Allen White  
1863                  Jennifer Weiner  
531                   Charles de Lint  
1817         Aimee Bender;Marc Parent  


In [25]:
# Full description text for the row at position 1817.
df.iloc[1817]['description']

'Short stories capture the lives of such offbeat characters as a delusional schizophrenic, an egg donor with second thoughts, and a young girl who discovers a portal to another, ghostly world.'