Book recommendations via semantic search based on book title + book description



1.   Create Index
2.   Book Embeddings
3.   Upload to Index
4.   Search Query Embedding
5.   Query Index



In [None]:
!pip install sentence-transformers
!pip install pinecone-client

In [None]:
import pandas as pd
# books file from https://www.kaggle.com/datasets/dylanjcastillo/7k-books-with-metadata

df = pd.read_csv('/content/books.csv',
                   encoding='latin-1',
                   sep=',',
                   on_bad_lines='skip')



## Create Index

In [3]:
from pinecone import Pinecone, ServerlessSpec
from google.colab import userdata

In [4]:
pc = Pinecone(api_key=userdata.get('pinecone_api_key'))

In [5]:
index_name = 'books-index'

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [8]:
dimensions = 384 # Huggingface Sentence Transformers translates text to 384 dimensions
n_vectors = len(df) #6810
pc.create_index(index_name,
                dimension=dimensions,
                metric='cosine',
                spec=ServerlessSpec(
                  cloud='aws',
                  region='us-east-1'))

## Book Embeddings

In [None]:
df.head(3)
df.shape

(6810, 400)

In [None]:
# prepare vectors by concatenating title + substitle + description
df['text'] = df['title'].fillna('') + ' ' +  df['subtitle'].fillna('') + ' ' + df['description'].fillna('')
df['metadata'] = df.apply(lambda row: {key: row[key] for key in ['authors', 'categories', 'title'] if not pd.isna(row[key])}, axis=1)


In [None]:
# embedding
embedding_vectors = model.encode(df['text'].tolist(), show_progress_bar=True) #took 7 minutes
# persist for later use
#import numpy as np
#np.save('embedding_vectors_books.npy', embedding_vectors)
#embedding_vectors = np.load('embedding_vectors_books.npy')

In [None]:
df['embeddings'] = list(embedding_vectors)

In [None]:
df.rename(columns = {"isbn13": "id", "embeddings": "values"},  inplace=True)

In [None]:
df_uploads = df[['id', 'values', 'metadata']]
df_uploads['id'].astype(str)

## Upload to Vector store

In [10]:
index = pc.Index(index_name)

In [None]:
index.upsert_from_dataframe(df_uploads, show_progress=True)

In [11]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 6810}},
 'total_vector_count': 6810}

## Search



In [25]:
# Type what you are searching for
query_text = 'Greek mythology for kids'
query_embedding =  model.encode([query_text]).tolist()
index.query(vector=query_embedding, top_k=5, include_metadata=True)

{'matches': [{'id': '9780500251218',
              'metadata': {'authors': 'Richard Buxton',
                           'title': 'Greek Mythology'},
              'score': 0.805668414,
              'values': []},
             {'id': '9780517588376',
              'metadata': {'authors': 'William F. Russell',
                           'categories': 'Fiction',
                           'title': 'Classic Myths to Read Aloud: The Great '
                                    'Stories of Greek and Roman Mythology, '
                                    'Specially Arranged for Children Five and '
                                    'Up by an Educational Expert'},
              'score': 0.712172031,
              'values': []},
             {'id': '9780689868832',
              'metadata': {'authors': 'Padraic Colum',
                           'categories': 'Juvenile Fiction',
                           'title': "The Children's Homer"},
              'score': 0.642288387,
              'valu

In [32]:
# Type what you are searching for
query_text = 'something about love and romance'
query_embedding =  model.encode([query_text]).tolist()
index.query(vector=query_embedding,
            top_k=3,
            include_metadata=True,
            filter={'categories': 'Poetry'})

{'matches': [{'id': '9780231109253',
              'metadata': {'authors': 'Emma Donoghue',
                           'categories': 'Poetry',
                           'title': 'Poems Between Women'},
              'score': 0.56277597,
              'values': []},
             {'id': '9780786881482',
              'metadata': {'authors': 'Pablo Neruda',
                           'categories': 'Poetry',
                           'title': 'Love'},
              'score': 0.410627633,
              'values': []},
             {'id': '9780292760288',
              'metadata': {'authors': 'Pablo Neruda',
                           'categories': 'Poetry',
                           'title': '100 Love Sonnets'},
              'score': 0.394990772,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [35]:
# Type what you are searching for
query_text1 = 'how to learn painting'
query_embedding =  model.encode([query_text1]).tolist()
index.query(vector=query_embedding,
            top_k=3,
            include_metadata=True)

{'matches': [{'id': '9780395530078',
              'metadata': {'authors': 'Kimon NicolaÃ¯des',
                           'categories': 'Art',
                           'title': 'The Natural Way to Draw'},
              'score': 0.551862061,
              'values': []},
             {'id': '9780823006571',
              'metadata': {'authors': 'Juliette Aristides',
                           'categories': 'Art',
                           'title': 'The Classical Drawing Atelier'},
              'score': 0.541911244,
              'values': []},
             {'id': '9780140446425',
              'metadata': {'authors': 'Leo Tolstoy;Richard Pevear;Larissa '
                                      'Volokhonsky',
                           'categories': 'Art',
                           'title': 'What is Art?'},
              'score': 0.534654677,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [38]:
# Type what you are searching for
query_text1 = 'important software engineering principles'
query_embedding =  model.encode([query_text1]).tolist()
index.query(vector=query_embedding,
            top_k=2,
            include_metadata=True)

{'matches': [{'id': '9781590593899',
              'metadata': {'authors': 'Avram Joel Spolsky',
                           'categories': 'Computers',
                           'title': 'Joel on Software'},
              'score': 0.437090784,
              'values': []},
             {'id': '9780132370455',
              'metadata': {'authors': 'Frank M. Carrano',
                           'categories': 'Computers',
                           'title': 'Data Structures and Abstractions with '
                                    'Java'},
              'score': 0.414372027,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}