Book recommendations via semantic search over each book's title, subtitle, and description



1.   Create Index
2.   Books Embedding
3.   Upload to Index
4.   Search Query Embedding
5.   Query Index



In [1]:
!pip install sentence-transformers
!pip install pinecone-client

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [None]:
import pandas as pd

# Dataset: https://www.kaggle.com/datasets/dylanjcastillo/7k-books-with-metadata
books_csv_path = '/content/books.csv'

df = pd.read_csv(
    books_csv_path,
    sep=',',
    encoding='latin-1',
    on_bad_lines='skip',  # malformed rows are dropped rather than raising
)



## Create Index

In [3]:
from pinecone import Pinecone, ServerlessSpec
from google.colab import userdata

In [4]:
# Pinecone client; the API key is read from the Colab secret named 'pinecone_api_key'.
pc = Pinecone(api_key=userdata.get('pinecone_api_key'))

In [5]:
# Name of the Pinecone index that is created, filled and queried in this notebook.
index_name = 'books-index'

In [6]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the book texts and the search queries.
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Show the model's stages (transformer, pooling, normalisation) and their settings.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [8]:
# all-MiniLM-L6-v2 produces 384-dimensional embeddings; the index dimension must match.
dimensions = 384
n_vectors = len(df)  # 6810 books

# Create the serverless index only when it is missing, so re-running this cell
# does not fail on an index that already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(index_name,
                    dimension=dimensions,
                    metric='cosine',
                    spec=ServerlessSpec(
                      cloud='aws',
                      region='us-east-1'))

## Books Embedding

In [None]:
# Only a cell's last expression is rendered automatically, so the preview
# has to be displayed explicitly; the shape remains the cell's result.
display(df.head(3))
df.shape

(6810, 400)

In [None]:
# Text to embed: title, subtitle and description separated by single spaces
# (missing values become empty strings).
title_part = df['title'].fillna('')
subtitle_part = df['subtitle'].fillna('')
description_part = df['description'].fillna('')
df['text'] = title_part + ' ' + subtitle_part + ' ' + description_part


def _row_metadata(row):
    """Return the row's authors/categories/title as a dict, leaving out missing values."""
    return {field: row[field]
            for field in ['authors', 'categories', 'title']
            if not pd.isna(row[field])}


# Per-book metadata stored alongside each vector.
df['metadata'] = df.apply(_row_metadata, axis=1)


In [None]:
# Encode every book text into an embedding vector (took about 7 minutes).
book_texts = df['text'].tolist()
embedding_vectors = model.encode(book_texts, show_progress_bar=True)

Batches:   0%|          | 0/213 [00:00<?, ?it/s]

In [None]:
# Optional: persist the embeddings so the slow encoding step can be skipped later.
# Uncomment the import plus np.save to write the array, or np.load to restore it.
#import numpy as np
#np.save('embedding_vectors_books.npy', embedding_vectors)
#embedding_vectors = np.load('embedding_vectors_books.npy')

In [None]:
# Store one embedding vector per row.
df['embeddings'] = [vector for vector in embedding_vectors]

In [None]:
# Rename to the column names used for the upload: ISBN-13 becomes the id, the embedding becomes the values.
upload_column_names = {"isbn13": "id", "embeddings": "values"}
df.rename(columns=upload_column_names, inplace=True)

In [None]:
# Keep only the columns that are uploaded; copy so the cast below
# does not write into a slice of df.
df_uploads = df[['id', 'values', 'metadata']].copy()
# astype returns a new Series, so the result must be assigned back for the
# ids to actually become strings.
df_uploads['id'] = df_uploads['id'].astype(str)

## Upload to Vector store

In [10]:
# Handle to the index, used below for the upload and the queries.
index = pc.Index(index_name)

In [None]:
# Upload the id / values / metadata rows of df_uploads to the index.
index.upsert_from_dataframe(df_uploads, show_progress=True)

sending upsert requests:   0%|          | 0/6810 [00:00<?, ?it/s]

{'upserted_count': 6810}

In [11]:
# Inspect the index statistics, e.g. the number of stored vectors.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 6810}},
 'total_vector_count': 6810}

## Search



In [21]:
# Free-text description of the kind of book you are looking for.
query_text = 'Greek mythology with pathos'
# Encode the string itself (not a one-element list) so the query vector is a
# flat list of floats rather than a nested one-row list.
query_embedding = model.encode(query_text).tolist()
# Return the 5 closest books together with their stored metadata.
index.query(vector=query_embedding, top_k=5, include_metadata=True)

{'matches': [{'id': '9780500251218',
              'metadata': {'authors': 'Richard Buxton',
                           'title': 'Greek Mythology'},
              'score': 0.771424353,
              'values': []},
             {'id': '9780226307862',
              'metadata': {'authors': 'David Grene;Richmond Lattimore',
                           'title': 'The Complete Greek Tragedies'},
              'score': 0.561708152,
              'values': []},
             {'id': '9780226307640',
              'metadata': {'authors': 'David Grene;Richmond Lattimore',
                           'title': 'Complete Greek Tragedies'},
              'score': 0.548072338,
              'values': []},
             {'id': '9780517588376',
              'metadata': {'authors': 'William F. Russell',
                           'categories': 'Fiction',
                           'title': 'Classic Myths to Read Aloud: The Great '
                                    'Stories of Greek and Roman Mythology, '


In [18]:
# Free-text description of the kind of book you are looking for.
query_text1 = 'something about love and romance and warm feelings'
# Encode the string itself (not a one-element list) so the query vector is a
# flat list of floats rather than a nested one-row list.
query_embedding = model.encode(query_text1).tolist()
# Return the 5 closest books together with their stored metadata.
index.query(vector=query_embedding,
            top_k=5,
            include_metadata=True)

{'matches': [{'id': '9781400078479',
              'metadata': {'categories': 'Fiction', 'title': 'Love'},
              'score': 0.570192873,
              'values': []},
             {'id': '9780679744474',
              'metadata': {'authors': 'Jeanette Winterson',
                           'categories': 'Fiction',
                           'title': 'Written on the Body'},
              'score': 0.567765653,
              'values': []},
             {'id': '9780156003650',
              'metadata': {'authors': 'Octavio Paz',
                           'categories': 'Literary Collections',
                           'title': 'The Double Flame'},
              'score': 0.561916828,
              'values': []},
             {'id': '9780231109253',
              'metadata': {'authors': 'Emma Donoghue',
                           'categories': 'Poetry',
                           'title': 'Poems Between Women'},
              'score': 0.558703363,
              'values': []},
         

In [20]:
# Free-text description of the kind of book you are looking for.
query_text1 = 'how to prepare for retirement'
# Encode the string itself (not a one-element list) so the query vector is a
# flat list of floats rather than a nested one-row list.
query_embedding = model.encode(query_text1).tolist()
# Return the 5 closest books together with their stored metadata.
index.query(vector=query_embedding,
            top_k=5,
            include_metadata=True)

{'matches': [{'id': '9780887309427',
              'metadata': {'authors': 'Stephen Pollan;Mark Levine',
                           'categories': 'Business & Economics',
                           'title': 'Die Broke'},
              'score': 0.49231711,
              'values': []},
             {'id': '9780195023664',
              'metadata': {'authors': 'David Hackett Fischer',
                           'categories': 'Religion',
                           'title': 'Growing Old in America'},
              'score': 0.45059675,
              'values': []},
             {'id': '9780470038321',
              'metadata': {'authors': 'Eric Tyson',
                           'categories': 'Business & Economics',
                           'title': 'Personal Finance For Dummies'},
              'score': 0.431669533,
              'values': []},
             {'id': '9781932450750',
              'metadata': {'authors': 'Lynnette Khalfani',
                           'categories': 'Business &

In [22]:
# Free-text description of the kind of book you are looking for.
query_text1 = 'software development'
# Encode the string itself (not a one-element list) so the query vector is a
# flat list of floats rather than a nested one-row list.
query_embedding = model.encode(query_text1).tolist()
# Return the 5 closest books together with their stored metadata.
index.query(vector=query_embedding,
            top_k=5,
            include_metadata=True)

{'matches': [{'id': '9781590596241',
              'metadata': {'authors': 'Christopher Duncan',
                           'categories': 'Computers',
                           'title': 'The Career Programmer'},
              'score': 0.512146056,
              'values': []},
             {'id': '9781590593899',
              'metadata': {'authors': 'Avram Joel Spolsky',
                           'categories': 'Computers',
                           'title': 'Joel on Software'},
              'score': 0.480638027,
              'values': []},
             {'id': '9780131492028',
              'metadata': {'authors': 'Walter J. Savitch',
                           'categories': 'Computers',
                           'title': 'Java'},
              'score': 0.437425703,
              'values': []},
             {'id': '9780977616633',
              'metadata': {'authors': 'David Thomas;David Heinemeier '
                                      'Hansson;Leon Breedt',
                    