<b> Notebook with code example of using embedding to perform semantic search on a collection of books. What can you improve in each step to make the search more accurate and effecient? </b>

In [6]:
import os
import re
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

<b> First, we load the books, do some simple preprocessing </b>

In [4]:
# Define the directory containing the text files
data_dir = '../fasttext/data_2/'

# Function to load all text files into a dictionary
def load_books(data_dir):
    books = {}
    for filename in os.listdir(data_dir):
        if filename.endswith(".txt"):
            with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
                text = f.read()
                books[filename] = text
    return books

# Load the books
books = load_books(data_dir)
print(f"Number of books: {len(books)}")

# Basic text preprocessing
def preprocess_text(text):
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

# Preprocess each book
processed_books = {filename: preprocess_text(text) for filename, text in books.items()}

# Display the start of only 3 books to verify loading
for i, (book, text) in enumerate(processed_books.items()):
    if i >= 3:  # Only display 3 books
        break
    print(f"Book: {book[:30]}, Content Sample: {text[:200]} \n")

Number of books: 5689
Book: book_1115.txt, Content Sample: 






the first part of king henry the fourth


by william shakespeare



dramatis personae

  king henry the fourth
  henry prince of wales son to the king
  prince john of lancaster son to the king 

Book: book_876.txt, Content Sample: 









life in the ironmills

by rebecca harding davis


               is this the end
               o life as futile then as frail
               what hope of answer or redress


a cloudy day do 

Book: book_2153.txt, Content Sample: additional proof reading by joseph e loewenstein md




mary barton

by elizabeth gaskell




contents

i       a mysterious disappearance
ii      a manchester teaparty
iii     john bartons great trou 



<b> Next, using sentence transformer and numpy, we do the embedding on the books and store the embedding </b>

In [10]:
# Initialize the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to embed each book
def embed_books(books):
    embeddings = {}
    for filename, text in books.items():
        # Generate embedding for the entire book (you can also split into smaller parts if desired)
        embeddings[filename] = model.encode(text)
    return embeddings

# Path to save embeddings
embedding_file = 'book_embeddings.pkl'

# Check if embeddings already exist to avoid recomputing
if os.path.exists(embedding_file):
    # Load embeddings from disk
    with open(embedding_file, 'rb') as f:
        book_embeddings = pickle.load(f)
    print("Loaded embeddings from disk.")
else:
    # Compute embeddings and save them
    book_embeddings = embed_books(processed_books)
    with open(embedding_file, 'wb') as f:
        pickle.dump(book_embeddings, f)
    print("Computed and saved embeddings.")

# Example: Show embedding shape for one book
for book, embedding in list(book_embeddings.items())[:3]:  # Display for the first 3 books
    print(f"Book: {book}, Embedding shape: {embedding.shape}")



Loaded embeddings from disk.
Book: book_1115.txt, Embedding shape: (384,)
Book: book_876.txt, Embedding shape: (384,)
Book: book_2153.txt, Embedding shape: (384,)


<b> Next we take the user input, embed it in the same vector space and calcualte the cosine similarity to the nearest books </b>

In [9]:
# Function to find the top 5 most relevant books based on a user query
def find_top_n_books(query, book_embeddings, n=5):
    # Embed the query
    query_embedding = model.encode(query)

    # Compute cosine similarity between query and each book embedding
    similarities = {}
    for book, embedding in book_embeddings.items():
        # Compute cosine similarity
        similarity = cosine_similarity([query_embedding], [embedding])[0][0]
        similarities[book] = similarity

    # Sort books by similarity score in descending order and get the top n
    sorted_books = sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:n]

    return sorted_books

# Function to print the first 20 lines of a book with a visual barrier
def print_top_books(book_list, processed_books):
    for i, (book, score) in enumerate(book_list):
        # Get the first 20 lines of the book
        book_content = processed_books[book]
        first_20_lines = '\n'.join(book_content.splitlines()[:40])
        
        # Print the book details and the first 20 lines
        print(f"Book {i+1}: {book}")
        print(f"Similarity score: {score:.4f}")
        print("-" * 40)  # Visual separator
        print(f"{first_20_lines}")
        print("=" * 40)  # End of one book, separator for next

# Allow the user to input a query
user_query = input("Enter your search query: ")

# Find the top 5 most relevant books
top_5_books = find_top_n_books(user_query, book_embeddings, n=5)

# Print the top 5 books with the first 20 lines of each
print_top_books(top_5_books, processed_books)


Enter your search query:  I am looking for a book about switzerland in general


Book 1: book_3836.txt
Similarity score: 0.5093
----------------------------------------










swiss family robinson by johann david wyss


this edition c
by
pink tree press
po box 
salt lake city utah 
isbn 





portions of this header are copyright c  by michael s hart
and may be reprinted only when these etexts are free of all fees
project gutenberg is a trademark and may not be used in any sales
of project gutenberg etexts or other materials be they hardware or
software or any other related product without express permission





this gutenberg edition of the swiss family robinson is a gift
from the editors cut imprint of pink tree press
wwwpinktreepresscom all editors cuttm editions are
free except for handling charges necessary to provide
the book in your preferred format

Book 2: book_1367.txt
Similarity score: 0.4854
----------------------------------------





findelkind

by louise de la ramee aka ouida


works of louisa de la ramee ouida

     findelkind
     muriella
  