<a href="https://colab.research.google.com/github/chibuikeeugene/nlp_projects/blob/main/semantic_book_recommender_search_app/research/vector_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders.text import TextLoader
import pandas as pd

In [4]:
# Corrected URL to access raw data
data = pd.read_csv('./books_cleaned.csv')
data.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_subtitle,tagged_desc
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web:A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."


In [5]:
# select a random 1000 rows due to memory constraints
df = data.sample(n=1000, random_state=1)
print(data.shape, df.shape)

(5197, 13) (1000, 13)


In [6]:
# converting the feature column of interests that is - tagged description to a text document
df['tagged_desc'].to_csv('tagged_desc.txt', sep='\n', index=False, header=False)

In [7]:
# loading the text document
raw_documents =  TextLoader('tagged_desc.txt').load()

# splitting the documents into seperate chunks
text_splitter = CharacterTextSplitter(separator='\n',chunk_size = 0, chunk_overlap = 0)

documents = text_splitter.split_documents(raw_documents)

# view a part of the document
documents[0]

Created a chunk of size 235, which is longer than the specified 0
Created a chunk of size 212, which is longer than the specified 0
Created a chunk of size 201, which is longer than the specified 0
Created a chunk of size 230, which is longer than the specified 0
Created a chunk of size 182, which is longer than the specified 0
Created a chunk of size 1280, which is longer than the specified 0
Created a chunk of size 619, which is longer than the specified 0
Created a chunk of size 187, which is longer than the specified 0
Created a chunk of size 809, which is longer than the specified 0
Created a chunk of size 684, which is longer than the specified 0
Created a chunk of size 516, which is longer than the specified 0
Created a chunk of size 177, which is longer than the specified 0
Created a chunk of size 236, which is longer than the specified 0
Created a chunk of size 266, which is longer than the specified 0
Created a chunk of size 483, which is longer than the specified 0
Created a

Document(metadata={'source': 'tagged_desc.txt'}, page_content='9780312852535 A classic science fiction novel features humanoids spreading throughout the galaxy, threatening to stifle all human endeavor, and the hidden group of rebels who try to stem the humanoid tide, if it is not already too late')

In [8]:
# save document into a vector database locally
db_books = Chroma.from_documents(
    documents, 
    collection_name= 'text_collection',
    persist_directory= './text_embed_dir', 
    embedding = OllamaEmbeddings(model="llama3.1"))

In [9]:
# let's test our embedded vector with a query
query =  'a book to teach children about nature'
docs = db_books.similarity_search(query, k =4)
docs

[Document(id='93b9dd7c-8e00-4254-96bc-b6c7e06e8b2c', metadata={'source': 'tagged_desc.txt'}, page_content="9780312151225 Details how to use the ancient Chinese book of divination and discussess how to relate the I Ching's hexagrams to the problems of modern life"),
 Document(id='fc0681b0-a3a4-4e23-b671-0ac0efb403ae', metadata={'source': 'tagged_desc.txt'}, page_content='9780156006248 Living the innocent life of a proofreader, Raimundo Silva changes a key word in a history text that leads him to romance and alters the course of European history'),
 Document(id='c8872b86-dd77-4c19-bb6a-1b251364ff7d', metadata={'source': 'tagged_desc.txt'}, page_content='9780674961876 Argues that contemporary critics force works of literature to fit their theories and examines the impact of Marxism, linguistics, psychoanalysis, and structuralism on literary criticism'),
 Document(id='1504907e-9b21-475e-a50b-1362c952673c', metadata={'source': 'tagged_desc.txt'}, page_content='9780312852535 A classic scienc

In [10]:
type(docs[0])

langchain_core.documents.base.Document

In [11]:
docs[0].page_content

"9780312151225 Details how to use the ancient Chinese book of divination and discussess how to relate the I Ching's hexagrams to the problems of modern life"

In [12]:
# fetching the raw record for the book recommended using its isbn
df[df['isbn13'] == int(docs[0].page_content.split()[0].strip())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_subtitle,tagged_desc
1177,9780312151225,312151225,The Everyday I Ching,Sarah Dening,"Body, Mind & Spirit",http://books.google.com/books/content?id=i5j0Z...,Details how to use the ancient Chinese book of...,1997.0,4.19,224.0,23.0,The Everyday I Ching,9780312151225 Details how to use the ancient C...


In [58]:
# creating a function to fetch recommendations based on semantics

def retrieve_semantic_recommendations(query:str, top_k: int = 10) -> pd.DataFrame:
    """
    fecth recommendations for a given query and return the top k

    args:
    query: str - the query for which we want to search similar items for based on semantics

    top_k: int - the total number of result items to be return ranked based on similarity score


    return type:
    a pandas dataframe object
    """
    results = db_books.similarity_search(query, k=20)

    book_isbn_lists = [] # to hold all books

    for _ in range(0, len(results)):
        book_isbn_lists.append(int(results[_].page_content.split()[0].strip('"')))
    
    df[df['isbn13'].isin(book_isbn_lists)].head(top_k)

    return df[['title', 'authors', 'categories', 'thumbnail', 'description', 'published_year']].head(top_k)

In [None]:
# calling the function
query = 'children and language'
num_of_books =  5

result = retrieve_semantic_recommendations(query=query, top_k=num_of_books)
result

Unnamed: 0,title,authors,categories,thumbnail,description,published_year
1271,The Humanoids,Jack Williamson,Fiction,http://books.google.com/books/content?id=vPSl0...,A classic science fiction novel features human...,1996.0
4338,Chronicle of a Death Foretold,Gabriel Garcia Marquez,Fiction,http://books.google.com/books/content?id=c7g6i...,The Nobel laureate weaves a story of a fantast...,2003.0
2506,The Birth of Tragedy,Friedrich Wilhelm Nietzsche,Philosophy,http://books.google.com/books/content?id=5pzdC...,Explores the origins of Greek tragedy and the ...,1995.0
3672,Mr. Maybe,Jane Green,Fiction,http://books.google.com/books/content?id=ZOfHh...,"Libby Mason, who dreams of marrying a wealthy ...",2002.0
1916,Strangers on a Train,Patricia Highsmith,Fiction,http://books.google.com/books/content?id=qbkfH...,Guy Haines loses his own identity after he is ...,2001.0
