In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [4]:
from dotenv import load_dotenv
load_dotenv() # To import OpenAI API keys

True

In [5]:
import pandas as pd
import numpy as np

In [6]:
books_df = pd.read_csv("../data/cleaned_books.csv")

In [48]:
books_df[books_df["isbn10"]=="006757520X"]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_join_subtitle
442,9780067575208,006757520X,The Sense of Wonder,Rachel Carson,Nature,http://books.google.com/books/content?id=Zee5S...,"First published more than three decades ago, t...",1998.0,4.39,112.0,1160.0,The Sense of Wonder


TextLoader cannot deal with pandas dataframes so we will augment the isbn10 and description columns and then save that series as a text file separated by the \n delimiter and then load it using TextLoader

In [8]:
series_to_save = books_df[["isbn10", "description"]].astype(str).agg(" ".join, axis=1)

In [15]:
series_to_save.to_csv("../data/isbn_description.txt", header=False, index=False, lineterminator="\n")

In [16]:
raw_docs = TextLoader("../data/isbn_description.txt").load()

Now we need to split the documents

We use the CharacterTextSplitter with the chunk size set to 0 to prioritize splitting on the delimiter instead of prioritizing the chunk size

In [17]:
text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
chunks = text_splitter.split_documents(raw_docs)

Created a chunk of size 1167, which is longer than the specified 0
Created a chunk of size 1213, which is longer than the specified 0
Created a chunk of size 372, which is longer than the specified 0
Created a chunk of size 308, which is longer than the specified 0
Created a chunk of size 480, which is longer than the specified 0
Created a chunk of size 481, which is longer than the specified 0
Created a chunk of size 959, which is longer than the specified 0
Created a chunk of size 185, which is longer than the specified 0
Created a chunk of size 842, which is longer than the specified 0
Created a chunk of size 293, which is longer than the specified 0
Created a chunk of size 194, which is longer than the specified 0
Created a chunk of size 878, which is longer than the specified 0
Created a chunk of size 1087, which is longer than the specified 0
Created a chunk of size 1188, which is longer than the specified 0
Created a chunk of size 303, which is longer than the specified 0
Create

Checking to see a chunk

In [18]:
chunks[0]

Document(metadata={'source': '../data/isbn_description.txt'}, page_content='"0002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details

In [20]:
chroma_db_books = Chroma.from_documents(chunks, embedding=OpenAIEmbeddings())

Now we check to see whether querying actually works or not

Chroma handles embedding the query with the same embedding model as used previosuly in constructing the database and then performs a cosine similarity search returning the top-k matches

In [21]:
query = "A book to teach children about nature"
relevant_results = chroma_db_books.similarity_search_with_score(query, k=10)

In [23]:
relevant_results

[(Document(id='73c1e319-41ac-45df-9afb-8d32a4dc982a', metadata={'source': '../data/isbn_description.txt'}, page_content='"0786808063 Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience."'),
  0.25380706787109375),
 (Document(id='af3805af-ec2d-4aa3-8721-f53806e77079', metadata={'source': '../data/isbn_description.txt'}, page_content='"078680839X Introduce your baby to birds, cats, dogs, and babies through fine art, illustration, and photographs. These books are a rare opportunity to exopse little ones to a range of images on a single subject, from simple child\'s drawings and abstract art to playful photos. A brief text accompanies each image, introducing baby to some basic -- and sometimes playful -- information about the subjects."'),
  0.2838596701622009),
 (Document(id='8f6da

Now we want to extract the isbn10 from the relevant docs and query the original books dataframe using the isbn10 values

In [49]:
isbns = [result[0].page_content.split()[0][1:].strip() for result in relevant_results] # Extract the ISBN
books_df[books_df["isbn10"].isin(isbns)]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_join_subtitle
442,9780067575208,006757520X,The Sense of Wonder,Rachel Carson,Nature,http://books.google.com/books/content?id=Zee5S...,"First published more than three decades ago, t...",1998.0,4.39,112.0,1160.0,The Sense of Wonder
3214,9780689861130,0689861133,"Moo, Baa, la la La!",Sandra Boynton,Animal sounds,http://books.google.com/books/content?id=Gz40A...,Children will love joining in and imitating th...,2004.0,4.2,14.0,28261.0,"Moo, Baa, la la La!"
3581,9780763620875,0763620874,Judy Moody Saves the World!,Megan McDonald,Juvenile Fiction,http://books.google.com/books/content?id=xDIRB...,When Judy Moody gets serious about protecting ...,2004.0,4.03,160.0,5883.0,Judy Moody Saves the World!
3747,9780786808069,0786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals
3748,9780786808373,0786808373,Baby Einstein: Birds,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=0jxHP...,"Introducing your baby to birds, cats, dogs, an...",2002.0,3.78,20.0,9.0,Baby Einstein: Birds
3749,9780786808380,0786808381,Baby Einstein: Babies,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=jv4NA...,"Introduce your babies to birds, cats, dogs, an...",2002.0,4.03,20.0,29.0,Baby Einstein: Babies
3750,9780786808397,078680839X,Baby Einstein: Dogs,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=qut8t...,"Introduce your baby to birds, cats, dogs, and ...",2002.0,3.81,20.0,26.0,Baby Einstein: Dogs
3751,9780786808717,0786808713,Baby Einstein: What Does Violet See? Raindrops...,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=95IIA...,A very special puddle sets Violet the mouse of...,2002.0,3.25,18.0,16.0,Baby Einstein: What Does Violet See? Raindrops...
3797,9780789458209,0789458209,Tree,David Burnie,Juvenile Nonfiction,http://books.google.com/books/content?id=Qwsqj...,Photographs and text explore the anatomy and l...,2000.0,4.07,64.0,5.0,Tree
4264,9780941807555,094180755X,The Little Big Book for God's Children,Lena Tabori;Alice Wong,Religion,http://books.google.com/books/content?id=s2PfT...,THE LITTLE BIG BOOK FOR GOD'S CHILDREN is a wo...,2001.0,4.88,352.0,8.0,The Little Big Book for God's Children


Convert all of that into a function

In [52]:
def retrieve_semantic_recommendations(query: str, k: int = 5) -> pd.DataFrame:
    '''
    Takes a query and performs similarity search to
    return the top k (default 5) relevant results
    '''
    relevant_results = chroma_db_books.similarity_search_with_score(query, k=10)
    isbns = [result[0].page_content.split()[0][1:].strip() for result in relevant_results] # Extract the ISBN
    return books_df[books_df["isbn10"].isin(isbns)]

In [53]:
query = "A book about preachers"
retrieve_semantic_recommendations(query)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_join_subtitle
286,9780060916510,60916516,Jonah's Gourd Vine,Zora Neale Hurston,Fiction,http://books.google.com/books/content?id=g8Krk...,"John Buddy Pearson, a young Black man who beco...",1990.0,4.0,229.0,1619.0,Jonah's Gourd Vine
302,9780060930530,60930535,The Poisonwood Bible,Barbara Kingsolver,Fiction,http://books.google.com/books/content?id=xl1mx...,"Nathan Price, a evangelical Baptist who in 195...",1999.0,4.05,546.0,5449.0,The Poisonwood Bible: A Novel
1638,9780374299194,374299196,The Discomfort Zone,Jonathan Franzen,Biography & Autobiography,http://books.google.com/books/content?id=Z2vOA...,The author describes growing up in a family of...,2006.0,3.39,195.0,4660.0,The Discomfort Zone: A Personal History
1677,9780375701887,375701885,The Amen Corner,James Baldwin,Drama,http://books.google.com/books/content?id=Ce93B...,The play centers on the dilemma of the congreg...,1998.0,3.9,112.0,427.0,The Amen Corner: A Play
3030,9780674022577,674022572,Rebecca's Revival,Jon F Sensbach,Biography & Autobiography,http://books.google.com/books/content?id=tcFAr...,"The remarkable story of Rebecca Protten, a Car...",2005.0,3.68,302.0,125.0,Rebecca's Revival
3989,9780812930344,812930347,Living Faith,Jimmy Carter,Religion,http://books.google.com/books/content?id=zJSS7...,The former president traces the role of his st...,1998.0,3.79,288.0,456.0,Living Faith
4196,9780883689448,883689448,God's Generals,Roberts Liardon,Biography & Autobiography,http://books.google.com/books/content?id=jzUDA...,Recapture God's glory with twelve compelling s...,2003.0,4.45,416.0,1396.0,God's Generals: Why They Succeeded and why Som...
4750,9781581342475,1581342470,The Hidden Smile of God,John Piper,Religion,http://books.google.com/books/content?id=kK0Ih...,"The author of ""Comforting God"" takes an inspir...",2001.0,4.33,175.0,636.0,The Hidden Smile of God: The Fruit of Afflicti...
