In [1]:
from langchain_community.document_loaders import TextLoader #text loader that takes text and converts it into a form lang chain can work with
from langchain_text_splitters import CharacterTextSplitter #splits whole document into meaningful chunks(individual descriptions)
from langchain_openai import OpenAIEmbeddings #document embeddings
from langchain_chroma import Chroma #vector data base taht we can use

In [2]:
#sets up environment to use open ai model
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
import pandas as pd

books = pd.read_csv("books_cleaned.csv")

In [4]:
books


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 On A Train Journey Home To North...
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...


In [7]:
books["tagged_description"]

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006178736 A memorable, mesmerizing heroine...
3       9780006280897 Lewis' work on the nature of lov...
4       9780006280934 "In The Problem of Pain, C.S. Le...
                              ...                        
5192    9788172235222 On A Train Journey Home To North...
5193    9788173031014 This book tells the tale of a ma...
5194    9788179921623 Wisdom to Create a Life of Passi...
5195    9788185300535 This collection of the timeless ...
5196    9789027712059 Since the three volume edition o...
Name: tagged_description, Length: 5197, dtype: object

In [8]:
#saves tagged descriptions in seperate txt file
books["tagged_description"].to_csv("tagged_description.txt", sep="\n", index=False, header=False)

In [5]:
longest_line = ""
longest_length = 0

with open("tagged_description.txt", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if len(line) > longest_length:
            longest_line = line
            longest_length = len(line)

print("Longest line length:", longest_length)
print("Longest line content:\n", longest_line)


Longest line length: 5836
Longest line content:
 "9780687002825 Life at the end of the twentieth century presents us with a disturbing reality. Otherness, the simple fact of being different in some way, has come to be defined as in and of itself evil. Miroslav Volf contends that if the healing word of the gospel is to be heard today, Christian theology must find ways of speaking that address the hatred of the other. Reaching back to the New Testament metaphor of salvation as reconciliation, Volf proposes the idea of embrace as a theological response to the problem of exclusion. Increasingly we see that exclusion has become the primary sin, skewing our perceptions of reality and causing us to react out of fear and anger to all those who are not within our (ever-narrowing) circle. In light of this, Christians must learn that salvation comes, not only as we are reconciled to God, and not only as we ""learn to live with one another,"" but as we take the dangerous and costly step of opening

In [8]:
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()
#chunk overlap prevents the chunks from overlapping and the seperator is if its a new line
#chunk size is 0 because it looks for closest seperator relative to the index number indicated by chunks, ensuring it splits on a newline everytime rather than a chunk which could be multiple
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n") #chunk size basically checks if there is more seperators in the chunk size and breaks them up, super high means it wont seperate unless the block is small enough to allow seperation
documents = text_splitter.split_documents(raw_documents)

MAX = 6000  # chunk size
with open("tagged_description.txt", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        if len(line) > MAX:
            print(f"Line {i} is {len(line)} chars long → will be split")

Created a chunk of size 1168, which is longer than the specified 1
Created a chunk of size 1214, which is longer than the specified 1
Created a chunk of size 373, which is longer than the specified 1
Created a chunk of size 309, which is longer than the specified 1
Created a chunk of size 483, which is longer than the specified 1
Created a chunk of size 482, which is longer than the specified 1
Created a chunk of size 960, which is longer than the specified 1
Created a chunk of size 188, which is longer than the specified 1
Created a chunk of size 843, which is longer than the specified 1
Created a chunk of size 296, which is longer than the specified 1
Created a chunk of size 197, which is longer than the specified 1
Created a chunk of size 881, which is longer than the specified 1
Created a chunk of size 1088, which is longer than the specified 1
Created a chunk of size 1189, which is longer than the specified 1
Created a chunk of size 304, which is longer than the specified 1
Create

In [9]:
documents[0]

Document(metadata={'source': 'tagged_description.txt'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gi

In [10]:
db_books = Chroma.from_documents(documents, embedding=OpenAIEmbeddings())

In [11]:
query = "A book to teach children about nature"
docs = db_books.similarity_search(query, k=10)
docs

[Document(id='554a47d0-697e-4e9a-99ed-1e5474a89665', metadata={'source': 'tagged_description.txt'}, page_content='9780786808069 Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.'),
 Document(id='f4fbdd24-c845-417c-a464-1f94a72afa2b', metadata={'source': 'tagged_description.txt'}, page_content="9780786808380 Introduce your babies to birds, cats, dogs, and babies through fine art, illustration, and photographs. These books are a rare opportunity to expose little ones to a range of images on a single subject, from simple child's drawings and abstract art to playful photos. A brief text accompanies each image, introducing the baby to some basic -- and sometimes playful -- information about the subjects."),
 Document(id='dbcb9e68-190a-46ac-bd44-a10059a97c5a', metadata={'source': '

In [12]:
books[books["isbn13"] == int(docs[0].page_content.split()[0].strip())] #filters pandas frame just to first result
#we are taking all the recommentdations, take out the text, split based on space, and then take the isbn, and then convert to integer to get the field in the data frame

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
3747,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals,9780786808069 Children will discover the excit...


In [13]:
def retrieve_semantic_recommendations(
        query: str,
        top_k: int = 10 #number of recommendations we want
) -> pd.DataFrame:
    recs = db_books.similarity_search(query, k=50) #code to give us the recommendations, the similarity search in the data base
    books_list = [] #contains isbns

    for i in range(0, len(recs)):#loop over recs returned and then we are extracting isbn to then add the contents of the row for the book
        books_list += [int(recs[i].page_content.strip('"').split()[0])] #change to take quotation marks as it causes issues with isbn
    return books[books["isbn13"].isin(books_list)].head(top_k)#filter books dataset based on isbn

In [14]:
retrieve_semantic_recommendations("A book to teach children about nature")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
31,9780007105045,0007105045,Tree and Leaf,John Ronald Reuel Tolkien,Literary Collections,http://books.google.com/books/content?id=aPb_A...,"""The two works 'On fairy-stories' and 'Leaf by...",2001.0,4.09,176.0,2245.0,Tree and Leaf: The Homecoming of Beorhtnoth : ...,"9780007105045 ""The two works 'On fairy-stories..."
59,9780007151240,0007151241,The Family Way,Tony Parsons,Parenthood,http://books.google.com/books/content?id=dJEIx...,It should be the most natural thing in the wor...,2005.0,3.51,400.0,2095.0,The Family Way,9780007151240 It should be the most natural th...
143,9780060546571,0060546573,Three Rotten Eggs,Gregory Maguire,Juvenile Fiction,http://books.google.com/books/content?id=t2pWl...,The students of Miss Earth's class in rural Ve...,2005.0,3.74,240.0,76.0,Three Rotten Eggs,9780060546571 The students of Miss Earth's cla...
400,9780062700254,0062700251,Bulfinch's Mythology,Richard P. Martin,Reference,http://books.google.com/books/content?id=eev4u...,A beautiful gift edition of Thomas Bulfinch's ...,1991.0,4.1,768.0,64.0,"Bulfinch's Mythology: The Age of the Fable, Th...",9780062700254 A beautiful gift edition of Thom...
429,9780064434980,0064434982,The Deer in the Wood,Laura Ingalls Wilder,Juvenile Fiction,http://books.google.com/books/content?id=V7YDW...,Even the youngest child can enjoy a special ad...,1999.0,4.17,32.0,302.0,The Deer in the Wood,9780064434980 Even the youngest child can enjo...
442,9780067575208,006757520X,The Sense of Wonder,Rachel Carson,Nature,http://books.google.com/books/content?id=Zee5S...,"First published more than three decades ago, t...",1998.0,4.39,112.0,1160.0,The Sense of Wonder,9780067575208 First published more than three ...
692,9780140448009,0140448004,Three Tales,Gustave Flaubert;Roger Whitehouse;Geoffrey Wall,Fiction,http://books.google.com/books/content?id=XFzga...,Features short fiction by the French naturalis...,2005.0,3.71,110.0,3050.0,Three Tales,9780140448009 Features short fiction by the Fr...
707,9780140568196,0140568190,The Giraffe and the Pelly and Me,Roald Dahl;Quentin Blake,Candy,http://books.google.com/books/content?id=J7FdI...,"A Dahl story in which the giraffe, the pelican...",2001.0,3.81,32.0,16265.0,The Giraffe and the Pelly and Me,9780140568196 A Dahl story in which the giraff...
711,9780140621624,0140621628,The Railway Children,E. Nesbit,Fiction,http://books.google.com/books/content?id=fFesd...,"When their father is sent away to prison, thre...",1995.0,4.0,212.0,178.0,The Railway Children,9780140621624 When their father is sent away t...
763,9780141186078,0141186070,The Log from the Sea of Cortez,John Steinbeck,Biography & Autobiography,http://books.google.com/books/content?id=9CrIf...,This light-hearted journal tells of John Stein...,2001.0,3.84,288.0,3226.0,The Log from the Sea of Cortez,9780141186078 This light-hearted journal tells...


In [15]:
retrieve_semantic_recommendations("A fantasy book")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
348,9780061052392,61052396,Realms of Dragons,Margaret Weis;Denise Little;Tracy Hickman,Fiction,http://books.google.com/books/content?id=ieAFA...,"In the tradition of ""The Wheel of Time, "" this...",1999.0,4.04,218.0,47.0,Realms of Dragons: The Worlds of Weis and Hickman,"9780061052392 In the tradition of ""The Wheel o..."
358,9780061094156,61094153,Imajica II,Clive Barker,Fiction,http://books.google.com/books/content?id=DZVKS...,The magical tale of ill-fated lovers lost amon...,1995.0,4.42,544.0,2538.0,Imajica II: The Reconciliation,9780061094156 The magical tale of ill-fated lo...
560,9780140110876,140110879,You Bright and Risen Angels,William T. Vollmann,Fiction,http://books.google.com/books/content?id=d0buA...,This comic and surreal novel about the beastli...,1988.0,4.08,635.0,747.0,You Bright and Risen Angels: A Cartoon,9780140110876 This comic and surreal novel abo...
713,9780140714555,140714553,A Midsummer Night's Dream,William Shakespeare,Drama,http://books.google.com/books/content?id=P0Ceh...,"Two pairs of star-crossed lovers, a feuding pa...",2000.0,3.94,144.0,984.0,A Midsummer Night's Dream,9780140714555 Two pairs of star-crossed lovers...
768,9780141187198,141187190,Seven Gothic Tales,Isak Dinesen,Fiction,http://books.google.com/books/content?id=4uKAn...,"A selection of stories about romantics, advent...",2002.0,3.94,368.0,82.0,Seven Gothic Tales,9780141187198 A selection of stories about rom...
775,9780141310688,141310685,Westmark,Lloyd Alexander,Juvenile Fiction,http://books.google.com/books/content?id=v5U-P...,"Theo, a boy fleeing from criminal charges, fal...",2002.0,3.92,184.0,4049.0,Westmark,"9780141310688 Theo, a boy fleeing from crimina..."
779,9780141311883,141311886,The Blue Sword,Robin McKinley,,http://books.google.com/books/content?id=nDW2H...,"Harry, bored with her sheltered life in the re...",2001.0,4.23,272.0,286.0,The Blue Sword,"9780141311883 Harry, bored with her sheltered ..."
821,9780142407226,142407224,The Tough Guide to Fantasyland,Diana Wynne Jones,Juvenile Nonfiction,http://books.google.com/books/content?id=v5jxA...,A unique guide to fantasy literature helps rea...,2006.0,3.94,234.0,3897.0,The Tough Guide to Fantasyland,9780142407226 A unique guide to fantasy litera...
874,9780143039938,143039938,The Book of Imaginary Beings,Jorge Luis Borges;Margarita Guerrero;Andrew Hu...,Fiction,http://books.google.com/books/content?id=FuNQP...,A whimsical compendium of mythological creatur...,2006.0,4.09,236.0,4809.0,The Book of Imaginary Beings,9780143039938 A whimsical compendium of mythol...
1445,9780345331052,345331052,The Doom that Came to Sarnath,Howard Phillips Lovecraft,Fiction,http://books.google.com/books/content?id=McpvP...,In a city of gems and riches - beyond the drea...,1971.0,4.12,208.0,2734.0,The Doom that Came to Sarnath,9780345331052 In a city of gems and riches - b...
