Importing dependencies

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import JinaEmbeddings
from langchain_chroma import Chroma
import os

Loading environment valriables

In [18]:
from dotenv import load_dotenv
load_dotenv()

True

In [44]:
import pandas as pd
df = pd.read_csv('../data/cleaned_data_v2.csv')

In [45]:
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_description,age_of_book,words_in_description,title_and_subtitle,tagged_description
0,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,0,32.0,57,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
1,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,0,23.0,45,The Four Loves,9780006280897 Lewis' work on the nature of lov...
2,9780006280934,6280935,The Problem of Pain,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,0,23.0,75,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."
3,9780006380832,6380832,Empires of the Monsoon,A History of the Indian Ocean and Its Invaders,Richard Hall,"Africa, East",http://books.google.com/books/content?id=MuPEQ...,Until Vasco da Gama discovered the sea-route t...,1998.0,4.41,608.0,65.0,0,27.0,80,Empires of the Monsoon: A History of the India...,9780006380832 Until Vasco da Gama discovered t...
4,9780006472612,6472613,Master of the Game,,Sidney Sheldon,Adventure stories,http://books.google.com/books/content?id=TkTYp...,Kate Blackwell is an enigma and one of the mos...,1982.0,4.11,489.0,43540.0,0,43.0,30,Master of the Game,9780006472612 Kate Blackwell is an enigma and ...


To use the TextLoader method we need to convert the pandas dataframe to text, we also need to remove the index and the header.

In [46]:
df['tagged_description'].to_csv("../text_files/tagged_description.txt",
                                sep='\n',
                                index=False,
                                header=False)

Creating an instance of Jina doccument embedder

In [47]:
embeddings = JinaEmbeddings(
    jina_api_key=os.getenv('JINA_API_KEY'),
    model_name="jina-embeddings-v2-base-en"
)

First we need to load the text

In [52]:
raw_documents = TextLoader('../text_files/tagged_description.txt',encoding='utf-8').load()
text_splitter = CharacterTextSplitter(chunk_size=1000,chunk_overlap=0,separator="\n")
documents = text_splitter.split_documents(raw_documents)

Note: 
* by setting chunk_size to 1000 (arbitrarly high value), we make sure that the split is done by the seperator
* chunk_overlap = 0 is set because we don't need the descriptions to overlap.

In [53]:
documents[0]

Document(metadata={'source': '../text_files/tagged_description.txt'}, page_content="9780006178736 A memorable, mesmerizing heroine Jennifer -- brilliant, beautiful, an attorney on the way up until the Mafia's schemes win her the hatred of an implacable enemy -- and a love more destructive than hate. A dangerous, dramatic world The Dark Arena of organized crime and flashbulb lit courtrooms where ambitious prosecutors begin their climb to political power.\n9780006280897 Lewis' work on the nature of love divides love into four categories; Affection, Friendship, Eros and Charity. The first three come naturally to humanity. Charity, however, the Gift-love of God, is divine, and without this supernatural love, the natural loves become distorted and even dangerous.")

Now we will embed the documents and store them in Chroma db

In [55]:
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory="../chroma_db/chroma_db_jina"  
)

In [59]:
query = "a book to teach children about nature"
docs = vectorstore.similarity_search(query,k=10)
docs

[Document(id='22f145d8-b02c-4e4a-a191-b76ce1cfa7dc', metadata={'source': '../text_files/tagged_description.txt'}, page_content="9780451456816 A collection of stories that detail modern science's effect on humankind's notions of conception, birth, and parenting includes contributions from Michael Armstrong, Robert Silverberg, Susan Palwick, and others\n9780451457592 Fifteen-year-old Ariella spends her days in the forest caring for the animals, but when her father dies and she is threatened with marriage to a cousin she has never seen, a remarkable horselike creature from the woods will reward the girl's compassion with a special gift.\n9780451461636 After Weather Warden Joanne Baldwin prevents Mother Earth from destroying the planet, she struggles to recover her identity after losing her memories at the hands of a vengeful jinn.\n9780451527400 The oldest extant poem in a modern European language chronicles a feudal newly Christianized world still populated by the monsters and demons of 

To get the isbn13 of the first document recommended

In [64]:
int(docs[0].page_content.split()[0].strip())

9780451456816

This gives us the book metadata from the dataframe, that we can show to the user.

In [65]:
df[df['isbn13'] == int(docs[0].page_content.split()[0].strip())]

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_description,age_of_book,words_in_description,title_and_subtitle,tagged_description
1780,9780451456816,451456815,Not of Woman Born,,Constance Ash;Robert A. Silverberg,Fiction,http://books.google.com/books/content?id=nOEIA...,A collection of stories that detail modern sci...,1999.0,3.8,288.0,59.0,0,26.0,28,Not of Woman Born,9780451456816 A collection of stories that det...


In [66]:
def retrieve_semantic_recommendation(query:str,top_k: int=10)-> pd.DataFrame:
    records = vectorstore.similarity_search(query,k=50)
    books_list = []
    
    for i in range(0,len(records)):
        books_list+=[int(records[i]
                                .page_content
                                .strip('"')
                                .split()[0]
                        )]
    return df[df['isbn13'].isin(books_list)].head(top_k)
        

In [68]:
retrieve_semantic_recommendation('A book to teach children about nature')

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_description,age_of_book,words_in_description,title_and_subtitle,tagged_description
56,9780029221303,0029221307,The Origins of the Civil Rights Movement,,Aldon D. Morris,History,http://books.google.com/books/content?id=7vyHY...,A blending of scholarly research and interview...,1986.0,4.04,368.0,145.0,0,39.0,32,The Origins of the Civil Rights Movement,9780029221303 A blending of scholarly research...
61,9780060005696,0060005696,The Paradox of Choice,Why More Is Less,Barry Schwartz,Business & Economics,,The author of The Battle for Human Nature expl...,2005.0,3.84,265.0,23734.0,0,20.0,54,The Paradox of Choice: Why More Is Less,9780060005696 The author of The Battle for Hum...
118,9780060775858,0060775858,Goodnight Moon 60th Anniversary Edition,,Margaret Wise Brown,Juvenile Fiction,http://books.google.com/books/content?id=lLYOr...,"In a great green room, tucked away in bed, is ...",2005.0,4.27,32.0,264013.0,0,20.0,97,Goodnight Moon 60th Anniversary Edition,"9780060775858 In a great green room, tucked aw..."
169,9780060957353,0060957352,The Darling,A Novel,Russell Banks,Fiction,http://books.google.com/books/content?id=NlC2Q...,Set in Liberia and the United States from 1975...,2005.0,3.78,400.0,1939.0,0,20.0,76,The Darling: A Novel,9780060957353 Set in Liberia and the United St...
202,9780061238239,0061238236,The End of Days,Armageddon and Prophecies of the Return,Zecharia Sitchin,History,http://books.google.com/books/content?id=EIBlj...,A conclusion to the Earth Chronicles series br...,2007.0,4.06,336.0,470.0,0,18.0,38,The End of Days: Armageddon and Prophecies of ...,9780061238239 A conclusion to the Earth Chroni...
210,9780064403870,0064403874,"R-T, Margaret, and the Rats of NIMH",,Jane Leslie Conly,Juvenile Fiction,http://books.google.com/books/content?id=WTHHH...,"When Margaret and her younger brother, Artie, ...",1991.0,3.52,272.0,631.0,0,34.0,95,"R-T, Margaret, and the Rats of NIMH",9780064403870 When Margaret and her younger br...
215,9780064405850,0064405850,Strawberry Girl 60th Anniversary Edition,,Lois Lenski,Juvenile Fiction,http://books.google.com/books/content?id=AQXM2...,"The land was theirs, but so were its hardships...",1995.0,3.86,208.0,10655.0,0,30.0,94,Strawberry Girl 60th Anniversary Edition,"9780064405850 The land was theirs, but so were..."
418,9780141015088,014101508X,How to Breathe Underwater,Stories,Julie Orringer,Families,http://books.google.com/books/content?id=t1v9V...,In her dazzling first book Julie Orringer dive...,2005.0,3.96,222.0,188.0,0,20.0,97,How to Breathe Underwater: Stories,9780141015088 In her dazzling first book Julie...
475,9780142003343,0142003344,The Blank Slate,The Modern Denial of Human Nature,Steven Pinker,Psychology,http://books.google.com/books/content?id=7rJ5g...,In a study of the nature versus nurture debate...,2003.0,4.08,528.0,17851.0,0,22.0,29,The Blank Slate: The Modern Denial of Human Na...,9780142003343 In a study of the nature versus ...
484,9780142300701,0142300705,A Year Down Yonder,,,Juvenile Fiction,http://books.google.com/books/content?id=D_Vju...,"During the recession of 1937, fifteen-year-old...",2002.0,4.11,160.0,25534.0,0,23.0,39,A Year Down Yonder,"9780142300701 During the recession of 1937, fi..."
