In [52]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [53]:
from dotenv import load_dotenv

load_dotenv()   #python-dotenv package to load environment variables 
                #from a .env file into your Python environment.

True

In [54]:
import pandas as pd

#we simply load the dataset previously we cleaned it from raw data
books = pd.read_csv('books_cleaned.csv')
books.head(3)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_subtitle,tagged_description
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982: A new 'Christie for Christmas' ...
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin..."


In [55]:
books['tagged_description'].to_csv('tagged_description.txt',
                                  sep = '\n',
                                  index = False,
                                  header= False)

In [56]:
raw_documents = TextLoader("tagged_description.txt",encoding="utf-8").load()
                                    #encoding="utf-8" means:
                                    #“Read the file in a language that can handle almost every character — English, Urdu, emojis, etc.”

# Create a splitter — adjust chunk_size as needed
text_splitter = CharacterTextSplitter(
    chunk_size=400, # When cutting the book, each page should have 500 characters (letters, spaces, symbols).”
    chunk_overlap=0, #“Don’t repeat any text between pages.”
    separator="\n"
)

# Split documents
documents = text_splitter.split_documents(raw_documents)


Created a chunk of size 1169, which is longer than the specified 400
Created a chunk of size 1215, which is longer than the specified 400
Created a chunk of size 484, which is longer than the specified 400
Created a chunk of size 483, which is longer than the specified 400
Created a chunk of size 961, which is longer than the specified 400
Created a chunk of size 844, which is longer than the specified 400
Created a chunk of size 882, which is longer than the specified 400
Created a chunk of size 1089, which is longer than the specified 400
Created a chunk of size 1190, which is longer than the specified 400
Created a chunk of size 514, which is longer than the specified 400
Created a chunk of size 753, which is longer than the specified 400
Created a chunk of size 729, which is longer than the specified 400
Created a chunk of size 722, which is longer than the specified 400
Created a chunk of size 474, which is longer than the specified 400
Created a chunk of size 1268, which is longe

In [57]:
documents[0]

Document(metadata={'source': 'tagged_description.txt'}, page_content='9780002005883: A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, G

In [58]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Local (free) embedding model from Hugging Face
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Store documents in Chroma vector database
db_books = Chroma.from_documents(
    documents,
    embedding=embedding_model
)


In [59]:
#simple test
query = "A book to teach children about nature"
docs = db_books.similarity_search(query, k=2)
docs

[Document(id='47c5961b-70ad-4f8e-a59e-094fa8ada709', metadata={'source': 'tagged_description.txt'}, page_content='9780786808069: Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.'),
 Document(id='d38d904c-afd6-4a03-b98b-9d594e563fbb', metadata={'source': 'tagged_description.txt'}, page_content='9780786808069: Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.')]

In [60]:
books[books['isbn13'] == int(docs[0].page_content.split()[0].strip().rstrip(':'))]


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_subtitle,tagged_description
3747,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals,9780786808069: Children will discover the exci...


Step-by-step breakdown
docs[0]

From a list called docs, take the first document.
(Urdu: docs list ka pehla document le rahe hain.)

.page_content

Get that document’s full text.
(Is document ka pura text.)

.split()

Split the text into words (by spaces/newlines).

Example: "9780306406157: A classic book" → ["9780306406157:", "A", "classic", "book"]
(Text ko lafzon ki list me todta hai.)

[0]

Take the first word from that list.

From the example: "9780306406157:"
(Pehla lafz uthaya.)

.strip()

Remove any spaces at the start/end (if any).

Example stays "9780306406157:"
(Aagay/peechay ki khali jaga hata deta hai.)

.rstrip(':')

Remove a trailing colon : at the end if present.

"9780306406157:" → "9780306406157"
(Akhir me agar colon ho to hata do.)

int( ... )

Convert that cleaned string to an integer number.

"9780306406157" → 9780306406157 (as an int)
(Ab lafz ko number me badal diya.)

books['isbn13'] == <that number>

Make a True/False mask: for each row, check if its isbn13 equals that number.

Result is something like: [False, True, False, ...]
(Har row ke liye check: kya isbn13 barabar hai?)

books[ <mask> ]

Use that mask to filter the DataFrame.

You get only the matching rows (the book with that ISBN).
(Sirf wahi rows bachi jahan condition True thi.)



In [65]:
def retrieve_sementic_recommendation(query, top_k=50):
    docs = db_books.similarity_search(query, top_k)
    book_list = []

    for i in range(len(docs)):
        isbn_str = docs[i].page_content.strip('"').split()[0]  # pehla word lo
        isbn_str = isbn_str.rstrip(':')  # colon hatao
        book_list.append(int(isbn_str))  # ab int me convert karo

    return books[books['isbn13'].isin(book_list)]


In [66]:
retrieve_sementic_recommendation("A book to teach children about nature")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_subtitle,tagged_description
59,9780007151240,0007151241,The Family Way,Tony Parsons,Parenthood,http://books.google.com/books/content?id=dJEIx...,It should be the most natural thing in the wor...,2005.0,3.51,400.0,2095.0,The Family Way,9780007151240: It should be the most natural t...
324,9780060959036,0060959037,Prodigal Summer,Barbara Kingsolver,Fiction,http://books.google.com/books/content?id=06IwG...,Barbara Kingsolver's fifth novel is a hymn to ...,2001.0,4.0,444.0,85440.0,Prodigal Summer: A Novel,9780060959036: Barbara Kingsolver's fifth nove...
404,9780064402453,0064402452,Racso and the Rats of NIMH,Jane Leslie Conly,Juvenile Fiction,http://books.google.com/books/content?id=MgoNv...,"‘Racso, a brash and boastful little rodent, is...",1988.0,3.76,288.0,3231.0,Racso and the Rats of NIMH,"9780064402453: ‘Racso, a brash and boastful li..."
406,9780064403870,0064403874,"R-T, Margaret, and the Rats of NIMH",Jane Leslie Conly,Juvenile Fiction,http://books.google.com/books/content?id=WTHHH...,"When Margaret and her younger brother, Artie, ...",1991.0,3.52,272.0,631.0,"R-T, Margaret, and the Rats of NIMH",9780064403870: When Margaret and her younger b...
407,9780064404419,0064404412,The Rainbow People,Laurence Yep,Juvenile Fiction,http://books.google.com/books/content?id=5AHwq...,"""Culled from 69 stories collected in a [1930s]...",1992.0,3.75,208.0,202.0,The Rainbow People,"9780064404419: ""Culled from 69 stories collect..."
416,9780064406925,006440692X,Winter on the Farm,Laura Ingalls Wilder,Juvenile Fiction,http://books.google.com/books/content?id=IvlKH...,The Little House books tell the story of a lit...,1997.0,4.13,32.0,400.0,Winter on the Farm,9780064406925: The Little House books tell the...
429,9780064434980,0064434982,The Deer in the Wood,Laura Ingalls Wilder,Juvenile Fiction,http://books.google.com/books/content?id=V7YDW...,Even the youngest child can enjoy a special ad...,1999.0,4.17,32.0,302.0,The Deer in the Wood,9780064434980: Even the youngest child can enj...
991,9780192862099,019286209X,The Origins of Life,John Maynard Smith;Eörs Szathmáry,Science,http://books.google.com/books/content?id=nHDbB...,'I can recommend this book as a thoroughly int...,2000.0,4.11,192.0,41.0,The Origins of Life: From the Birth of Life to...,9780192862099: 'I can recommend this book as a...
1078,9780241003008,0241003008,The Very Hungry Caterpillar,Eric Carle,Babytime resource,http://books.google.com/books/content?id=DpGEQ...,Eric Carle's children's classic is the story o...,1994.0,4.29,26.0,340101.0,The Very Hungry Caterpillar,9780241003008: Eric Carle's children's classic...
1639,9780374422080,0374422087,Everything on a Waffle,Polly Horvath,Juvenile Fiction,http://books.google.com/books/content?id=NimVJ...,This Newbery Honor Book tells the story of 11 ...,2004.0,3.71,150.0,9631.0,Everything on a Waffle,9780374422080: This Newbery Honor Book tells t...
