In [None]:
import pandas as pd

# Load the cleaned books table; later cells look books up in it by `isbn13`.
cleaned_df = pd.read_csv(
    filepath_or_buffer='../artifacts/datasets/books_cleaned.csv',
)
# Preview the first five rows as the cell output.
cleaned_df.head(n=5)

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [None]:
# Read the tagged-descriptions text file through LangChain's TextLoader.
raw_docs = TextLoader('../artifacts/datasets/tagged_descriptions.txt').load()

# Split on newlines only, with no overlap between chunks.
# NOTE(review): chunk_size=1 looks like a deliberate trick so that no two
# lines are ever merged into the same chunk - confirm against the
# CharacterTextSplitter documentation for the installed version.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1,
    chunk_overlap=0,
)

In [None]:
# Apply the splitter to the loaded file. Each element of `docs` exposes its
# text as `.page_content`, which the embedding and indexing cells below read.
docs = text_splitter.split_documents(raw_docs)

### Loading the model

In [None]:
import os

from sentence_transformers import SentenceTransformer

# Load the sentence-transformers checkpoint. The same `model` encodes both the
# document texts and the search queries further down, so the two live in the
# same vector space.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Encode the text of every split document in a single call.
# normalize_embeddings=True asks the encoder to normalise each vector;
# show_progress_bar=True displays progress while encoding.
embedds = model.encode(
    [doc.page_content for doc in docs],
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Persist the vectors (one row per document, no index column) so a later cell
# can reload them from disk.
pd.DataFrame(data=embedds).to_csv(
    '../artifacts/datasets/embeddings.csv',
    index=False,
)

In [None]:
# Reload the embeddings that were written to CSV by the previous cell.
df = pd.read_csv('../artifacts/datasets/embeddings.csv')

# Convert the frame to a plain NumPy array for insertion into the vector store.
embds = df.to_numpy()

# Show (rows, columns) as the cell output.
embds.shape

In [None]:
# Create the Chroma collection. No embedding function is attached because the
# vectors are computed outside Chroma and passed in explicitly below.
book_db = Chroma(
    embedding_function=None,
    collection_name="books_collection",
)

# Insert the precomputed vectors together with their source texts.
# Row i of `embds` is stored with docs[i] under the id "doc_<i>".
book_db._collection.add(
    ids=[f"doc_{i}" for i in range(len(docs))],
    documents=[doc.page_content for doc in docs],
    embeddings=embds,
)

In [None]:
# Example free-text search query.
query = 'A book to children about nature'

# Encode the query with the same model and normalisation flag used for the
# documents; encode() receives a one-element list, so take element 0.
query_vec = model.encode(
    [query],
    normalize_embeddings=True,
)[0]

In [None]:
# Ask the collection for the five stored entries closest to the query vector.
hits = book_db._collection.query(
    n_results=5,
    query_embeddings=[query_vec],
)

# Display the matched document texts as the cell output.
hits['documents']

In [None]:
def get_recommendations(query, book_db = book_db, model = model, n_results=5):
    """Return the rows of `cleaned_df` that best match a free-text query.

    The query is embedded with `model`, the closest stored documents are
    fetched from `book_db`, the ISBN-13 at the start of each returned document
    is parsed, and the matching rows of the notebook-level `cleaned_df` are
    returned ordered from best match to worst.

    Parameters
    ----------
    query : str
        Free-text description of the kind of book wanted.
    book_db : Chroma, optional
        Vector store to search. Defaults to the notebook-level `book_db`
        (bound when this cell is executed).
    model : SentenceTransformer, optional
        Encoder used for the query. Defaults to the notebook-level `model`
        (bound when this cell is executed).
    n_results : int, optional
        Number of nearest documents to request from the collection.

    Returns
    -------
    pandas.DataFrame
        Rows of `cleaned_df` whose `isbn13` appears in the search results,
        in the order the search returned them.

    Raises
    ------
    ValueError
        If a returned document does not begin with a 13-digit ISBN.
    """
    query_vec = model.encode([query], normalize_embeddings=True)[0]
    recommendations = book_db._collection.query(
        query_embeddings=[query_vec],
        n_results=n_results
    )

    # Map each ISBN to its position in the result list (first occurrence wins).
    rank_by_isbn = {}
    for text in recommendations['documents'][0]:
        # Stored texts are expected to start with the ISBN-13, optionally
        # preceded by a double quote. Stripping the quote instead of slicing
        # [1:14] keeps the parse correct when the quote is absent.
        token = text.lstrip().lstrip('"')[:13]
        if len(token) != 13 or not (token.isascii() and token.isdigit()):
            raise ValueError(
                f"Document does not start with a 13-digit ISBN: {text[:40]!r}"
            )
        rank_by_isbn.setdefault(int(token), len(rank_by_isbn))

    matches = cleaned_df[cleaned_df['isbn13'].isin(list(rank_by_isbn))]

    # The boolean mask yields rows in `cleaned_df` order; reorder them so the
    # best match comes first. A stable sort keeps duplicate ISBN rows in their
    # original relative order.
    order = matches['isbn13'].map(rank_by_isbn).to_numpy().argsort(kind='stable')
    return matches.iloc[order]

In [None]:
# Try the recommender on a sample query, using the default store, model and
# result count.
top_books = get_recommendations(query="Heal from a breakup")

# Show only the descriptions of the recommended books.
top_books['description'].values