In [8]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [56]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# Step 1: Load PDF Documents from a Directory
data = PyPDFDirectoryLoader('data2')
documents = data.load()

# Step 2: Initialize the Sentence Transformer Splitter
splitter = SentenceTransformersTokenTextSplitter(chunk_size=200, chunk_overlap=50)

# Step 3: Split the Documents into Chunks
docs1 = splitter.split_documents(documents)

# Step 4: Initialize the Embedder (assuming you're using SentenceTransformer for embeddings)
embedder = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 5: Embed the Split Documents
# Here, we'll assume docs1 is a list of Document objects with 'page_content' as their text attribute.
texts = [doc.page_content for doc in docs1]
embeddings = embedder.embed_documents(texts)

# Optionally, you can print or inspect the embeddings
print(f"Generated {len(embeddings)} embeddings.")


incorrect startxref pointer(3)


Generated 604 embeddings.


In [57]:
vector_store = FAISS.from_documents(docs1, embedder)
# Step 7: (Optional) Save FAISS index to disk for later use
faiss_index_file = "faiss_index.bin"
vector_store.save_local(faiss_index_file)

In [58]:
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
query=" tell me about CPP/CDCI Funds"

In [59]:
result = vector_store.similarity_search(query)

In [60]:
s=""
s=result[0].page_content

In [61]:

model = SentenceTransformer("BAAI/bge-small-en-v1.5")
query_embedding = model.encode([query])
r_embedding=model.encode([s])
similarities = cosine_similarity(query_embedding,r_embedding)
print(similarities)

[[0.8101849]]


Recursive comaprison below 

In [62]:
data = PyPDFDirectoryLoader('data2')
documents = data.load()

# Step 2: Initialize the Sentence Transformer Splitter
splitter2 = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=50)

# Step 3: Split the Documents into Chunks
docs2 = splitter.split_documents(documents)

# Step 4: Initialize the Embedder (assuming you're using SentenceTransformer for embeddings)
embedder = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 5: Embed the Split Documents
# Here, we'll assume docs1 is a list of Document objects with 'page_content' as their text attribute.
texts2 = [doc.page_content for doc in docs2]
embeddings2 = embedder.embed_documents(texts)



incorrect startxref pointer(3)


In [63]:
vector_store2 = FAISS.from_documents(docs2, embedder)
# Step 7: (Optional) Save FAISS index to disk for later use
faiss_index_file = "faiss_index2.bin"
vector_store2.save_local(faiss_index_file)

In [64]:
s=""
result = vector_store2.similarity_search(query)
s=result[0].page_content
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
query_embedding = model.encode([query])
r_embedding=model.encode([s])
similarities = cosine_similarity(query_embedding,r_embedding)
print(similarities)

[[0.8101849]]


Semantic chunker

In [65]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings

In [70]:
data = PyPDFDirectoryLoader('data2')
documents = data.load()


incorrect startxref pointer(3)


In [71]:
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


splitter = SemanticChunker(embedder)


In [72]:
docs3=splitter.split_documents(documents)


In [73]:
docs3

[Document(metadata={'source': 'data2/ZRWJZJQPWO5B3TIBCWQKW47LFR6BM3HW.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': 'data2/ZRWJZJQPWO5B3TIBCWQKW47LFR6BM3HW.pdf', 'page': 1}, page_content=''),
 Document(metadata={'source': 'data2/ZRWJZJQPWO5B3TIBCWQKW47LFR6BM3HW.pdf', 'page': 2}, page_content=''),
 Document(metadata={'source': 'data2/ZC2ELDSYWFVOJZRXTLERXVM7UMXWCWZN.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': 'data2/ZAIRYEDJ6SHNYMBVPJMCBULE2HL7HMRV.pdf', 'page': 0}, page_content="Review and Evaluation of Clinical Data \n \nNDA (Serial Number) 20823 (SE1-016) \nSponsor: Novartis \nDrug: Exelon® (rivastigmine tartrate)  \nProposed Indication: Dementia Associated With Parkinson's Disease  \nMaterial Submitted: Supplemental New Drug Application \nCorrespondence Date: 8/31/05 \nDate Received / Agency: 9/1/05 \nDate Review Completed 4/18/06 \nReviewer: Ranjit B. Mani, M.D."),
 Document(metadata={'source': 'data2/ZAIRYEDJ6SHNYMBVPJMCBULE2HL7HMRV.pdf', 

In [74]:
vector_store3 = FAISS.from_documents(docs3, embedder)
# Step 7: (Optional) Save FAISS index to disk for later use
faiss_index_file = "faiss_index3.bin"
vector_store3.save_local(faiss_index_file)

In [75]:
result = vector_store3.similarity_search(query)
s=result[0].page_content
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
query_embedding = model.encode([query])
r_embedding=model.encode([s])
similarities = cosine_similarity(query_embedding,r_embedding)
print(similarities)

[[0.81943655]]


Combination of sentence transformer and 

In [51]:
data = PyPDFDirectoryLoader('data2')
documents = data.load()

# Step 2: Initialize the Sentence Transformer Splitter
splitter4 = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=50)
splitter5=SentenceTransformersTokenTextSplitter(chunk_size=200, chunk_overlap=50)
# Step 3: Split the Documents into Chunks
docs4= splitter4.split_documents(documents)
docs4=splitter5.split_documents(docs4)

# Step 4: Initialize the Embedder (assuming you're using SentenceTransformer for embeddings)
embedder = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 5: Embed the Split Documents
# Here, we'll assume docs1 is a list of Document objects with 'page_content' as their text attribute.



incorrect startxref pointer(3)


In [52]:
vector_store4 = FAISS.from_documents(docs4, embedder)
# Step 7: (Optional) Save FAISS index to disk for later use
faiss_index_file = "faiss_index4.bin"
vector_store4.save_local(faiss_index_file)

In [54]:
query=" tell me about CPP/CDCI Funds"

In [77]:
result = vector_store4.similarity_search(query)
s=result[0].page_content
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
query_embedding = model.encode([query])
r_embedding=model.encode([s])
similarities = cosine_similarity(query_embedding,r_embedding)
print(similarities)

[[0.83277655]]


Recursive + semantic 

In [76]:

data = PyPDFDirectoryLoader('data2')
documents = data.load()
embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 2: Initialize the Sentence Transformer Splitter
splitter6 = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=50)
splitter7=SemanticChunker(embed)
# Step 3: Split the Documents into Chunks
docs5= splitter6.split_documents(documents)
docs5=splitter7.split_documents(docs5)

# Step 4: Initialize the Embedder (assuming you're using SentenceTransformer for embeddings)
embedder = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 5: Embed the Split Documents
# Here, we'll assume docs1 is a list of Document objects with 'page_content' as their text attribute.



incorrect startxref pointer(3)


In [78]:
vector_store5 = FAISS.from_documents(docs4, embedder)
# Step 7: (Optional) Save FAISS index to disk for later use
faiss_index_file = "faiss_index4.bin"
vector_store5.save_local(faiss_index_file)

In [79]:
result = vector_store5.similarity_search(query)
s=result[0].page_content
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
query_embedding = model.encode([query])
r_embedding=model.encode([s])
similarities = cosine_similarity(query_embedding,r_embedding)
print(similarities)

[[0.83277655]]


For checking or selecting the best chunk size for recursive character text splitter we can use llm or to decide the best and have to implement the best combination with semnatic chunker and recursive character text splitter 


mixtral-8x7b-32768