In [2]:
## Imports (standard library, third-party, local)
import pathlib # creates single var
from langchain_community.document_loaders import UnstructuredFileLoader , UnstructuredWordDocumentLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from constant import blog

   
class DocumentReader_test:
    
    def __init__(self):
        # Initialising text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.path = ""          
              
    # get the file extension
    def get_file_extension(self):
        return pathlib.Path(self.path).suffix
    
    # returns loaded document
    def get_document(self):
        file_ext = self.get_file_extension()
        match file_ext:
            case '.pdf':
                loader = UnstructuredFileLoader(self.path)               
            case '.txt':
                loader = UnstructuredFileLoader(self.path)
            case '.docx':
                loader = UnstructuredWordDocumentLoader(self.path)
            case _:
                print('Format of the document is not supported')    
        return loader.load()
     
    # Splitting documents
    def split_documents(self):
        docs = self.get_document()
        return self.text_splitter.split_documents(docs)
        
            
    # creates vector embeddings and stores in vector store    
    def load_document(self,embeddings, path):
        self.path = path
        docs = self.split_documents()
        return Chroma.from_documents(documents=docs, embedding = embeddings)            
    
    # creates vector embeddings and stores in vector store    
    def load_document_from_disk(self,embeddings, path,dir):
        self.path = path
        docs = self.split_documents()
        blog(f"Totoal chunks of document ----> {len(docs)}")
        return Chroma.from_documents(documents=docs, embedding = embeddings,persist_directory =dir)      
    
          
        
         

In [3]:
# Configuration cell: embedding model plus the input document and persist directory.
# NOTE(review): all three paths are absolute and machine-specific; consider deriving
# them from one configurable base directory so the notebook runs elsewhere.
# Local directory of the embedding model handed to HuggingFaceEmbeddings below.
embed_llm_path = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\local_downloaded_models\embedding_models\bge-base-en-v1.5"
# Embedding object passed as `embeddings=` to DocumentReader_test in later cells.
# NOTE(review): trust_remote_code=True is forwarded via model_kwargs; confirm it is
# actually required for this locally stored model.
embed_llm =HuggingFaceEmbeddings(
            model_name = embed_llm_path,
            show_progress = True,
            model_kwargs = {"trust_remote_code": True})
 
# Document to index, and the directory later passed to Chroma as persist_directory.
file_path = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\3_Text_query_bot\_docs\to_upload_docs\pdf_documents\Leave_Policy_2024.pdf" 
db_dir = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\3_Text_query_bot\app_final\vector_store_created"

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Index file_path into a Chroma store persisted under db_dir (both defined in the
# configuration cell) and display the resulting store object.
doc_reader =DocumentReader_test()
db_1 = doc_reader.load_document_from_disk(embeddings=embed_llm,path=file_path,dir= db_dir)
db_1

⏲️  ⏲️  ⏲️   2024-07-31 13:00:49.940555 🟢 🟢 🟢 ----> Totoal chunks of document ----> 13




Batches: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]


<langchain_chroma.vectorstores.Chroma at 0x2480df1eb50>

In [41]:
# Index the same file again into a separate persist directory.
# NOTE(review): this rebinds db_dir, so cells run afterwards see the new value.
db_dir = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\3_Text_query_bot\app_final\vector_store_created_3"
doc_reader =DocumentReader_test()
db_2 = doc_reader.load_document_from_disk(embeddings=embed_llm,path=file_path,dir= db_dir)
# Only 'metadatas' is requested, so the 'embeddings' entry of the result is None
# (see the printed result of the same call elsewhere in this notebook).
data_db_2 = db_2.get(include= ['metadatas'])

# Number of ids held by the store.
len(data_db_2["ids"])

⏲️  ⏲️  ⏲️   2024-07-31 13:21:55.840823 🟢 🟢 🟢 ----> Totoal chunks of document ----> 13




Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]


13

In [46]:
# Shows no output: data_db_2 was fetched with include=['metadatas'], which leaves
# the 'embeddings' entry as None.
# NOTE(review): presumably 'embeddings' must be listed in `include` to get vectors back — confirm.
data_db_2["embeddings"]


In [40]:
# Fetch ids and metadatas of db_2, run a sample query, then display the fetched data.
# NOTE(review): depends on db_2 from another cell; execution counts are out of order.
data_db_2 = db_2.get(include= ['metadatas'])
# The search result is not assigned, and the output of this cell shows only
# data_db_2, so the result of this query is not displayed.
db_2.similarity_search("What are types of leaves")
(data_db_2)

Batches: 100%|██████████| 1/1 [00:00<00:00, 17.71it/s]


{'ids': ['0428257f-a16a-449b-afab-b6eef7c3be4a',
  '08b13256-b612-4ac9-a7ed-337f7bdbc85a',
  '5a6595dc-c326-4d5b-a4d7-a149acd0438d',
  '60537b40-ec49-4d58-8c8b-3a9441404fea',
  '741e3bc8-9c44-4e42-aea1-6573e6b6e43d',
  '82fc49d0-72c0-48a1-af4a-80e5fd61b986',
  'b92f7e5a-95b0-4bbf-a85c-a34cca4d5b52',
  'bd74430f-19c6-45fe-a90f-24e60113e77a',
  'be446801-7092-407b-b28d-5660fbf32c41',
  'd60c0973-132d-4e68-ba82-4f263a8a7e3b',
  'e0d725ff-7695-4996-944c-11f874093102',
  'eceb0e2e-15eb-4da6-b1ac-be9044b59496',
  'fa1d740f-5448-4f0a-b57a-14d68a6fccda'],
 'embeddings': None,
 'metadatas': [{'source': 'D:\\OneDrive - Adani\\Desktop\\LEARNING_FOLDER\\_Kolkata_2024\\1_LLM\\3_Text_query_bot\\_docs\\to_upload_docs\\pdf_documents\\Leave_Policy_2024.pdf'},
  {'source': 'D:\\OneDrive - Adani\\Desktop\\LEARNING_FOLDER\\_Kolkata_2024\\1_LLM\\3_Text_query_bot\\_docs\\to_upload_docs\\pdf_documents\\Leave_Policy_2024.pdf'},
  {'source': 'D:\\OneDrive - Adani\\Desktop\\LEARNING_FOLDER\\_Kolkata_2024\\1_LLM

In [23]:
# Scratch cell: inspect db_1, run a sample query, then call delete() on the store.
# NOTE(review): data_1 is not used again in the visible cells.
data_1 = db_1.get()
# db_1._collection.get('100691cb-acda-4889-80c6-643a8ade27e2')
# len(data_1['embeddings'])
## TODO: detect duplicate uploads. How do we know a document is already indexed?
## Idea: identify a document by file name + file extension + file size.
## If all of these match, treat it as a duplicate document.
## Candidate unique key -> "<filename-with-extension>+<filesize>"
db_1.similarity_search("What are types of leaves")
# NOTE(review): delete() is called without ids; confirm what it removes (or whether
# it raises) with the installed langchain_chroma version before relying on it.
db_1.delete()

Batches: 100%|██████████| 1/1 [00:00<00:00, 19.59it/s]


[Document(page_content='Leave Policy\n\nVersion Issue Date\n\nHRLP24 1-Apr-2024\n\nEffective Date\n\n1-Jan-2024\n\nThis version supersedes all previous versions with a lesser version number\n\n1.0 OBJECTIVE: Leave is a period of rest and relaxation provided at intervals during\n\nwork periods. For convenience in terms of reference, it is divided into categories such as Privilege Leave (PL), Casual Leave (CL), Sick Leave (SL) and Special Leave\n\n2.0 DEFINITIONS:\n\n2.1 Leave Year: The Leave Year is the calendar year from 1 January to 31 December. All calculations of earning, credit, availment, balance etc. of leave will be done with respect to each Leave Year\n\n2.2 Entitlement: This refers to the amount, or the number of days of Leave (of different types) that an employee gets in a Leave Year as part of conditions of service, subject to certain conditions such as full attendance on full salary. For example, the entitlement of PL is 21 days in a Leave Year', metadata={'source': 'D:\\On

In [12]:
# Open a Chroma store at the given persist directory; no embedding object is passed.
chroma_db = Chroma(persist_directory= r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\3_Text_query_bot\app_final\chroma.db")
# NOTE(review): this reaches into the private _collection attribute and looks up the
# literal id 'id'; the result shown below is empty. Use a real id, or the public
# get() method, if the intent was to list the stored entries.
chroma_db._collection.get('id')

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [18]:
# Display the records stored in db_1 (no include filter given).
# NOTE(review): depends on db_1 still existing; another cell calls db_1.delete().
db_1.get()

{'ids': ['100691cb-acda-4889-80c6-643a8ade27e2',
  '10e32c82-9aeb-4046-9db2-65fa894e3966',
  '1ab99ab4-e4df-4ada-8549-a2fa3ec77fc7',
  '2be4ed4e-24b3-47fc-a7bd-97441f149693',
  '307cc0ed-75a6-4aba-95f4-9f39a608aa4d',
  '31358df8-b039-41d0-9f39-06f6d7d0a411',
  '3a7ea39c-c34f-4865-bb3f-16c382031092',
  '523033ba-5536-4045-b2dd-c89be5bf7106',
  '57892bba-2d27-46ea-a330-bd5f96e967c3',
  '5a3689ac-3749-40b3-866c-687c21907281',
  '64f1f719-e5c3-4d28-b57a-bfa34e42143e',
  '67a34c50-50b2-4de0-8f26-2156cacbb137',
  '6ff4afdb-957b-41b6-a1ba-81751d0e3cea',
  '7b37c772-ba75-48b0-b7a9-5cd23c2c03b2',
  '822a6ebf-6412-45a2-9189-018256f0bc08',
  'b4737569-fdba-4dbe-92fb-5d4cbf5fc8f4',
  'b7426bbf-ed6a-4b3c-ba70-d3775a1d652b',
  'ccbf95eb-8abc-40fb-be9b-37cdcecbeaf1',
  'ce9c7a6e-3e72-4782-b977-b7f51fbcdb76',
  'cf0de40b-eccb-424a-aa99-e06683bc6294',
  'd2d5fd7a-8ce0-4184-bfad-2df2fa65d57d',
  'ecfbdab6-c8c4-418f-a425-dbadd60a7126',
  'ed695600-d303-4d82-9049-9f87985230f1',
  'f1a026ca-2361-4436-b8dc-