In [None]:
import os
import pickle
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# os.environ only accepts str values, so assigning os.getenv(...) directly
# raises TypeError when HF_TOKEN is absent. Re-export it only when it exists.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token is not None:
    os.environ["HF_TOKEN"] = _hf_token

# Embedding model used for indexing. The multilingual KaLM model
# ("HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5") was the
# alternative considered; the active choice is all-MiniLM-L6-v2.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Input dataset, output folder for the pickled chunks, and Chroma persistence path.
DATASET_PATH = "../rguktBasarDataset"
TRAINED_DATA_PATH = "trainedData"
CHROMA_DB_PATH = os.path.expanduser("~/chroma_db")
METADATA_FILE = os.path.join(TRAINED_DATA_PATH, "chroma_metadata.pkl")

# Make sure the folder that will receive METADATA_FILE exists before anything writes to it.
os.makedirs(TRAINED_DATA_PATH, exist_ok=True)

def load_pdfs_from_subfolders(base_path):
    """Walk ``base_path`` recursively and gather the pages of every PDF found.

    A file counts as a PDF when its name ends in ``.pdf`` (case-insensitive).
    Each one is read with ``PyPDFLoader``; progress, empty results, and load
    failures are reported on stdout, and a failing file is skipped rather than
    aborting the walk.

    Returns:
        A list holding everything the loaders returned, in walk order.
        Empty when nothing could be loaded.
    """
    collected = []
    for folder, _subdirs, names in os.walk(base_path):
        for name in names:
            # Guard clause: anything that is not a PDF is ignored outright.
            if not name.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(folder, name)
            try:
                docs = PyPDFLoader(file_path).load()
                if not docs:
                    print(f"Warning: No text extracted from {file_path}")
                else:
                    collected.extend(docs)
                    print(f"Loaded {len(docs)} pages from: {file_path}")
            except Exception as e:
                # Best effort: one unreadable file must not stop the whole scan.
                print(f"Error loading {file_path}: {e}")
    return collected

def create_and_save_vector_database():
    """Build the Chroma vector store from the dataset PDFs and persist it.

    Loads every PDF under ``DATASET_PATH``, splits the pages into overlapping
    chunks, embeds the chunks with the module-level ``embeddings`` into a
    Chroma store persisted at ``CHROMA_DB_PATH``, and pickles the chunk list
    to ``METADATA_FILE``.

    Returns:
        bool: ``True`` when the store and the metadata file were both written;
        ``False`` when no documents or chunks were produced, or when creating
        the Chroma store failed (the reason is printed).

    Errors raised while writing ``METADATA_FILE`` are not caught here.
    """
    print("Training model and creating vector database...")

    # Load all PDFs from dataset directory (including subfolders)
    docs = load_pdfs_from_subfolders(DATASET_PATH)
    if not docs:
        print("Error: No documents found! Check dataset path and PDF contents.")
        return False

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
    final_documents = text_splitter.split_documents(docs)
    if not final_documents:
        print("Error: No valid text chunks generated for embeddings.")
        return False

    # Create Chroma vector database. The returned store object is not needed
    # here; only the persisted directory is used afterwards.
    # NOTE(review): if CHROMA_DB_PATH already holds a collection from an
    # earlier run, this call presumably adds to it rather than replacing it -
    # confirm, and clear the directory before re-indexing if duplicates matter.
    try:
        Chroma.from_documents(final_documents, embeddings, persist_directory=CHROMA_DB_PATH)
    except Exception as e:
        print(f"Error creating vector database: {e}")
        return False

    # Save the chunk list next to the store
    with open(METADATA_FILE, "wb") as f:
        pickle.dump(final_documents, f)

    print("Vector database created and saved to disk.")
    return True


if __name__ == "__main__":
    # Announce readiness only when the build actually succeeded; failures
    # have already been reported by the function itself.
    if create_and_save_vector_database():
        print("Vector database is ready for use.")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Training model and creating vector database...
Loaded 5 pages from: ../rguktBasarDataset\about_rgukt\about_rgukt.pdf


Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 31 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)


Loaded 7 pages from: ../rguktBasarDataset\about_rgukt\campusFacilities.pdf
Loaded 4 pages from: ../rguktBasarDataset\about_rgukt\campusLife.pdf
Loaded 6 pages from: ../rguktBasarDataset\acadamic\academicSection.pdf


Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)


Loaded 1021 pages from: ../rguktBasarDataset\acadamic\Academic_Regulations_Hand_Book.pdf
Loaded 4 pages from: ../rguktBasarDataset\acadamic\administrativeSection.pdf


Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)


Loaded 3 pages from: ../rguktBasarDataset\departments\bioScienceDept.pdf
Loaded 4 pages from: ../rguktBasarDataset\departments\chemicalDept.pdf


Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)


Loaded 3 pages from: ../rguktBasarDataset\departments\chemistryDept.pdf
Loaded 5 pages from: ../rguktBasarDataset\departments\civilDept.pdf


Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)


Loaded 2 pages from: ../rguktBasarDataset\departments\civil_dept_info.pdf
Loaded 5 pages from: ../rguktBasarDataset\departments\cse.pdf
Loaded 3 pages from: ../rguktBasarDataset\departments\cseDept.pdf


Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)


Loaded 4 pages from: ../rguktBasarDataset\departments\ece.pdf
Loaded 3 pages from: ../rguktBasarDataset\departments\eceDept.pdf


Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)


Loaded 3 pages from: ../rguktBasarDataset\departments\electricalDept.pdf
Loaded 3 pages from: ../rguktBasarDataset\departments\materialsAndMetallurigicalDept.pdf
Loaded 3 pages from: ../rguktBasarDataset\departments\mechanicalDept.pdf


Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)


Loaded 2 pages from: ../rguktBasarDataset\other_files\civil_dept_info.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\CONFERENCE ROOM_ece.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\csepeo.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\DEPT LIBRARY_ece.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\ece_info.pdf


Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)


Loaded 3 pages from: ../rguktBasarDataset\other_files\ECE_PEO_PO.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\eee_Committees_in_the_Department.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\eee_info.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\eee_peo_po.pdf
Loaded 3 pages from: ../rguktBasarDataset\other_files\eee_RESEARCH_PUBLICATIONS.pdf


Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)


Loaded 2 pages from: ../rguktBasarDataset\other_files\FACULTYPUBLICATIONS.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\Faculty_Publications_mme.pdf
Loaded 1 pages from: ../rguktBasarDataset\other_files\humanities.pdf


Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)


Loaded 5 pages from: ../rguktBasarDataset\other_files\labs_ece.pdf
Loaded 6 pages from: ../rguktBasarDataset\other_files\Lab_Details.pdf
Loaded 1 pages from: ../rguktBasarDataset\other_files\Management.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\ME_Library.pdf
Loaded 4 pages from: ../rguktBasarDataset\other_files\me_peo_po.pdf


Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)


Loaded 4 pages from: ../rguktBasarDataset\other_files\PEO.pdf
Loaded 1 pages from: ../rguktBasarDataset\other_files\physics.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\RESEARCH AND DEVELOPMENT LABS_ece.pdf
Loaded 2 pages from: ../rguktBasarDataset\other_files\SEMINAR HALL_ece.pdf


Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)
Ignoring wrong pointing object 46 0 (offset 0)


Loaded 6 pages from: ../rguktBasarDataset\scholarship\SCHOLARSHIP SECTION.pdf
Loaded 11 pages from: ../rguktBasarDataset\tnp\trainingAndPlacements.pdf
