Documents

In [32]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
import fitz  # PyMuPDF
import chromadb
from sentence_transformers import SentenceTransformer  # Local model for embeddings

# Directory containing PDFs
Documents = os.path.join(os.getcwd(), "allDocs")

# Initialize Chroma DB client and create or get the collection
chroma_client = chromadb.Client()
collection_name = "Aibot"
if collection_name in [col.name for col in chroma_client.list_collections()]:
    collection = chroma_client.get_collection(name=collection_name)
else:
    collection = chroma_client.create_collection(name=collection_name)

# Initialize the embeddings model (local model)
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')  

# Dictionaries to hold page content, chunks, images, and embeddings
page_text_dict = {}
chunked_text_dict = {}
images_dict = {}

# Iterate over all files in the folder
for pdf_file in os.listdir(Documents):  # List files in the directory
    if pdf_file.endswith(".pdf"):  # Check if the file is a PDF
        pdf_path = os.path.join(Documents, pdf_file)
        
        # Load PDF using PyPDFLoader for text extraction
        loader = PyPDFLoader(pdf_path)
        
        # Extract text from each page and store in a dictionary
        pages = []
        async for page in loader.alazy_load():
            pages.append(page)
        
        for i, page in enumerate(pages):
            page_number = i + 1
            page_content = page.page_content
            
            # Store the page content in the dictionary
            page_text_dict[f"Page {page_number} of {pdf_file}"] = page_content
            
            # Chunk the page content into words to avoid splitting words
            splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100, separator=" ")
            chunks = splitter.split_text(page_content)
            
            # Store the chunks in the chunked_text_dict
            chunked_text_dict[f"Page {page_number} of {pdf_file} Chunks"] = chunks
            
            # Generate embeddings for each chunk and store them in Chroma DB
            embeddings = embeddings_model.encode(chunks, show_progress_bar=True)
            for index, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                unique_id = f"{pdf_file}_page_{page_number}_chunk_{index}"  # Create a unique ID
                collection.add(
                    ids=[unique_id],  # Add the unique ID here
                    documents=[chunk],
                    embeddings=[embedding.tolist()],  # Convert numpy array to list
                    metadatas=[{"source": pdf_file, "page": page_number}]
                )

        # Extract images from the PDF using PyMuPDF
        with fitz.open(pdf_path) as pdf_document:
            for i in range(len(pdf_document)):
                page = pdf_document[i]
                images = page.get_images(full=True)  # Get all images on the page
                for img_index, img in enumerate(images):
                    xref = img[0]
                    base_image = pdf_document.extract_image(xref)
                    image_bytes = base_image["image"]
                    
                    # Save image to file or process as needed
                    image_filename = f"{pdf_file}_page_{i + 1}_img_{img_index + 1}.png"
                    with open(os.path.join(Documents, image_filename), "wb") as image_file:
                        image_file.write(image_bytes)
                    
                    # Store image metadata in images_dict (optional)
                    images_dict.setdefault(f"Page {page_number} of {pdf_file} Images", []).append(image_filename)

        # Print the metadata and first page content (optional)
        if pages:
            print(f"{pages[0].metadata}\n")
            print(pages[0].page_content)

# Example: Access the text of a specific page or chunks
print(page_text_dict)
print(chunked_text_dict)
print(images_dict)  # Print the extracted images metadata


Batches: 100%|██████████| 1/1 [00:00<00:00,  6.80it/s]
Add of existing embedding ID: Fundamentals of Information Technology.pdf_page_1_chunk_0
Insert of existing embedding ID: Fundamentals of Information Technology.pdf_page_1_chunk_0
Add of existing embedding ID: Fundamentals of Information Technology.pdf_page_1_chunk_1
Insert of existing embedding ID: Fundamentals of Information Technology.pdf_page_1_chunk_1
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.93it/s]
Add of existing embedding ID: Fundamentals of Information Technology.pdf_page_2_chunk_0
Insert of existing embedding ID: Fundamentals of Information Technology.pdf_page_2_chunk_0
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.03it/s]
Add of existing embedding ID: Fundamentals of Information Technology.pdf_page_3_chunk_0
Insert of existing embedding ID: Fundamentals of Information Technology.pdf_page_3_chunk_0
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.28it/s]
Add of existing embedding ID: Fundamentals of Information Techno

{'source': 'c:\\Users\\JK\\Documents\\Terna-Chatbot\\Johnson\\oldfiles\\allDocs\\Fundamentals of Information Technology.pdf', 'page': 0}

Univ ersity of South Florida Univ ersity of South Florida 
Digital Commons @ Univ ersity of Digital Commons @ Univ ersity of 
South Florida South Florida 
FUND AMEN TALS OF INFORM ATION 
TECHNOL OGY: T extbook – English The Modernization of Digital Information 
Technology 
2023 
Fundamentals of Information T echnology Fundamentals of Information T echnology 
Shambha vi Ro y 
Clint on Daniel 
Univ ersity of South Florida , cedanie2@usf.edu 
Manish Agr awal 
Univ ersity of South Florida , magr awal@usf.edu 
Follow this and additional works at: https:/ /digitalcommons.usf.edu/dit_tb_eng 
Scholar Commons Citation Scholar Commons Citation 
Roy, Shambha vi; Daniel, Clint on; and Agr awal, Manish, "F undamentals of Information T echnology " (2023). 
FUND AMEN TALS OF INFORM ATION TECHNOL OGY: T extbook – English . 19. 
https:/ /digitalcommons.usf.edu/dit_tb

In [57]:
import os
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from tqdm import tqdm
import logging
import uuid


# Set up logging
logging.basicConfig(level=logging.INFO)

# Set up directories and parameters
DOCUMENTS_DIR = os.path.join(os.getcwd(), "allDocs")  # Update with your document path
CHROMA_DB_PATH = os.path.join(os.getcwd(), "mychromadb")  # Update with your ChromaDB path
MAX_CHUNK_SIZE = 384  # Maximum chunk size for text

# Custom function to load PDF documents
def load_pdf_documents(directory):
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            filepath = os.path.join(directory, filename)
            try:
                with open(filepath, 'rb') as file:
                    reader = PyPDF2.PdfReader(file)
                    text = ""
                    for page in reader.pages:
                        text += page.extract_text() or ""
                    documents.append({'text': text})
                logging.info(f"Loaded document: {filename}")
            except Exception as e:
                logging.error(f"Error loading document {filename}: {e}")
    return documents

# Step 1: Load Documents
def load_documents(directory):
    documents = load_pdf_documents(directory)
    if not documents:
        logging.warning("No documents found in the directory.")
    return documents

# Step 2: Split Text into Chunks
def split_text_into_chunks(text):
    # Recursive splitter with more aggressive splitting
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=384,      # Target chunk size
        chunk_overlap=20,    # Overlap to keep context
        separators=["\n", " ", ""]  # Break first by lines, then by spaces, then by characters
    )
    
    chunks = text_splitter.split_text(text)
    logging.info(f"Generated {len(chunks)} chunks.")
    return chunks

# Step 3: Create Embeddings
def create_embeddings(chunks):
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Choose a suitable model
    embeddings = model.encode(chunks, show_progress_bar=True)
    return embeddings

# Step 4: Save to ChromaDB
def save_to_chromadb(chunks, embeddings):
    # Initialize ChromaDB client
    client = chromadb.Client()  # No path needed if using in-memory storage
    
    # Check if the collection already exists
    collection_name = "my_collection"
    
    # Retrieve existing collections
    existing_collections = client.list_collections()
    
    # Extract collection names properly from the returned objects
    existing_collection_names = [collection.name for collection in existing_collections]  # Use .name attribute
    
    if collection_name not in existing_collection_names:
        # Create a collection for document embeddings
        collection = client.create_collection(name=collection_name)
        logging.info(f"Created new collection: {collection_name}")
    else:
        logging.info(f"Collection {collection_name} already exists. Using the existing collection.")
        collection = client.get_collection(name=collection_name)

    # Add embeddings and their corresponding chunks to the collection
    for chunk, embedding in tqdm(zip(chunks, embeddings), total=len(chunks), desc="Saving to ChromaDB"):
        # Generate a unique ID for each chunk
        unique_id = str(uuid.uuid4())  # Generates a random UUID
        
        # Add the chunk, embedding, and unique ID to the collection
        collection.add(
            ids=[unique_id],          # Pass the unique ID
            documents=[chunk],        # Save the original text chunk
            embeddings=[embedding]    # Save the corresponding embedding
        )



def main():
    # Load documents
    logging.info("Loading documents...")
    documents = load_documents(DOCUMENTS_DIR)
    
    # Split documents into chunks
    logging.info("Splitting text into chunks...")
    chunks = split_text(documents)

    # Create embeddings for the chunks
    logging.info("Creating embeddings...")
    embeddings = create_embeddings(chunks)

    # Save embeddings to ChromaDB
    logging.info("Saving embeddings to ChromaDB...")
    save_to_chromadb(chunks, embeddings)

    logging.info("Process completed!")

if __name__ == "__main__":
    main()


INFO:root:Loading documents...
INFO:root:Loaded document: Algorithms - Erickson.pdf
INFO:root:Loaded document: Algorithms - Sedgewick, Wayne.pdf
INFO:root:Loaded document: Algorithms and Programming, Problems and Solutions - Shen.pdf
INFO:root:Loaded document: An Introduction to the Analysis of Algorithms - Flajolet.pdf
INFO:root:Loaded document: An Introduction to the Analysis of Algorithms - Soltys.pdf
INFO:root:Loaded document: Automata and Computability - Kozen.pdf
INFO:root:Loaded document: Fundamentals of Information Technology.pdf
INFO:root:Splitting text into chunks...
INFO:root:Creating embeddings...
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.68s/it]
INFO:root:Saving embeddings to ChromaDB...
INFO:root:Collection my_collection already exists. Using the existing collection.
Saving to ChromaDB: 100%|█