In [None]:
!pip install boto3 pdfplumber chromadb langchain pymupdf pytesseract pillow tqdm

In [None]:
import boto3
import pdfplumber
import re
import json
import io
import chromadb
from chromadb.config import Settings
from botocore.exceptions import ClientError
from langchain.text_splitter import RecursiveCharacterTextSplitter
import fitz
import pytesseract
from PIL import Image
from tqdm.notebook import tqdm
import pandas as pd

# AWS setup
s3_client = boto3.client('s3')
bedrock_client = boto3.client('bedrock-runtime')
bucket_name = 'your-bucket-name'  # Replace with your S3 bucket name
prefix = 'path/to/pdfs/'  # Optional folder in bucket

# Initialize Chroma
# The default client keeps everything in memory (nothing is written to disk),
# so no Settings override is needed.
chroma_client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises if "pdf_embeddings" already exists in the running kernel.
collection = chroma_client.get_or_create_collection(name="pdf_embeddings")

# Semantic text splitter
# chunk_size / chunk_overlap are measured in characters (length_function=len).
# Separators are tried in order: paragraph, line, sentence, word, character.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=30000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len
)

In [None]:
def generate_titan_embedding(text, model_id="amazon.titan-embed-text-v2:0", dimensions=1024, normalize=True):
    """Embed ``text`` with an Amazon Titan text-embedding model via Bedrock.

    Args:
        text: The string to embed.
        model_id: Bedrock model identifier passed to ``invoke_model``.
        dimensions: Value sent as the ``dimensions`` field of the request.
        normalize: Value sent as the ``normalize`` field of the request.

    Returns:
        The ``embedding`` value from the model response, or ``None`` when
        ``text`` is empty/whitespace-only or Bedrock raises a ``ClientError``
        (the error is printed in that case).
    """
    # Blank input has nothing to embed: return the same "no embedding" signal
    # callers already handle, without making a Bedrock call.
    if not text or not text.strip():
        return None
    try:
        body = json.dumps({
            "inputText": text,
            "dimensions": dimensions,
            "normalize": normalize
        })
        response = bedrock_client.invoke_model(
            body=body,
            modelId=model_id,
            accept='application/json',
            contentType='application/json'
        )
        # response['body'] is a stream; read it once and decode the JSON payload.
        response_body = json.loads(response['body'].read())
        return response_body['embedding']
    except ClientError as e:
        print(f"Error generating embedding: {e}")
        return None

def extract_urls(text):
    """Return every URL-like substring of ``text``, in order of appearance.

    A match starts with ``http://``, ``https://`` or ``www.`` and runs until
    the next whitespace, ``<``, ``>`` or ``"`` character.
    """
    matches = re.finditer(r'https?://[^\s<>"]+|www\.[^\s<>"]+', text)
    return [match.group(0) for match in matches]

def process_pdf(file_content, file_key, include_images=False):
    """Extract embeddable chunks from a PDF held in memory.

    For every page this collects, in order: the page text split by
    ``text_splitter``, each table rendered as comma-separated rows (cut to
    40000 characters), and every URL found in the page text. When
    ``include_images`` is true, embedded images are additionally run through
    OCR and the recognised text is split the same way.

    Args:
        file_content: Raw bytes of the PDF.
        file_key: Identifier stored in each chunk's metadata under ``"file"``.
        include_images: Whether to OCR images embedded in the PDF.

    Returns:
        ``(chunks, metadata)``: two parallel lists. ``chunks`` holds strings;
        ``metadata`` holds dicts with keys ``"file"``, ``"page"`` (1-based)
        and ``"type"`` (``"text"``, ``"table"``, ``"url"`` or ``"image"``).
    """
    chunks = []
    metadata = []

    def _add(chunk, page_number, kind):
        # Keep the two output lists in lock-step.
        chunks.append(chunk)
        metadata.append({"file": file_key, "page": page_number, "type": kind})

    with pdfplumber.open(io.BytesIO(file_content)) as pdf:
        # The PDF object itself is not iterable; its pages live on `.pages`.
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            if text:
                for chunk in text_splitter.split_text(text):
                    _add(chunk, page_number, "text")

            for table in page.extract_tables():
                # Empty cells come back as None; render them as empty strings
                # so str.join does not raise TypeError.
                rows = [
                    ",".join("" if cell is None else str(cell) for cell in row)
                    for row in table
                    if row
                ]
                table_text = "\n".join(rows)
                if table_text:
                    _add(table_text[:40000], page_number, "table")

            for url in extract_urls(text):
                _add(url, page_number, "url")

    if include_images:
        doc = fitz.open(stream=file_content, filetype="pdf")
        try:
            for page_number, page in enumerate(doc, start=1):
                for img in page.get_images():
                    xref = img[0]  # first field of the image entry is its xref
                    base_image = doc.extract_image(xref)
                    image = Image.open(io.BytesIO(base_image["image"]))
                    ocr_text = pytesseract.image_to_string(image)
                    if ocr_text.strip():
                        for chunk in text_splitter.split_text(ocr_text):
                            _add(chunk, page_number, "image")
        finally:
            # Release the document even if image decoding or OCR fails.
            doc.close()
    return chunks, metadata

In [None]:
# List every PDF under the prefix. A single list_objects_v2 call returns at
# most 1,000 keys, so page through the listing rather than reading one response.
paginator = s3_client.get_paginator('list_objects_v2')
pdf_files = [
    obj['Key']
    for listing_page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    for obj in listing_page.get('Contents', [])
    if obj['Key'].lower().endswith('.pdf')  # also accept ".PDF"
]

results = []
for file_key in tqdm(pdf_files, desc="Processing PDFs"):
    print(f"Processing {file_key}...")
    try:
        obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
        file_content = obj['Body'].read()
    except ClientError as e:
        print(f"Error downloading {file_key}: {e}")
        continue
    try:
        chunks, metadata = process_pdf(file_content, file_key, include_images=False)
    except Exception as e:
        # Best effort: one unreadable PDF should not abort the whole run.
        print(f"Error processing {file_key}: {e}")
        continue

    # Keep only the chunks whose embedding call succeeded, preserving alignment
    # between embeddings, documents and metadata.
    embeddings = []
    valid_chunks = []
    valid_metadata = []
    for chunk, meta in zip(chunks, metadata):
        embedding = generate_titan_embedding(chunk)
        if embedding:
            embeddings.append(embedding)
            valid_chunks.append(chunk)
            valid_metadata.append(meta)
    if embeddings:
        # upsert (rather than add) so re-running this cell overwrites a file's
        # chunks instead of colliding on ids that already exist.
        collection.upsert(
            embeddings=embeddings,
            documents=valid_chunks,
            metadatas=valid_metadata,
            ids=[f"{file_key}_chunk_{i}" for i in range(len(valid_chunks))]
        )
    results.append({
        "file": file_key,
        # Total chunks extracted; compare with num_embeddings to spot
        # chunks whose embedding call failed.
        "num_chunks": len(chunks),
        "num_embeddings": len(embeddings)
    })

pd.DataFrame(results)

In [None]:
query = "Find information about machine learning"
query_embedding = generate_titan_embedding(query)
# generate_titan_embedding returns None when the Bedrock call fails; stop here
# with a clear message instead of passing None on to the vector store.
if query_embedding is None:
    raise RuntimeError("Could not generate an embedding for the query.")
results = collection.query(query_embeddings=[query_embedding], n_results=5)

# The query was made with a single embedding, so the hits are at index 0 of
# each result list.
query_results = []
for doc, meta, dist in zip(results['documents'][0], results['metadatas'][0], results['distances'][0]):
    query_results.append({
        "file": meta['file'],
        "page": meta['page'],
        "type": meta['type'],
        "distance": dist,
        # Show at most the first 200 characters of each hit.
        "content": (doc[:200] + "...") if len(doc) > 200 else doc
    })

pd.DataFrame(query_results)