In [None]:
!pip show boto3 pdfplumber chromadb langchain tqdm

In [None]:
import boto3
import pdfplumber
import re
import json
import io
import chromadb
from chromadb.config import Settings
from botocore.exceptions import ClientError
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm
import pandas as pd
import os
import shutil

# AWS setup
# No region or credentials are passed here, so both clients rely on boto3's
# default configuration resolution.
s3_client = boto3.client('s3')
bedrock_client = boto3.client('bedrock-runtime')
bucket_name = 'your-bucket-name'  # Replace with your S3 bucket name
prefix = 'path/to/pdfs/'

# Initialize Chroma
chroma_client = chromadb.Client(Settings(persist_directory=None))
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises if "pdf_embeddings" already exists in the running kernel.
collection = chroma_client.get_or_create_collection(name="pdf_embeddings")

# Semantic text splitter
# Splits on paragraph breaks first, then lines, sentences, words, and finally
# single characters, measuring chunk size in characters (length_function=len).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=30000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len
)

# Folder setup
# Start every run from an empty scratch folder so chunk files left over from a
# previous run are never mixed with the current ones.
temp_folder = "temp-pdf"
if os.path.exists(temp_folder):
    shutil.rmtree(temp_folder)  # Delete folder and contents
os.makedirs(temp_folder)

In [None]:
def generate_titan_embedding(text, model_id="amazon.titan-embed-text-v2:0", dimensions=1024, normalize=True):
    """Embed ``text`` with an Amazon Titan embedding model through Bedrock.

    Args:
        text: The string to embed.
        model_id: Bedrock model identifier passed as ``modelId``.
        dimensions: Sent as the ``dimensions`` field of the request body.
        normalize: Sent as the ``normalize`` field of the request body.

    Returns:
        The ``embedding`` field of the model's JSON response, or ``None`` when
        ``text`` is not a non-blank string or when the Bedrock call raises a
        ``ClientError`` (the error is printed).
    """
    # A blank or non-string input has nothing to embed; skip the network
    # round-trip and report "no embedding" the same way a failed call does.
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        body = json.dumps({
            "inputText": text,
            "dimensions": dimensions,
            "normalize": normalize
        })
        response = bedrock_client.invoke_model(
            body=body,
            modelId=model_id,
            accept='application/json',
            contentType='application/json'
        )
        # response['body'] is a stream; read it once and decode the JSON payload.
        response_body = json.loads(response['body'].read())
        return response_body['embedding']
    except ClientError as e:
        print(f"Error generating embedding: {e}")
        return None

def extract_urls(text):
    """Return the URLs found in ``text``, in order of appearance.

    A URL is any run starting with ``http://``, ``https://`` or ``www.`` and
    continuing up to the next whitespace, ``<``, ``>`` or ``"``. Trailing
    sentence punctuation (``. , ; : ! ?``) is removed from each match.

    Args:
        text: The text to scan; ``None`` or an empty string yields no URLs.

    Returns:
        A list of URL strings (possibly empty); duplicates are kept.
    """
    url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    # The pattern runs up to the next whitespace, so a URL that ends a sentence
    # or a list item drags the following punctuation along; trim it.
    return [match.rstrip('.,;:!?') for match in re.findall(url_pattern, text or "")]

def _save_chunk_file(temp_folder, file_name, meta, content):
    """Write one chunk and its metadata to ``temp_folder/file_name`` as UTF-8 text."""
    with open(os.path.join(temp_folder, file_name), 'w', encoding='utf-8') as f:
        f.write(f"Metadata: {json.dumps(meta)}\n\nContent:\n{content}")


def process_pdf(file_content, file_key, temp_folder, max_table_chars=40000):
    """Split a PDF into text, table and URL chunks and save each chunk to disk.

    For every page, in this order: the extracted text is split with the
    module-level ``text_splitter``; each extracted table is flattened to
    comma-separated rows; each URL found in the page text becomes its own chunk.
    Every chunk is also written to ``temp_folder`` as a ``.txt`` file whose name
    encodes the source file, 1-based page number, chunk type and index.

    Args:
        file_content: Raw bytes of the PDF.
        file_key: Identifier of the source file; stored in each chunk's
            metadata and (with ``/`` replaced by ``_``) used in file names.
        temp_folder: Existing directory that receives the chunk files.
        max_table_chars: Maximum number of characters of a table kept in the
            returned chunk. The file on disk always holds the full table.

    Returns:
        ``(chunks, metadata)``: two parallel lists, where ``metadata[i]`` is a
        dict with keys ``file``, ``page`` and ``type`` describing ``chunks[i]``.
    """
    chunks = []
    metadata = []
    file_key_safe = file_key.replace('/', '_')  # Replace invalid characters for filenames
    with pdfplumber.open(io.BytesIO(file_content)) as pdf:
        # The PDF object itself is not iterable; its pages live in `pdf.pages`.
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            if text:
                for idx, chunk in enumerate(text_splitter.split_text(text)):
                    meta = {"file": file_key, "page": page_num, "type": "text"}
                    chunks.append(chunk)
                    metadata.append(meta)
                    _save_chunk_file(
                        temp_folder, f"{file_key_safe}_page{page_num}_text_{idx}.txt", meta, chunk
                    )

            for table_idx, table in enumerate(page.extract_tables()):
                # Cells without text come back as None; render them as empty
                # fields instead of letting str.join raise a TypeError.
                table_text = "\n".join(
                    ",".join("" if cell is None else str(cell) for cell in row)
                    for row in table if row
                )
                # Skip tables that contain nothing but separators.
                if table_text.strip(", \n"):
                    meta = {"file": file_key, "page": page_num, "type": "table"}
                    chunks.append(table_text[:max_table_chars])
                    metadata.append(meta)
                    _save_chunk_file(
                        temp_folder, f"{file_key_safe}_page{page_num}_table_{table_idx}.txt", meta, table_text
                    )

            for url_idx, url in enumerate(extract_urls(text)):
                meta = {"file": file_key, "page": page_num, "type": "url"}
                chunks.append(url)
                metadata.append(meta)
                _save_chunk_file(
                    temp_folder, f"{file_key_safe}_page{page_num}_url_{url_idx}.txt", meta, url
                )
    return chunks, metadata

In [None]:
# Collect every PDF key under the prefix. A single list_objects_v2 call returns
# at most 1,000 keys, so walk all result pages; match the extension without
# regard to case so files named ".PDF" are not skipped.
paginator = s3_client.get_paginator('list_objects_v2')
pdf_files = [
    obj['Key']
    for listing_page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    for obj in listing_page.get('Contents', [])
    if obj['Key'].lower().endswith('.pdf')
]

# Download, chunk, embed and index each PDF; a failure on one file is reported
# and the loop moves on to the next file.
results = []
for file_key in tqdm(pdf_files, desc="Processing PDFs"):
    print(f"Processing {file_key}...")
    try:
        obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
        file_content = obj['Body'].read()
    except ClientError as e:
        print(f"Error downloading {file_key}: {e}")
        continue
    try:
        chunks, metadata = process_pdf(file_content, file_key, temp_folder)
    except Exception as e:
        print(f"Error processing {file_key}: {e}")
        continue

    # Keep only the chunks that produced an embedding so the three lists handed
    # to Chroma stay aligned.
    embeddings = []
    valid_chunks = []
    valid_metadata = []
    for chunk, meta in zip(chunks, metadata):
        embedding = generate_titan_embedding(chunk)
        if embedding:
            embeddings.append(embedding)
            valid_chunks.append(chunk)
            valid_metadata.append(meta)

    if embeddings:
        # upsert (not add) so re-running the cell overwrites the entries for
        # this file instead of colliding on ids that already exist.
        collection.upsert(
            embeddings=embeddings,
            documents=valid_chunks,
            metadatas=valid_metadata,
            ids=[f"{file_key}_chunk_{i}" for i in range(len(valid_chunks))]
        )

    # Chunk files are named "<safe key>_page<N>_...", so include "_page" in the
    # prefix to avoid picking up files of another key that merely starts with
    # this one.
    saved_prefix = file_key.replace('/', '_') + "_page"
    results.append({
        "file": file_key,
        "num_chunks": len(valid_chunks),
        "num_embeddings": len(embeddings),
        "saved_files": sorted(f for f in os.listdir(temp_folder) if f.startswith(saved_prefix))
    })

pd.DataFrame(results)

In [None]:
# Same ingestion as the cell above, reporting only per-file counts.
# Collect every PDF key under the prefix. A single list_objects_v2 call returns
# at most 1,000 keys, so walk all result pages; match the extension without
# regard to case so files named ".PDF" are not skipped.
paginator = s3_client.get_paginator('list_objects_v2')
pdf_files = [
    obj['Key']
    for listing_page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    for obj in listing_page.get('Contents', [])
    if obj['Key'].lower().endswith('.pdf')
]

results = []
for file_key in tqdm(pdf_files, desc="Processing PDFs"):
    print(f"Processing {file_key}...")
    try:
        obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
        file_content = obj['Body'].read()
    except ClientError as e:
        print(f"Error downloading {file_key}: {e}")
        continue
    try:
        # process_pdf takes (file_content, file_key, temp_folder); the former
        # `include_images=False` keyword is not part of its signature and made
        # every call raise a TypeError that the handler below swallowed.
        chunks, metadata = process_pdf(file_content, file_key, temp_folder)
    except Exception as e:
        print(f"Error processing {file_key}: {e}")
        continue

    # Keep only the chunks that produced an embedding so the three lists handed
    # to Chroma stay aligned.
    embeddings = []
    valid_chunks = []
    valid_metadata = []
    for chunk, meta in zip(chunks, metadata):
        embedding = generate_titan_embedding(chunk)
        if embedding:
            embeddings.append(embedding)
            valid_chunks.append(chunk)
            valid_metadata.append(meta)

    if embeddings:
        # upsert (not add): these ids are the same ones the previous cell
        # writes, so an add would collide with the existing entries.
        collection.upsert(
            embeddings=embeddings,
            documents=valid_chunks,
            metadatas=valid_metadata,
            ids=[f"{file_key}_chunk_{i}" for i in range(len(valid_chunks))]
        )

    results.append({
        "file": file_key,
        "num_chunks": len(valid_chunks),
        "num_embeddings": len(embeddings)
    })

pd.DataFrame(results)

In [None]:
# Embed a free-text query and retrieve the five nearest chunks from Chroma.
query = "Find information about machine learning"
query_embedding = generate_titan_embedding(query)
# generate_titan_embedding returns None when the Bedrock call fails; stop here
# with a clear message instead of handing None to Chroma.
if query_embedding is None:
    raise RuntimeError("Could not generate an embedding for the query; see the error printed above.")
results = collection.query(query_embeddings=[query_embedding], n_results=5)

# Results are grouped per query embedding; index [0] selects the single query
# sent above. Long documents are shortened to a 200-character preview.
query_results = []
for doc, meta, dist in zip(results['documents'][0], results['metadatas'][0], results['distances'][0]):
    query_results.append({
        "file": meta['file'],
        "page": meta['page'],
        "type": meta['type'],
        "distance": dist,
        "content": (doc[:200] + "...") if len(doc) > 200 else doc
    })

pd.DataFrame(query_results)