In [None]:
''' Install dependencies '''
# Run these once for the environment this kernel uses. Inside a notebook prefer
# `%pip install ...` over `!pip`/shell pip so the packages land in the kernel's
# own environment, and pin versions if the notebook must stay reproducible.
#pip install pypdf
#pip install langchain langchain_community langchain_core langchain_openai langchain_text_splitters
#pip install chromadb
#pip install langchainhub
#pip install -U langchain-anthropic
# NOTE(review): later cells also import `dotenv` and `bs4` and use PyMuPDFLoader;
# those presumably need python-dotenv, beautifulsoup4 and pymupdf — confirm and
# add them to this list.

In [7]:
''' Load Claude AI API key from .env '''
import os
import dotenv
from dotenv import load_dotenv

# Read key/value pairs from a .env file into the process environment.
load_dotenv()

# Fail early with an actionable message: assigning None into os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
if not anthropic_api_key:
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set; add it to your .env file or environment."
    )
os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key

In [4]:
''' Set up necessary libraries and environment '''
import bs4
import langchainhub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_anthropic import ChatAnthropic

In [41]:
''' Set up document loader '''
from langchain_community.document_loaders import PyPDFLoader #alternative: PyMuPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader #alternative: PyPDFLoader

# Location of the policy PDF, taken from the PDF_PATH environment variable.
policy_pdf = os.getenv("PDF_PATH")
if not policy_pdf:
    # Surface the misconfiguration here instead of handing None to the loader.
    raise ValueError("PDF_PATH is not set; add it to your .env file or environment.")

# NOTE(review): extract_tables_settings is passed through unchanged — confirm the
# installed langchain_community version accepts it and that {"enabled": True}
# actually enables table extraction.
loader = PyMuPDFLoader(policy_pdf, extract_tables_settings={"enabled": True})

# Loaded documents are consumed by the redaction and inspection cells below.
documents = loader.load()

In [31]:
''' Redact sensitive information from documents (PII)'''

import getpass
import re
from langchain_core.documents import Document

# Prompt via getpass rather than input(): the policy number is exactly the value
# being redacted, so it should not be echoed into the cell's visible output.
policy_number = getpass.getpass("Enter your policy number (used only for one-time redaction): ").strip()

# Regexes for PII to mask. Each key is upper-cased and embedded in the
# placeholder that replaces a match, e.g. "email" -> "[Redacted_EMAIL]".
redaction_patterns = {
    "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    "government_id": r"\b\d{6}-\d{2}-\d{4}\b",  # 123456-78-9000
}

def redact_sensitive_info(text, policy_number):
    """Return ``text`` with known PII replaced by placeholder tags.

    Args:
        text: Raw text to scrub.
        policy_number: Policy number to mask. It is matched literally, so
            characters such as ``.`` or ``+`` have no special meaning. A blank
            or missing value is skipped.

    Returns:
        The text with every ``redaction_patterns`` match replaced by
        ``[Redacted_<NAME>]`` (pattern name upper-cased) and every occurrence
        of the policy number replaced by ``[Redacted_policy_number]``.
    """
    redacted = text
    for pattern_name, pattern in redaction_patterns.items():
        redacted = re.sub(pattern, f"[Redacted_{pattern_name.upper()}]", redacted)
    # An empty search string matches at every position and would splice the
    # placeholder between all characters, so only substitute a real value.
    if policy_number:
        redacted = redacted.replace(policy_number, "[Redacted_policy_number]")
    return redacted

# Rebuild each loaded document as a fresh Document holding only redacted text.
# NOTE(review): the loader's metadata is not carried over — presumably on purpose,
# since it could itself contain PII; confirm before relying on it downstream.
redacted_documents = [
    Document(
        page_content=redact_sensitive_info(page.page_content, policy_number=policy_number)
    )
    for page in documents
]

# From here on `documents` refers to the redacted copies.
documents = redacted_documents

In [42]:
# Show the first 500 characters of the first redacted document as the cell output
first_page_text = documents[0].page_content
first_page_text[:500]

'\uf0b7\n\uf0b7\n\uf0b7\n\uf0b7\n\uf0b7\n\uf0b7'

In [43]:
# Report how many documents were loaded, then preview up to 1000 characters of the first one
page_count = len(documents)
page_one_snippet = documents[0].page_content[:1000]
print("Number of pages loaded:", page_count)
print("Page 1 text snippet:\n", page_one_snippet)

Number of pages loaded: 84
Page 1 text snippet:
 





