In [None]:
!pip install langchain chromadb sentence-transformers pymupdf


In [None]:
!pip install -q pypdf langchain openai faiss-cpu tiktoken sentence-transformers

In [None]:
!pip install -q "unstructured[pdf]"

In [None]:
!pip install -q gradio

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder on the mounted Drive that holds the PDFs to be loaded and indexed.
pdf_folder_path = '/content/drive/MyDrive/LexAi_pdfs'

In [None]:
!pip install -q langchain-community unstructured pypdf

In [None]:
import os
from pypdf import PdfReader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_pdfs_from_folder(folder_path):
    """Extract the text of every PDF found directly inside ``folder_path``.

    Each file is first read with ``PdfReader``. If that produces 100 or fewer
    characters of non-whitespace-trimmed text, the file is re-read with
    ``UnstructuredPDFLoader`` instead.

    Args:
        folder_path: Directory to scan. Sub-directories are not traversed.

    Returns:
        A list of dicts, one per successfully processed PDF, in file-name
        order. Each dict has the keys ``'source'`` (file name), ``'text'``
        (extracted text) and ``'pages'`` (page count, or the string
        ``'unknown'`` when the fallback loader was used).

    Files whose processing raises are reported with ``print`` and skipped, so
    one bad PDF does not abort the whole load.
    """
    documents = []
    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for file in sorted(os.listdir(folder_path)):
        # Case-insensitive match so that e.g. "scan.PDF" is not silently ignored.
        if not file.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            # Try structured PDF first
            reader = PdfReader(file_path)
            # `or ""` guards against a page that yields None; the newline keeps
            # the last word of one page from fusing with the first of the next.
            text = "\n".join((page.extract_text() or "") for page in reader.pages)
            # Strip before measuring so page separators / blank pages do not
            # make an image-only PDF look like it contains real text.
            if len(text.strip()) > 100:  # If we got reasonable text
                documents.append({
                    'source': file,
                    'text': text,
                    'pages': len(reader.pages)
                })
            else:
                # Fall back to unstructured loader
                loader = UnstructuredPDFLoader(file_path)
                data = loader.load()
                if not data:
                    # Reported by the handler below, like any other failure.
                    raise ValueError("no content could be extracted")
                documents.append({
                    'source': file,
                    # Keep everything the loader returned, not just the first item.
                    'text': "\n\n".join(item.page_content for item in data),
                    'pages': 'unknown'
                })
        except Exception as e:
            # Deliberate best-effort: report the failure and continue.
            print(f"Error processing {file}: {str(e)}")
    return documents

# Read every PDF in the configured folder and report how many were loaded.
legal_docs = load_pdfs_from_folder(folder_path=pdf_folder_path)
print(f"Loaded {len(legal_docs)} PDF documents")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split on paragraphs, then lines, then sentence ends, then words, then characters.
# The sentence separator "(?<=\\. )" is a regex look-behind, so the splitter must
# be told to treat separators as regular expressions; otherwise they are matched
# as literal strings and that pattern never matches any text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
    separators=["\n\n", "\n", "(?<=\\. )", " ", ""],
    is_separator_regex=True
)

# Flatten all documents into one list of chunk records. Chunk numbers start
# at 1 and restart for every source file.
document_chunks = [
    {
        'text': piece,
        'source': doc['source'],
        'chunk_num': position,
        'metadata': {
            'pages': doc['pages'],
            'file_name': doc['source']
        }
    }
    for doc in legal_docs
    for position, piece in enumerate(text_splitter.split_text(doc['text']), start=1)
]

print(f"Created {len(document_chunks)} text chunks from {len(legal_docs)} documents")

In [None]:
!pip install -q sentence-transformers chromadb langchain-chroma

In [None]:
!pip install -q langchain-huggingface sentence-transformers

In [None]:
# Step 5: Embed the chunks
from langchain_huggingface import HuggingFaceEmbeddings
# MiniLM sentence-embedding model, run on CPU, producing normalized vectors.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=dict(device='cpu'),
    encode_kwargs=dict(normalize_embeddings=True),
)


In [None]:
# Step 6: Convert chunks to LangChain Documents
from langchain_core.documents import Document

# One LangChain Document per chunk record; only the source file name and the
# chunk number are carried over as metadata.
documents = [
    Document(
        page_content=record["text"],
        metadata={key: record[key] for key in ("source", "chunk_num")},
    )
    for record in document_chunks
]

In [None]:
!pip install -q chromadb langchain-chroma

In [None]:
# Step 7: Create Chroma vector DB
from langchain_chroma import Chroma

# Give every chunk a stable id built from its file name and chunk number.
# Without explicit ids, random ones are generated, so re-running this cell
# against the same persist_directory would add every chunk a second time;
# with stable ids the existing entries are updated instead.
chunk_ids = [
    f"{doc.metadata['source']}::{doc.metadata['chunk_num']}" for doc in documents
]

vector_db = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    ids=chunk_ids,
    persist_directory="./legal_docs_chromadb"
)

print(f"Created ChromaDB with {len(documents)} legal document chunks!")


In [None]:
# Step 8: Create Retriever with top-k filtering
# Each query returns the 3 best-matching chunks from the vector store.
retriever = vector_db.as_retriever(search_kwargs=dict(k=3))

In [None]:
!pip install -q transformers torch

In [None]:
# Step 9: Load LLM (Flan-T5)
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Local Flan-T5 text2text pipeline on CPU, wrapped so LangChain can use it as an LLM.
hf_pipe = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
    device="cpu",
)

llm = HuggingFacePipeline(pipeline=hf_pipe)


In [None]:
# Step 10: Create RetrievalQA Chain
# The "stuff" chain type puts all retrieved chunks into a single prompt; source
# documents are returned with the answer so they can be listed afterwards.
legal_qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
# Step 11: Ask Legal Question
query = "What is the punishment under Section 302 of the PPC?"
result = legal_qa.invoke(dict(query=query))

# Heading and answer on separate lines.
print("\n⚖️ Legal Answer:", result["result"], sep="\n")

# One line per retrieved chunk that was used to build the answer.
print("\n📚 Source Documents:")
for doc in result["source_documents"]:
    print(f"- {doc.metadata['source']} (Chunk {doc.metadata['chunk_num']})")


In [None]:
import shutil

# Zip the persisted Chroma directory into /content/legal_docs_chromadb.zip.
shutil.make_archive(
    base_name='/content/legal_docs_chromadb',
    format='zip',
    root_dir='./legal_docs_chromadb',
)


In [None]:
# Download the zipped vector store from the Colab runtime via the browser.
from google.colab import files
files.download('/content/legal_docs_chromadb.zip')