In [222]:
import os
# The Google API key must come from the environment and must never be written
# into the notebook (any key that was previously committed here should be
# treated as compromised and rotated).
# If the variable is missing, ask for it with a hidden prompt so the value is
# not stored in the notebook source or its outputs.
if not os.getenv("GOOGLE_API_KEY"):
    from getpass import getpass

    try:
        api_key = getpass("Enter your Google API key: ").strip()
    except (EOFError, NotImplementedError):
        # No interactive stdin is available (e.g. headless execution):
        # leave the key unset instead of aborting the whole cell.
        api_key = ""

    if api_key:
        os.environ["GOOGLE_API_KEY"] = api_key
    else:
        print("GOOGLE_API_KEY is not set; model initialization will fail until it is provided.")

from pypdf import PdfReader                              # PyPDF for PDF text extraction
import easyocr                                           # EasyOCR for image (JPG) text extraction
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_google_genai import GoogleGenerativeAI, ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
from langchain_community.vectorstores.faiss import FAISS    # FAISS vector store for embeddings
import gradio as gr
import argparse


In [223]:
# Model handles: all start unset and are filled in by initialize_models().
embed_model = llm = chat_llm = None

# Retrieval state: both start unset and are rebuilt by process_file() for each document.
vector_store = qa_chain = None

# Splitter shared by every document: 1000-character chunks with a 100-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

In [224]:
def initialize_models():
    """Create the Google Gemini embedding, completion, and chat models.

    On success the three module-level handles (``embed_model``, ``llm``,
    ``chat_llm``) are set together. On failure none of them is modified, so the
    globals are never left half-initialized.

    Returns:
        bool: True if all three models were created, False if any constructor
        raised (the error is printed).
    """
    global embed_model, llm, chat_llm

    print("Initializing Google Gemini models...")
    try:
        # Build into locals first; the globals are only touched once every
        # constructor has succeeded.
        new_embed_model = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
        new_llm = GoogleGenerativeAI(model="gemini-2.0-flash")
        new_chat_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
    except Exception as e:
        print(f"Error initializing models: {e}")
        return False

    embed_model, llm, chat_llm = new_embed_model, new_llm, new_chat_llm
    print("Models initialized successfully!")
    return True

In [225]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, one page per newline-terminated block.

    Pages for which PyPDF yields no text (``None`` or an empty string) are
    skipped entirely.
    """
    pdf = PdfReader(file_path)
    page_texts = (page.extract_text() for page in pdf.pages)
    return "".join(f"{content}\n" for content in page_texts if content)

In [226]:
# Cached EasyOCR reader. Constructing a Reader loads the OCR models, so it is
# created once on first use and reused for every later image.
_ocr_reader = None


def _get_ocr_reader():
    """Return the shared English, CPU-only EasyOCR reader, creating it on first use."""
    global _ocr_reader
    if _ocr_reader is None:
        _ocr_reader = easyocr.Reader(['en'], gpu=False)
    return _ocr_reader


def extract_text_from_image(file_path):
    """Extract English text from an image file using EasyOCR.

    Args:
        file_path: Path of the image to read.

    Returns:
        str: The detected text fragments joined by single spaces (an empty
        string when nothing is detected).
    """
    # detail=0 returns only the detected text strings.
    fragments = _get_ocr_reader().readtext(file_path, detail=0)
    return " ".join(fragments)


In [227]:
def _resolve_upload_path(uploaded_file):
    """Return the filesystem path of an upload, or None if there is none.

    Accepts a plain path string or path-like object (what ``gr.File`` delivers
    with ``type="filepath"``) as well as any object exposing the path through a
    ``.name`` attribute.
    """
    if isinstance(uploaded_file, (str, os.PathLike)):
        return os.fspath(uploaded_file)
    return getattr(uploaded_file, "name", None)


def process_file(uploaded_file):
    """Process an uploaded document and return its summary and main clauses.

    Extracts text from the file, splits it into chunks, builds a FAISS index
    and a RetrievalQA chain over the chunks (stored in the module-level
    ``vector_store`` and ``qa_chain``), then summarizes the document and asks
    the QA chain for its main clauses.

    Args:
        uploaded_file: A path string, a path-like object, or an object whose
            ``.name`` attribute holds the path. ``None`` means "no file".

    Returns:
        tuple[str, str]: ``(summary, clauses)``. On any failure the first item
        is a human-readable message and the second is an empty string.
    """
    global vector_store, qa_chain

    file_path = _resolve_upload_path(uploaded_file)
    if not isinstance(file_path, str) or not file_path:
        # Happens when the upload widget is cleared.
        return "No file uploaded.", ""

    # Initialize models if not already done
    if embed_model is None or llm is None or chat_llm is None:
        if not initialize_models():
            return "Error: Could not initialize AI models. Please check your Google API key.", ""

    try:
        # Determine file type by extension (case-insensitive)
        lowered_path = file_path.lower()
        if lowered_path.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        elif lowered_path.endswith(('.jpg', '.jpeg', '.png')):
            text = extract_text_from_image(file_path)
        else:
            return "Unsupported file format", ""

        if not text.strip():
            return "No text could be extracted from the file.", ""

        # Split text into chunks for embedding and retrieval
        chunks = text_splitter.split_text(text)

        # The summarize chain consumes Document objects rather than raw strings
        documents = [Document(page_content=chunk) for chunk in chunks]

        # Rebuild the FAISS vector store from scratch for this document
        vector_store = FAISS.from_texts(chunks, embedding=embed_model)

        # Plain similarity search over the four closest chunks
        retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

        # Build the RetrievalQA chain using the chat LLM
        qa_chain = RetrievalQA.from_chain_type(
            llm=chat_llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=False
        )

        # Summarize the document (map-reduce so large documents are handled chunk by chunk)
        summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
        summary_result = summarize_chain.run(documents)

        # Accept either a plain string or a dict carrying 'output_text'
        if isinstance(summary_result, dict):
            summary = summary_result.get('output_text', str(summary_result))
        else:
            summary = str(summary_result)

        # Identify main clauses using the RAG QA chain
        clause_query = "List the main clauses in the document and briefly describe each clause."

        # A clause-extraction failure must not discard the summary already produced
        try:
            clauses = str(qa_chain.run(clause_query))
        except Exception as clause_error:
            print(f"Error getting clauses: {clause_error}")
            clauses = "Could not extract clauses from the document."

        return summary, clauses

    except Exception as e:
        print(f"Detailed error: {str(e)}")
        return f"Error processing file: {str(e)}", ""

In [228]:
def answer_question(user_input, chat_history):
    """Answer a question about the processed document and update the chat.

    This is the callback for the question textbox, whose outputs are the
    textbox itself and the chatbot. The first return value is therefore the new
    textbox content (always empty, to clear the input) and the second is the
    updated message list.

    Args:
        user_input: The question typed by the user.
        chat_history: Existing list of ``{"role": ..., "content": ...}``
            messages, or ``None``. It is not modified; a new list is returned.

    Returns:
        tuple[str, list[dict]]: ``("", updated_history)``.
    """
    history = list(chat_history or [])

    # Ignore empty submissions instead of sending a blank query to the chain.
    if not user_input or not user_input.strip():
        return "", history

    if qa_chain is None:
        # No document has been processed yet; say so in the chat itself.
        bot_response = "Please upload a document first."
    else:
        try:
            bot_response = str(qa_chain.run(user_input))
        except Exception as e:
            bot_response = f"Error processing question: {e}"

    history.append({"role": "user", "content": user_input})
    history.append({"role": "assistant", "content": bot_response})

    return "", history


In [229]:
# Re-running this cell would otherwise fail with "address already in use",
# because the app created by the previous run still holds the port.
if "demo" in globals():
    demo.close()

with gr.Blocks() as demo:
    gr.Markdown("# Legal Document Analysis with Google Gemini")
    gr.Markdown(
        "Upload a PDF or image of a legal document. The app will summarize the document, identify its main clauses, "
        "and allow you to ask questions about the content."
    )

    # Status indicator
    status_text = gr.Textbox(label="Status", value="Ready to process documents. Models will be initialized when needed.", interactive=False)

    with gr.Row():
        # Restrict the picker to the extensions process_file() actually handles.
        file_input = gr.File(
            label="Upload Document (PDF, JPG, or PNG)",
            type="filepath",
            file_types=[".pdf", ".jpg", ".jpeg", ".png"],
        )
        summary_box = gr.Textbox(label="Document Summary", lines=5)
        clauses_box = gr.Textbox(label="Identified Clauses", lines=5)
        file_input.change(
            fn=process_file,
            inputs=[file_input],
            outputs=[summary_box, clauses_box]
        )

    # Chat-style Q&A interface
    gr.Markdown("## Ask Questions")
    chatbot = gr.Chatbot(label="Chat with Document", type="messages")
    msg = gr.Textbox(placeholder="Enter your question about the document...")
    # Outputs are (textbox, chatbot): the callback's first return value becomes
    # the new textbox content and the second the chat messages.
    msg.submit(answer_question, [msg, chatbot], [msg, chatbot])

print("Starting Legal Document Analysis App...")
# NOTE(review): server_name="0.0.0.0" listens on every network interface;
# switch to "127.0.0.1" if the app should only be reachable locally.
# The port defaults to 7878 and can be overridden with GRADIO_SERVER_PORT.
demo.launch(
    share=False,
    server_name="0.0.0.0",
    server_port=int(os.getenv("GRADIO_SERVER_PORT", "7878")),
)

Starting Legal Document Analysis App...


ERROR:    [Errno 48] error while attempting to bind on address ('0.0.0.0', 7878): [errno 48] address already in use


OSError: Cannot find empty port in range: 7878-7878. You can specify a different port by setting the GRADIO_SERVER_PORT environment variable or passing the `server_port` parameter to `launch()`.

In [None]:
import os

class MockUpload:
    """Minimal stand-in for an uploaded-file object: exposes the path as ``.name``."""

    def __init__(self, path):
        self.name = path

# The API key must already be present in the environment; it is deliberately
# not embedded in the notebook.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError("GOOGLE_API_KEY is not set. Set it in the environment before running this cell.")

# Test the function against a local sample document
local_path = "1.pdf"
mock_file = MockUpload(local_path)

# Run the full pipeline and show both results
summary, clauses = process_file(mock_file)
print("=== DOCUMENT SUMMARY ===\n")
print(summary)
print("\n=== IDENTIFIED CLAUSES ===\n")
print(clauses)

=== DOCUMENT SUMMARY ===

Error processing file: Invalid input type <class 'dict'>. Must be a PromptValue, str, or list of BaseMessages.

=== IDENTIFIED CLAUSES ===


