<a href="https://colab.research.google.com/github/devanshiiii20/Demystify-LegalDocs/blob/main/demystifying_docs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-cloud-documentai google-cloud-aiplatform vertexai

In [None]:
from google.cloud import documentai_v1 as documentai
from vertexai.language_models import TextGenerationModel
import vertexai
from textwrap import wrap

In [None]:
import os

# Locations of the files previously uploaded through the Colab Files panel
SERVICE_ACCOUNT_KEY_PATH = "/content/demystifying-legal-docs-656e8c1f99a1.json"
SAMPLE_DOC_PATH = "/content/Non Disclosure Agreement.pdf"

# Google Cloud project and processor identifiers
PROJECT_ID = "demystifying-legal-docs"
LOCATION = "us"
PROCESSOR_ID = "cc201b11f66615f5"

# Expose the service-account key through the Google credentials environment variable
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": SERVICE_ACCOUNT_KEY_PATH})

print(f"✅ Using JSON key: {SERVICE_ACCOUNT_KEY_PATH}")
print(f"✅ Using sample document: {SAMPLE_DOC_PATH}")

In [None]:
!pip install --upgrade streamlit google-cloud-documentai google-cloud-aiplatform vertexai pyngrok -q

In [None]:
import os

# Service-account JSON key previously uploaded through the Colab Files panel
SERVICE_ACCOUNT_KEY_PATH = "/content/demystifying-legal-docs-656e8c1f99a1.json"

# Publish the key path through the Google credentials environment variable
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": SERVICE_ACCOUNT_KEY_PATH})

print(f"✅ Google Application Credentials set to: {SERVICE_ACCOUNT_KEY_PATH}")

In [None]:
!pip install PyMuPDF -q

In [None]:
!pip install pyngrok

In [None]:
%%writefile /content/demystifying_docs.py
import os
from google.cloud import documentai_v1 as documentai
from vertexai.generative_models import GenerativeModel, GenerationConfig
import vertexai
from textwrap import wrap
import time
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from google.api_core import exceptions
from google.oauth2 import service_account

# Project info
PROJECT_ID = "demystifying-legal-docs"
# Location segment of the Document AI processor resource name
LOCATION = "us"
PROCESSOR_ID = "cc201b11f66615f5"
# Region passed to vertexai.init()
VERTEX_AI_LOCATION = "us-central1"

# The key path is taken from the environment so that the notebook and the
# Streamlit process share a single source of truth.
SERVICE_ACCOUNT_KEY_PATH = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
if not SERVICE_ACCOUNT_KEY_PATH:
    # Fail fast with an actionable message instead of letting the credentials
    # loader choke on a None path.
    raise RuntimeError(
        "GOOGLE_APPLICATION_CREDENTIALS is not set; point it at the "
        "service-account JSON key before importing this module."
    )

credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_KEY_PATH)
vertexai.init(project=PROJECT_ID, location=VERTEX_AI_LOCATION, credentials=credentials)

def extract_text_from_document(content, mime_type="application/pdf"):
    """Run a document through the configured Document AI processor and return its text.

    Args:
        content: Raw bytes of the document to process.
        mime_type: MIME type sent with the raw document. Defaults to
            "application/pdf", which is what existing callers rely on.

    Returns:
        The ``document.text`` field of the processor response.
    """
    print("Starting text extraction from document...")
    client = documentai.DocumentProcessorServiceClient.from_service_account_file(
        SERVICE_ACCOUNT_KEY_PATH
    )
    # Fully-qualified resource name of the processor to invoke.
    name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}"
    raw_document = {"content": content, "mime_type": mime_type}
    request = {"name": name, "raw_document": raw_document}
    response = client.process_document(request=request)
    print("Text extraction completed.")
    return response.document.text

def simplify_text(prompt, max_retries=5, initial_delay=1):
    """Summarize a piece of legal text in plain English with the Gemini model.

    Args:
        prompt: The legal text to simplify (it is embedded into a fixed
            instruction prompt; it is not itself the full model prompt).
        max_retries: Maximum number of attempts.
        initial_delay: Seconds to wait after the first failure; the wait
            doubles after every failed attempt.

    Returns:
        The stripped model response, or "" when every attempt failed with an
        error other than ResourceExhausted (or when max_retries <= 0).

    Raises:
        google.api_core.exceptions.ResourceExhausted: if the last attempt
            still hits the quota error.
    """
    print("Starting text simplification...")
    model = GenerativeModel("gemini-2.0-flash-lite")
    generation_config = GenerationConfig(max_output_tokens=512)
    # The request text does not change between attempts, so build it once.
    request_text = f"""Summarize the following legal text in plain professional English.
                RULES:
                - Use only full sentences in paragraph form.
                - Do not use bullets, numbering, stars, or markdown formatting.
                - Do not use casual phrases like 'okay', 'let’s break this down', etc.
                - Do not repeat the same information more than once.
                - Keep it concise: no more than three short paragraphs.
                - The style must be clear, formal and explanatory.

                Text to simplify:
                {prompt}
                """
    delay = initial_delay
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            resp = model.generate_content(
                request_text,
                generation_config=generation_config
            )
            print("Text simplification completed.")
            return resp.text.strip()
        except exceptions.ResourceExhausted as e:
            if is_last_attempt:
                print(f"ResourceExhausted error: {e}. Max retries reached.")
                raise
            print(f"ResourceExhausted error: {e}. Retrying in {delay} seconds...")
        except Exception as e:
            if is_last_attempt:
                # No further attempt follows: do not claim a retry and do not
                # waste time sleeping before giving up.
                print(f"An unexpected error occurred: {e}. Max retries reached.")
                break
            print(f"An unexpected error occurred: {e}. Retrying in {delay} seconds...")
        # Exponential backoff before the next attempt.
        time.sleep(delay)
        delay *= 2
    return ""

def simplify_long_text(text, chunk_size=800):
    """Simplify a long text by splitting it into chunks processed concurrently.

    The text is wrapped into pieces of at most ``chunk_size`` characters, each
    piece is passed to ``simplify_text`` on a pool of five worker threads, and
    the results are joined with single spaces in the original chunk order. A
    chunk whose task raises is replaced by an "Error processing chunk N"
    placeholder.
    """
    print("Starting long text simplification...")
    pieces = wrap(text, chunk_size)
    total = len(pieces)
    outcome_by_position = {}
    with ThreadPoolExecutor(max_workers=5) as pool:
        position_of = {}
        for position, piece in enumerate(pieces):
            position_of[pool.submit(simplify_text, piece)] = position
        # Results arrive in completion order; positions restore document order.
        for finished in as_completed(position_of):
            position = position_of[finished]
            try:
                outcome_by_position[position] = finished.result()
                print(f"Processed chunk {position+1}/{total}...")
            except Exception as exc:
                print(f'Chunk {position+1} generated an exception: {exc}')
                outcome_by_position[position] = f"Error processing chunk {position+1}"

    ordered = (outcome_by_position[position] for position in sorted(outcome_by_position))
    return " ".join(ordered)

def explain_jargon(text, max_retries=5, initial_delay=1):
    """Ask the Gemini model to explain the legal terms found in ``text``.

    Args:
        text: Legal text whose jargon should be explained.
        max_retries: Maximum number of attempts.
        initial_delay: Seconds to wait after the first failure; the wait
            doubles after every failed attempt.

    Returns:
        The model response with ``*`` and ``#`` runs removed and bullet
        characters replaced by ``-``, stripped; or "" when every attempt
        failed with an error other than ResourceExhausted (or when
        max_retries <= 0).

    Raises:
        google.api_core.exceptions.ResourceExhausted: if the last attempt
            still hits the quota error.
    """
    print("Starting jargon explanation...")
    model = GenerativeModel("gemini-2.0-flash-lite")
    generation_config = GenerationConfig(max_output_tokens=600)
    # The prompt does not change between attempts, so build it once.
    prompt = f"""
           You are a legal assistant.
            Read the legal text below and explain every legal term or jargon in clear, simple English.

            Rules:
            - Do NOT use markdown or symbols like **, *, -, >, #.
            - Present each explanation as a plain sentence or numbered item.
            - Avoid repeating the same explanation.
            - Be concise but clear.

            Text:
            {text}
            """
    delay = initial_delay
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            resp = model.generate_content(prompt, generation_config=generation_config)
            # Strip markdown remnants the model may emit despite the rules.
            cleaned = re.sub(r"\*+", "", resp.text)
            cleaned = re.sub(r"#+", "", cleaned)
            cleaned = re.sub(r"•", "-", cleaned)
            print("Jargon explanation completed.")
            return cleaned.strip()
        except exceptions.ResourceExhausted as e:
            if is_last_attempt:
                print(f"ResourceExhausted error: {e}. Max retries reached.")
                raise
            print(f"ResourceExhausted error: {e}. Retrying in {delay} seconds...")
        except Exception as e:
            if is_last_attempt:
                # No further attempt follows: do not claim a retry and do not
                # waste time sleeping before giving up.
                print(f"An unexpected error occurred: {e}. Max retries reached.")
                break
            print(f"An unexpected error occurred: {e}. Retrying in {delay} seconds...")
        # Exponential backoff before the next attempt.
        time.sleep(delay)
        delay *= 2
    return ""

def simplify_long_text_sequential(text, chunk_size=800, delay_between_chunks=5):
    """Simplify a long text chunk by chunk, one request at a time.

    Args:
        text: The text to simplify.
        chunk_size: Maximum width, in characters, of each wrapped chunk.
        delay_between_chunks: Seconds to pause between two consecutive
            chunk requests.

    Returns:
        The simplified chunks joined with single spaces, in document order
        ("" when the text yields no chunks).
    """
    print("Starting sequential long text simplification...")
    chunks = wrap(text, chunk_size)
    parts = []
    for i, c in enumerate(chunks):
        if i > 0:
            # Pause only *between* requests; sleeping after the final chunk
            # would just delay the result.
            time.sleep(delay_between_chunks)
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        parts.append(simplify_text(c))
    return " ".join(parts)


In [None]:
import streamlit as st
from demystifying_docs import extract_text_from_document, simplify_long_text, explain_jargon
from google.cloud import translate_v2 as translate
import base64, time
import os

# Paths for already uploaded files in Colab
# NOTE(review): later cells copy and run /content/app.py, but no visible cell
# writes this Streamlit script to that path (there is no %%writefile magic on
# this cell) — confirm how app.py is produced.
SERVICE_ACCOUNT_KEY_PATH = "/content/demystifying-legal-docs-656e8c1f99a1.json"
SAMPLE_DOC_PATH = "/content/Non Disclosure Agreement.pdf"

# Make the key available to Google client libraries through the environment
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SERVICE_ACCOUNT_KEY_PATH

# PDF preview function
def show_pdf(file_bytes):
    """Render the given PDF bytes inline as a base64 data-URI iframe."""
    encoded = base64.b64encode(file_bytes).decode('utf-8')
    st.markdown(
        f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="600" type="application/pdf"></iframe>',
        unsafe_allow_html=True,
    )

# Translate text function
def translate_text(text, target_language):
    """Translate ``text`` into the language chosen in the UI.

    Args:
        text: Plain text to translate.
        target_language: Display name of the target language ("English" or
            "Hindi").

    Returns:
        ``text`` unchanged when the target is English or the text is empty;
        otherwise the translated text.

    Raises:
        KeyError: if ``target_language`` is not one of the supported names.
    """
    # Nothing to translate: return before building a client so the no-op path
    # needs neither credentials nor a network round trip.
    if target_language == "English" or not text:
        return text

    lang_map = {"English": "en", "Hindi": "hi"}
    target_code = lang_map[target_language]

    client = translate.Client.from_service_account_json(SERVICE_ACCOUNT_KEY_PATH)
    # format_="text": the input is plain text; the API's default HTML mode
    # returns HTML-escaped output (e.g. &#39; for apostrophes).
    result = client.translate(text, target_language=target_code, format_="text")
    return result["translatedText"]

# Risk detection function
def detect_risks(text):
    """Flag risk categories whose keywords appear in ``text``.

    Matching is case-insensitive and anchored at the start of a word, so
    "fine" matches "fine"/"fines" but not "define", and "nda" does not match
    "standard". Each category is reported at most once, for the first of its
    keywords that matches.

    Args:
        text: The document text to scan (``None`` is treated as empty).

    Returns:
        A list of warning strings, one per matched category, or a single
        "no major risks" message when nothing matched.
    """
    import re  # local import: the app script's import block does not bring in `re`

    risk_keywords = {
        "Penalty / Late Fees": ["penalty", "late fee", "fine"],
        "Termination / Lock-in": ["termination", "lock-in", "binding period", "expiry"],
        "Payment / Interest": ["interest", "payment", "due", "loan"],
        "Confidentiality": ["confidential", "non-disclosure", "nda"],
        "Liability": ["liability", "indemnify", "responsible"]
    }
    lowered = (text or "").lower()
    risks = []
    for category, keywords in risk_keywords.items():
        for kw in keywords:
            # A leading word boundary avoids hits inside unrelated words while
            # still accepting suffixes ("payments", "confidentiality").
            if re.search(r"\b" + re.escape(kw), lowered):
                risks.append(f"⚠️ {category}: contains '{kw}'")
                break
    return risks if risks else ["✅ No major risks detected."]

# Streamlit UI setup: page configuration, header, sidebar and document input.
# Streamlit renders elements in statement order, so the order below is the
# on-page order.
st.set_page_config(
    page_title="Demystify Legal Docs",
    page_icon="📑",
    layout="wide",
)

# NOTE(review): the 'title' and 'subtitle' CSS classes are not defined anywhere
# in the visible source — confirm whether a <style> block is missing.
st.markdown("<p class='title'>📑 Demystify Legal Docs</p>", unsafe_allow_html=True)
st.markdown("<p class='subtitle'>Upload your legal document and get a simplified version instantly.</p>", unsafe_allow_html=True)

# Sidebar language selection; the chosen name is later passed to translate_text()
language = st.sidebar.selectbox("🌍 Choose output language", ["English", "Hindi"])

st.sidebar.title("ℹ️ About")
st.sidebar.info("Demystify Legal Docs is a GenAI-powered tool that makes legal language accessible. It extracts contracts and judgments using Document AI, simplifies them with Vertex AI, explains jargon in plain English, and highlights hidden risks. Built to bridge the gap between law and people, so everyone can make informed decisions.\n")

# Use sample document directly
use_sample = st.checkbox("📂 Try with Sample Document")

# file_bytes ends up holding the raw PDF bytes, or None when nothing was uploaded.
if use_sample:
    # Raises FileNotFoundError if the sample PDF is not present at SAMPLE_DOC_PATH.
    with open(SAMPLE_DOC_PATH, "rb") as f:
        file_bytes = f.read()
else:
    uploaded_pdf = st.file_uploader("📂 Upload a PDF", type="pdf")
    file_bytes = uploaded_pdf.read() if uploaded_pdf else None

# Main flow: extract the document text once, then fill one tab per view.
# NOTE(review): Streamlit re-executes the script on every interaction, so the
# extraction and model calls below presumably run again each time — consider
# caching if latency or quota becomes a problem.
if file_bytes:
    st.write("Extracting text…")
    raw_text = extract_text_from_document(file_bytes)

    tabs = st.tabs([
        "📝 Simplified Text",
        "📑 Original PDF",
        "📘 Legal Jargon Explained",
        "🔍 Summary",
        "⚠️ Risks"
    ])

    # Each text area gets its own non-empty label, hidden with
    # label_visibility="collapsed". Empty labels trigger Streamlit's
    # accessibility warning, and two areas with the same label, content and
    # height (e.g. both model calls returned "") would collide as duplicate
    # widgets.

    # Simplified Text tab
    with tabs[0]:
        with st.spinner("⚡ Simplifying your document... Please wait."):
            simplified = simplify_long_text(raw_text)
        simplified_out = translate_text(simplified, language)
        st.text_area(
            "Simplified text",
            simplified_out,
            height=400,
            label_visibility="collapsed",
        )
        st.download_button("✨ Download Simplified Doc", data=simplified_out, file_name="simplified.txt")

    # Original PDF tab
    with tabs[1]:
        show_pdf(file_bytes)

    # Legal Jargon Explained tab
    with tabs[2]:
        with st.spinner("📘 Explaining legal jargon..."):
            # Only the first 1500 characters are sent to the model.
            jargon_explained = explain_jargon(raw_text[:1500])
        jargon_out = translate_text(jargon_explained, language)
        st.text_area(
            "Legal jargon explained",
            jargon_out,
            height=400,
            label_visibility="collapsed",
        )
        st.download_button("📘 Download Jargon Explanation", data=jargon_out, file_name="jargon_explanation.txt")

    # Summary tab
    with tabs[3]:
        with st.spinner("🧾 Generating summary..."):
            # Pause before issuing another batch of model requests.
            time.sleep(5)
            # Only the first 2000 characters are summarized.
            summary = simplify_long_text("Summarize this document in plain English:\n" + raw_text[:2000])
        summary_out = translate_text(summary, language)
        st.text_area(
            "Summary",
            summary_out,
            height=250,
            label_visibility="collapsed",
        )
        st.download_button("🔍 Download Summary", data=summary_out, file_name="summary.txt")

    # Risks tab
    with tabs[4]:
        with st.spinner("🔎 Checking for risks..."):
            risks = detect_risks(raw_text)
        for r in risks:
            st.write(r)

# Sidebar footer: a horizontal rule followed by the team credit.
# NOTE(review): the 'sidebar-footer' CSS class is not defined anywhere in the
# visible source — confirm whether a <style> block is missing.
st.sidebar.markdown("---")
st.sidebar.markdown(
    """
    <div class="sidebar-footer">
    👩‍💻 TEAM : AC/PC
    </div>
    """,
    unsafe_allow_html=True
)

In [None]:
import os

# Working folder for manually uploaded files; created on first run.
folder_path = "/content/demystify"
os.makedirs(folder_path, exist_ok=True)

# Report what the folder currently contains.
files_in_folder = os.listdir(folder_path)
if not files_in_folder:
    print(f"⚠️ No files found in '{folder_path}'. You can upload files manually to this folder using the left sidebar uploader in Colab.")
else:
    print(f"✅ Files in '{folder_path}':")
    for file_name in files_in_folder:
        print(f" - {file_name}")

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never hard-code the authtoken in the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# NOTE(review): the token that used to be embedded in this cell should be
# treated as leaked and rotated in the ngrok dashboard.
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok authtoken: ")
if not NGROK_AUTHTOKEN:
    raise ValueError("An ngrok authtoken is required to open the tunnel.")

ngrok.set_auth_token(NGROK_AUTHTOKEN)
print("✅ ngrok authtoken set successfully. You won't need to enter it again.")


In [None]:
import os
import shutil

# Stage the project files in a local folder that mirrors the Git repository.
repo_folder = "/content/Demystify-LegalDocs"
os.makedirs(repo_folder, exist_ok=True)

# Copy files (application sources are required: shutil.copy raises if missing)
for required_path in ("/content/app.py", "/content/demystifying_docs.py"):
    shutil.copy(required_path, repo_folder)

# (optional) copy your sample doc
sample_doc_path = "/content/Non Disclosure Agreement.pdf"
if os.path.exists(sample_doc_path):
    shutil.copy(sample_doc_path, repo_folder)

# (optional) copy JSON key — it is a credential and must never be committed,
# so it is added to the folder's .gitignore right after copying.
key_path = "/content/demystifying-legal-docs-656e8c1f99a1.json"
if os.path.exists(key_path):
    shutil.copy(key_path, repo_folder)

    key_name = os.path.basename(key_path)
    gitignore_path = os.path.join(repo_folder, ".gitignore")
    existing = ""
    if os.path.exists(gitignore_path):
        with open(gitignore_path, encoding="utf-8") as gitignore_file:
            existing = gitignore_file.read()
    if key_name not in existing.splitlines():
        with open(gitignore_path, "a", encoding="utf-8") as gitignore_file:
            # Start on a fresh line if the existing file lacks a trailing newline.
            if existing and not existing.endswith("\n"):
                gitignore_file.write("\n")
            gitignore_file.write(key_name + "\n")

print("✅ All files copied to:", repo_folder)

In [None]:
from pyngrok import ngrok
import time
import os
import subprocess
import shutil

ngrok.kill()
time.sleep(2)

# Paths for sample PDF and service account key
source_pdf_path = "/content/Non Disclosure Agreement.pdf"
source_key_path = "/content/demystifying-legal-docs-656e8c1f99a1.json"

if os.path.exists(source_pdf_path):
    # destination_pdf_path = "/content/Non Disclosure Agreement.pdf" # This line caused the error
    # shutil.copyfile(source_pdf_path, destination_pdf_path) # This line caused the error
    print(f"✅ Sample document ready at {source_pdf_path}")
else:
    print(f"❌ Sample document not found at {source_pdf_path}. Upload it manually using the left sidebar in Colab.")

if os.path.exists(source_key_path):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = source_key_path
    print(f"✅ GOOGLE_APPLICATION_CREDENTIALS set to {source_key_path}")
else:
    print(f"❌ Service account key not found at {source_key_path}. Upload manually in Colab.")

# Set ngrok token
NGROK_AUTHTOKEN = "31haLAEOJZKFwD6QLriY5QTjjgZ_2ABAMCxWb3Hu1iFmaNvVL"
ngrok.set_auth_token(NGROK_AUTHTOKEN)

# Connect ngrok to Streamlit port
public_url = ngrok.connect(8501)
print("🚀 Streamlit app is live at:", public_url)

# Kill any existing Streamlit processes
subprocess.run(["pkill", "streamlit"])
time.sleep(2)

# Start Streamlit app in the background
!streamlit run app.py --server.port 8501 &> /dev/null &