# Building a Legal Document Assistant with LLMs

## Step 1: Initial Setup
First, we’ll install the necessary packages.

In [1]:
# Install required packages
!pip install markitdown langchain chromadb gradio
!pip install flash-attn git+https://github.com/huggingface/transformers.git triton
!pip uninstall -y bitsandbytes
!pip install -U bitsandbytes

# Import basic libraries
import os
from markitdown import MarkItDown
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import logging
# Set up logging: request INFO-level output on the root logger and create the
# module-level `logger` that later cells use to report processing errors.
# NOTE(review): `logging.basicConfig` does nothing when the root logger already
# has handlers configured (possibly the case in hosted notebooks) — confirm that
# INFO messages actually appear, or pass `force=True` if they do not.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-algixv28
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-algixv28
  Resolved https://github.com/huggingface/transformers.git to commit 8c1b5d37827a6691fef4b2d926f2d04fb6f5a9e3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Found existing installation: bitsandbytes 0.45.0
Uninstalling bitsandbytes-0.45.0:
  Successfully uninstalled bitsandbytes-0.45.0
Collecting bitsandbytes
  Using cached bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Using cached bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0




## Step 2: Set Up Google Drive Integration or local folder
This cell connects to your Google Drive to access documents. Create a folder on your Drive called “legal_documents” (the code below looks for it at `MyDrive/Evident/legal_documents`), or alter the code below accordingly:

In [2]:
import os

from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Locations on Drive used by the rest of the notebook
DOCUMENTS_PATH = '/content/drive/MyDrive/Evident/legal_documents'
MARKDOWN_PATH = '/content/drive/MyDrive/Evident/markdown'

# Create the documents folder if it is missing, so that listing it later does
# not fail on a fresh Drive (an existing folder is left untouched).
os.makedirs(DOCUMENTS_PATH, exist_ok=True)

  and should_run_async(code)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Step 3: Document Processor Class
This cell defines our core document processing system:

### Improvements:
- *Adding document chunking for better context management.*

In [3]:
import chromadb

class DocumentProcessor:
    """Convert documents to text, embed them, and index them in a Chroma collection.

    Documents and search queries are embedded through the same code path
    (`_embed`), so the vector database always compares vectors produced by the
    same model and the same pooling.
    """

    def __init__(self):
        """Initialize the document processor with necessary components."""
        # Set up embedding model
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model.eval()
        # Initialize document converter
        self.md = MarkItDown()
        # Set up vector database
        self.vector_db = chromadb.Client()
        self.collection = self.vector_db.get_or_create_collection(name="legal_docs")

    def _embed(self, text):
        """Return a single embedding (flat list of floats) for ``text``.

        The text is tokenized with ``truncation=True``, so anything beyond the
        tokenizer's maximum input length does not contribute to the embedding.
        """
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True
        )
        # Use GPU if available
        if torch.cuda.is_available():
            self.model.to('cuda')
            inputs = {k: v.to('cuda') for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Average the per-token vectors into one vector for the whole input
        return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()

    @staticmethod
    def _file_metadata(file_path):
        """Describe a local source file; returns {} when ``file_path`` is not a file."""
        from datetime import datetime  # local import keeps this class self-contained

        if not os.path.isfile(file_path):
            return {}
        return {
            'filename': os.path.basename(file_path),
            'file_size': os.path.getsize(file_path),
            'last_modified': datetime.fromtimestamp(os.path.getmtime(file_path)).isoformat(),
        }

    @staticmethod
    def _sanitize_metadata(metadata):
        """Flatten ``metadata`` into a dict of scalar values.

        Scalars (str, int, float, bool) are kept, lists/tuples are joined into a
        comma-separated string, ``None`` values are dropped and anything else is
        converted with ``str()``. Returns {} for empty or non-dict input.
        """
        if not isinstance(metadata, dict):
            return {}
        clean = {}
        for key, value in metadata.items():
            if value is None:
                continue
            if isinstance(value, (str, int, float, bool)):
                clean[str(key)] = value
            elif isinstance(value, (list, tuple)):
                clean[str(key)] = ", ".join(str(item) for item in value)
            else:
                clean[str(key)] = str(value)
        return clean

    def process_document(self, file_path):
        """Convert document to text and generate embeddings.

        Args:
            file_path: Path of the document to convert.

        Returns:
            dict with keys 'text' (converted text), 'embeddings' (list of
            floats) and 'metadata' (file details, plus any dict exposed by the
            converter result under ``metadata``).

        Raises:
            Exception: any conversion or embedding error is logged and re-raised.
        """
        try:
            # Convert document to text
            conversion_result = self.md.convert(file_path)
            text_content = conversion_result.text_content
            # Describe the source file and merge in converter-provided metadata, if any
            metadata = self._file_metadata(file_path)
            converter_metadata = getattr(conversion_result, 'metadata', None)
            if isinstance(converter_metadata, dict):
                metadata.update(converter_metadata)
            return {
                'text': text_content,
                'embeddings': self._embed(text_content),
                'metadata': metadata
            }
        except Exception as e:
            logger.error(f"Error processing document {file_path}: {str(e)}")
            raise

    def store_document(self, doc_id, text, embedding, metadata=None):
        """Store document in the vector database.

        Uses ``upsert`` so that storing the same ``doc_id`` again (for example
        when the notebook is re-run) replaces the entry instead of colliding
        with it. Metadata is flattened to scalar values first and is only sent
        when it is non-empty.
        """
        record = {
            'ids': [doc_id],
            'documents': [text],
            'embeddings': [embedding],
        }
        clean_metadata = self._sanitize_metadata(metadata)
        if clean_metadata:
            record['metadatas'] = [clean_metadata]
        self.collection.upsert(**record)

    def find_relevant_documents(self, query, n_results=3):
        """Find relevant documents for a given query.

        The query is embedded with `_embed` and passed as ``query_embeddings``.
        Passing ``query_texts`` instead would have the collection embed the
        query with its own embedding function rather than with ``self.model``.

        Returns:
            A list of dicts with keys 'text', 'id' and 'metadata'; an empty
            list when nothing has been stored yet.
        """
        stored_count = self.collection.count()
        if stored_count == 0:
            return []
        results = self.collection.query(
            query_embeddings=[self._embed(query)],
            # Never ask for more results than there are stored documents
            n_results=min(n_results, stored_count)
        )
        documents = results['documents'][0]
        ids = results['ids'][0]
        # Fall back to None entries if the result carries no 'metadatas'
        metadatas = (results.get('metadatas') or [[None] * len(documents)])[0]
        return [
            {
                'text': doc_text,
                'id': doc_id,
                'metadata': doc_metadata
            }
            for doc_text, doc_id, doc_metadata in zip(documents, ids, metadatas)
        ]
# Initialize processor: create the shared DocumentProcessor instance that the
# ingestion and question-answering cells below refer to as `processor`.
processor = DocumentProcessor()
# Move the embedding model to the GPU when one is available
if torch.cuda.is_available():
    processor.model = processor.model.to('cuda')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  from jax import xla_computation as _xla_computation


## Step 4: Process Documents
This cell processes all documents in your `legal_documents` folder:

In [4]:
import chromadb

class DocumentProcessor:
    """Holder for the embedding model, document converter and vector store.

    NOTE(review): this statement re-binds the name `DocumentProcessor` to a
    class that defines only `__init__`. Instances created before this cell
    runs keep the methods of the class they were created from; instances
    created afterwards have no processing, storing or search methods.
    """

    def __init__(self):
        """Load the embedding tokenizer/model and open the Chroma collection."""
        embedding_model_id = "sentence-transformers/all-MiniLM-L6-v2"

        # Embedding model, switched to evaluation mode
        self.tokenizer = AutoTokenizer.from_pretrained(embedding_model_id)
        embedding_model = AutoModel.from_pretrained(embedding_model_id)
        embedding_model.eval()
        self.model = embedding_model

        # Document-to-Markdown converter
        self.md = MarkItDown()

        # Vector database client and the collection holding the documents
        chroma_client = chromadb.Client()
        self.vector_db = chroma_client
        self.collection = chroma_client.get_or_create_collection(name="legal_docs")

#trying to add metadata
import os
from datetime import datetime

def process_document(self, file_path):
    """Convert a document to text, embed it, and describe it with file metadata.

    NOTE(review): this function is defined at module level and is not attached
    to any class, so `processor.process_document(...)` does not reach it; it
    only runs when called directly as `process_document(obj, path)`.

    Args:
        self: Object providing `md` (converter), `tokenizer` and `model`.
        file_path: Path of the document on disk.

    Returns:
        dict with keys 'text', 'embeddings' (list of floats) and 'metadata'
        (file name, size, timestamps, tags and source label).

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    try:
        # Document -> plain text
        converted = self.md.convert(file_path)
        markdown_text = converted.text_content

        # File-level metadata
        # NOTE(review): 'tags' is a list — confirm the vector store accepts
        # list-valued metadata before storing this dict as-is.
        file_metadata = dict(
            filename=os.path.basename(file_path),
            file_size=os.path.getsize(file_path),
            last_modified=datetime.fromtimestamp(os.path.getmtime(file_path)).isoformat(),
            processing_date=datetime.now().isoformat(),
            tags=['legal', 'guidelines'],  # default tags; extend as needed
            source='Evident Legal Documents',  # customize as needed
        )

        # Text -> token tensors (input is truncated by the tokenizer)
        encoded = self.tokenizer(markdown_text, return_tensors="pt", truncation=True)

        # Run on the GPU when one is available
        if torch.cuda.is_available():
            self.model.to('cuda')
            encoded = {name: tensor.to('cuda') for name, tensor in encoded.items()}

        # Token tensors -> one mean-pooled vector
        with torch.no_grad():
            model_output = self.model(**encoded)
        pooled = model_output.last_hidden_state.mean(dim=1)
        vector = pooled.squeeze().cpu().numpy().tolist()

        return dict(text=markdown_text, embeddings=vector, metadata=file_metadata)
    except Exception as e:
        logger.error(f"Error processing document {file_path}: {str(e)}")
        raise


  and should_run_async(code)


In [5]:
# Get list of documents
# Get list of documents.
# The listing is sorted because os.listdir returns entries in arbitrary order
# and the position of each file is baked into its doc_id below; extensions are
# matched case-insensitively so that e.g. "REPORT.PDF" is picked up as well.
SUPPORTED_EXTENSIONS = ('.pdf', '.docx', '.txt', '.html', '.pptx')
document_files = sorted(
    f for f in os.listdir(DOCUMENTS_PATH)
    if f.lower().endswith(SUPPORTED_EXTENSIONS)
)
if not document_files:
    print("⚠️ No documents found! Add some to your legal_documents folder")
else:
    print(f"Found {len(document_files)} documents to process")
    failed_documents = []
    for idx, document in enumerate(document_files):
        try:
            print(f"Processing {document}...")
            file_path = os.path.join(DOCUMENTS_PATH, document)
            # Process document
            result = processor.process_document(file_path)
            # Store in database
            doc_id = f"doc_{idx}_{document}"
            processor.store_document(
                doc_id=doc_id,
                text=result['text'],
                embedding=result['embeddings'],
                metadata=result['metadata']
            )
            print(f"✅ Finished storing {document} in Chroma\n")
        except Exception as e:
            # Keep going with the remaining files, but remember what failed
            print(f"Error processing {document}: {str(e)}")
            failed_documents.append(document)
    if failed_documents:
        print(f"⚠️ {len(failed_documents)} document(s) could not be processed: {', '.join(failed_documents)}")

Found 6 documents to process
Processing leiden_guidelines.pdf...
✅ Finished storing leiden_guidelines.pdf in Chroma

Processing extrapolations.pdf...
✅ Finished storing extrapolations.pdf in Chroma

Processing case_summaries.pdf...
✅ Finished storing case_summaries.pdf in Chroma

Processing dde_national_courts.pdf...
✅ Finished storing dde_national_courts.pdf in Chroma

Processing dde_un_human_rights.pdf...
✅ Finished storing dde_un_human_rights.pdf in Chroma

Processing dde_international_criminal_law.pdf...
✅ Finished storing dde_international_criminal_law.pdf in Chroma



## Step 5: Set Up LLaMA Model
Go to Hugging Face and search for the LLaMA model you want to use (for example, Llama 3.1). Request permission to use it and create a Hugging Face access token.

This cell initializes the LLaMA model for generating responses:

In [6]:
# HF_TOKEN = os.environ["HF_TOKEN"]  # read the token from the environment; never paste it into the notebook
# from transformers import AutoModelForCausalLM, AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(
#     "meta-llama/Llama-3.1-8B",
#     token=HF_TOKEN
# )
# model = AutoModelForCausalLM.from_pretrained(
#     "meta-llama/Llama-3.1-8B",
#     token=HF_TOKEN
# )

In [7]:
import os
from getpass import getpass

from transformers import AutoModelForCausalLM, AutoTokenizer

# Never store the access token in the notebook: read it from the HF_TOKEN
# environment variable, or ask for it interactively when it is not set.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

LLM_MODEL_ID = "gizembrasser/FineLlama-3.1-8B"

# `token=` replaces the deprecated `use_auth_token=` argument.
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_ID, token=HF_TOKEN)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Step 6: Question-Answering Function
This cell defines the function that generates answers using LLaMA.

### Improvements:
- *Customizing the prompt template for better responses.*
- *Adding memory to maintain conversation context.*

In [8]:
def ask_question_llama(question):
    """Generate an answer to a legal question using LLaMA.

    The three most relevant stored documents are looked up, the first 1000
    characters of each are placed in the prompt as context, and the model's
    continuation of the prompt is returned.

    Args:
        question: The user's question as plain text.

    Returns:
        The generated answer with surrounding whitespace removed, or a short
        hint when ``question`` is empty or blank.
    """
    if not question or not question.strip():
        return "Please enter a question."

    # Get relevant documents
    relevant_docs = processor.find_relevant_documents(question, n_results=3)
    # Prepare context
    context_pieces = [doc['text'][:1000] for doc in relevant_docs]
    truncated_context = "\n".join(context_pieces)
    # Create prompt
    full_prompt = f"""You are an AI assistant providing support with the interpretation of the Leiden Guidelines. You are given some relevant excerpts from user documents below, followed by a question.
The answer provided will never be comprehensive because of the limitations of the sources you rely on, make sure that the user is always made aware of this. Don't give definitive answers.
--- Document Excerpts ---
{truncated_context}
--- Question ---
{question}
Answer:
"""
    # Prepare for generation
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if device == 'cuda':
        model.to('cuda')
    # NOTE(review): truncation cuts the END of the prompt, which is where the
    # question sits — confirm prompts stay below max_length for long excerpts.
    encoded = tokenizer(
        full_prompt,
        return_tensors="pt",
        max_length=1024,
        truncation=True
    )
    # Drop token_type_ids if present and move the tensors to the model's device
    inputs = {k: v.to(device) for k, v in encoded.items() if k != "token_type_ids"}
    prompt_length = inputs["input_ids"].shape[1]
    # Passing pad_token_id explicitly avoids the per-call
    # "Setting `pad_token_id` to `eos_token_id`" warning.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    # Generate answer
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.7,
            do_sample=True,
            pad_token_id=pad_token_id
        )
    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "Answer:" breaks when an excerpt or the question contains that string.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

In [9]:
# Smoke-test the question-answering function with a sample question
print(ask_question_llama("can i submit a piece of video evidence to an international court?"))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


1.0    Introduction

2.0    The Use of DDE in International Criminal Courts and Tribunals

2.1    The Admissibility of DDE as Evidence in International Criminal Courts and Tribunals

2.2    The Weight and Reliability of DDE as Evidence in International Criminal Courts and Tribunals

2.3    The Evidentiary Value of DDE in International Criminal Courts and Tribunals

3.0    Conclusions

3.1    The Use of DDE in International Criminal Courts and Tribunals

3.2    The Admissibility of DDE as Evidence in International Criminal Courts and Tribunals

3.3    The Weight and Reliability of DDE as Evidence in International Criminal Courts and Tribunals

3.4    The Evidentiary Value of DDE in International Criminal Courts and Tribunals

4.0    Annexes

4.1    Annex 1: Bibliography

4.2    Annex 2: Excerpts from User Documents

4.3    Annex 3: Case Summaries

4.4    Annex 4: List of Acronyms

4.5    Annex 5: List of Authorities

4.6    Annex 6: List of References

4.7    Annex 7: List of Authors

4

## Step 7: Create User Interface
Finally, we can also create the Gradio interface:

In [None]:
import gradio as gr

# Stylesheet for the interface. Both text boxes share the `qa-textbox` class
# (an HTML id must be unique, so it cannot be used for two components).
INTERFACE_CSS = """
    #submit-btn {
        background-color: #003366;
        color: white;
        border: none;
        padding: 10px 20px;
        font-size: 16px;
        border-radius: 5px;
        cursor: pointer;
        width: 150px; /* Adjusted width */
        transition: background-color 0.1s ease; /* Smooth transition for color change */
    }
    #submit-btn:active {
        background-color: #0055cc; /* Change color when button is clicked */
    }
    .title {
        font-size: 50px;
        text-align: center;
        margin-bottom: 20px;
    }
    #custom-textbox {
        border: 10px solid black; /* Thicker black border */
        padding: 10px; /* Add some padding inside the textbox */
        border-radius: 5px; /* Optional: Rounded corners */
    }
    .qa-textbox textarea {
        background-color: #f0f0f0; /* Set background color to grey */
        color: black; /* Text color for better contrast */
        font-size: 16px; /* Adjust font size for better readability */
    }
    .footer {
        text-align: center;
        font-size: 14px;
        margin-top: 30px;
    }
"""


def create_interface():
    """Build the Gradio interface for asking questions about the Leiden Guidelines.

    The layout is a title, a disclaimer, a question box, an "Ask" button wired
    to `ask_question_llama`, an answer box and a footer link.

    Returns:
        The assembled `gr.Blocks` app (not yet launched).
    """
    # Define the interface
    with gr.Blocks(css=INTERFACE_CSS) as demo:

        gr.Markdown("<div class='title'>EVIDENT 🔍</div>", elem_id="title")
        gr.Markdown(
            """
            **DISCLAIMER:**
            The Leiden Guidelines is based on limited authoritative non-binding precedents of international criminal courts and tribunals.
            The Leiden Guidelines is not updated to the most recent case law of the courts and tribunals.
            Therefore, it is not a binding legal document and it is not comprehensive.
            """
        )
        gr.Row()  # Add a spacer for extra room
        with gr.Row():
            question_box = gr.Textbox(
                label="Your Legal Question",
                placeholder="Ask any question about the Leiden Guidelines...",
                lines=5,
                elem_classes="qa-textbox"
            )
        with gr.Row():
            submit_button = gr.Button("Ask", elem_id="submit-btn")  # Assign elem_id
        with gr.Row():
            answer_box = gr.Textbox(
                label="Answer",
                lines=10,
                show_copy_button=True,
                elem_classes="qa-textbox"
            )

        # Clicking "Ask" sends the question to the model and shows its answer
        submit_button.click(ask_question_llama, inputs=question_box, outputs=answer_box)

        # Footer
        gr.Markdown(
            """
            <div class="footer">
            <b>For more information visit:</b>
            <a href="https://leiden-guidelines.netlify.app" target="_blank">
            https://leiden-guidelines.netlify.app
            </a>
            </div>
            """
        )

    return demo

# Launch interface
# share=True asks Gradio for a temporary public URL; debug=True keeps this cell
# running so errors and logs stay visible (set debug=False to turn that off).
demo = create_interface()
demo.launch(share=True, debug=True)

  from websockets.server import WebSocketServerProtocol


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://fd87c040cdde1b5f6c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [None]:
# import gradio as gr

# def create_interface():
#     demo = gr.Interface(
#         fn=ask_question_llama,
#         inputs=[
#             gr.Textbox(
#                 label="Your Legal Question",
#                 placeholder="Ask any question about the Leiden Guidelines...",
#                 lines=3
#             )
#         ],
#         outputs=[
#             gr.Textbox(
#                 label="Answer",
#                 lines=10
#             )
#         ],
#         title="Evident (not legal advice)",
#         description="DISCLAIMER: The Leiden Guidelines is based on limited authoritative non-binding precedents of international criminal courts and tribunals. The Leiden Guidelines is not updated to the most recent case law of the courts and tribunals. Therefore it is not a binding  legal document and it is not comprehensive."
#     )
#     return demo
# # Launch interface
# demo = create_interface()
# demo.launch(share=True)