In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Each line of train.source is one document; only the first one is needed, so
# read a single line instead of loading the entire corpus into memory.
# NOTE(review): absolute, machine-specific path — consider a configurable DATA_DIR.
with open("/Users/emma/msc_project/data/eur-lexsum/raw-data/train.source", "r", encoding="utf-8") as f:
    legal_text = f.readline().rstrip("\n")  # getting first document

# Split the document into overlapping chunks. The sentence-boundary separator
# is a regular expression (lookbehind for ". "), so it must be a raw string and
# the splitter must be told to interpret separators as regexes; otherwise the
# pattern is escaped, treated as literal text, and never matches.
# The newline separators cannot match here because legal_text is a single line.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)
docs = text_splitter.create_documents([legal_text])

  separators=["\n\n", "\n", "(?<=\. )", " ", ""]


In [7]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embedding model used to vectorise the chunks: runs on CPU and is handed a
# batch size of 4 for encoding.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(batch_size=4),
)

# Build the FAISS vector store from the document chunks and the embedding model.
vectorstore = FAISS.from_documents(docs, embeddings)

KeyboardInterrupt: 

In [None]:
import time

!ollama serve > /dev/null 2>&1 &

!ollama pull llama3

# Wait for model to load
time.sleep(30)

!ollama list

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Local llama3 model served by Ollama; low temperature is requested.
llm = Ollama(
    model="llama3",
    temperature=0.1,
)

# The first line of train.source is used as the document to index.
with open("./train.source", "r", encoding="utf-8") as f:
    legal_text = f.readline().strip()

# The sentence-boundary separator is a regular expression (lookbehind for
# ". "), so it is written as a raw string and the splitter is told to treat
# separators as regexes; without that flag the pattern is escaped, taken as
# literal text, and never matches.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", r"(?<=\. )", " "],
    is_separator_regex=True,
)

# The "source" metadata is what ask_legal_question filters on at query time.
docs = text_splitter.create_documents(
    texts=[legal_text],
    metadatas=[{"source": "train.source"}],
)

print(f"Created {len(docs)} document chunks")

# Show the first three chunks, truncated to 200 characters each.
print("\nSample chunks:")
for i, chunk in enumerate(docs[:3], start=1):
    text = chunk.page_content
    preview = text[:200] + "..." if len(text) > 200 else text
    print(f"\nChunk {i} (Length: {len(text)} chars):")
    print(preview)

embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},  # Or cuda (colab GPU usage ran out while testing)
)

# Build the FAISS vector store from the chunks and the embedding model.
vectorstore = FAISS.from_documents(docs, embeddings)

In [None]:
def ask_legal_question(question, k=3, show_context=True):
    """Answer a question about the indexed legal document using retrieved excerpts.

    Runs a similarity search on the module-level ``vectorstore`` (passing a
    metadata filter of ``source == "train.source"``), joins the retrieved
    chunk texts into a context section, embeds that context and the question
    in a fixed instruction prompt, and sends the prompt to the module-level
    ``llm``.

    Args:
        question: The question text; used both as the search query and in the prompt.
        k: Number of chunks to request from the similarity search (default 3).
        show_context: When True (default), print the retrieved context and a
            separator line after a successful LLM call, for debugging.

    Returns:
        The value returned by ``llm.invoke``. If the LLM call raises, the
        string ``"Error: <message>"`` is returned instead of propagating.
    """
    relevant_docs = vectorstore.similarity_search(
        question,
        k=k,
        filter={"source": "train.source"},
    )
    excerpts = "\n---\n".join(doc.page_content for doc in relevant_docs)
    context = "\n\nDOCUMENT EXCERPTS:\n" + excerpts

    prompt = f"""You are a senior EU legal analyst. Provide a complete response to the question using ONLY the provided legal document excerpts.

{context}

QUESTION: {question}

RESPONSE REQUIREMENTS:
1. Begin with "Under [Legal Instrument]" if cited in documents
2. Answer comprehensively with:
   - Key legal provisions
   - Relevant article references
   - Jurisdictional scope when applicable
3. Structure using bullet points for clarity
4. Never speculate - respond "Not specified in document" for missing information

ADDITIONAL RULES:
- Prioritize direct quotes from text
- Highlight definitions if present"""

    # Only the LLM call is guarded; its failures are reported as a string so
    # that callers which simply print the result keep working.
    try:
        response = llm.invoke(prompt)
    except Exception as e:
        return f"Error: {e}"

    if show_context:
        print(f"debugging: {context}")
        print("----------")
    return response

In [None]:
# Query 1: which offences the indexed document covers.
print(ask_legal_question(question="Which types of offenses does this Framework Decision cover?"))

In [None]:
# Query 2: territorial scope of the indexed document.
print(ask_legal_question(question="Does this Decision apply only within EU countries or also outside?"))