In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

with open("/Users/emma/msc_project/data/eur-lexsum/raw-data/train.source", "r") as f:
    legal_text = f.read().split('\n')[0]  # getting first document

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
docs = text_splitter.create_documents([legal_text])

  separators=["\n\n", "\n", "(?<=\. )", " ", ""]


In [8]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"batch_size": 4} 
)

vectorstore = FAISS.from_documents(docs, embeddings)

In [None]:
import time

!ollama pull llama3

time.sleep(30)

!ollama list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest 
pulling 6a0746a1ec1a... 100% ▕████████████████▏ 4.7 GB                         
pulling 4fa551d4f938... 100% ▕████████████████▏  12 KB                         
pulling 8ab4849b038c... 100% ▕████████████████▏  254 B                         
pulling 577073ffcc6c... 100% ▕████████████████▏  110 B                         
pulling 3f8eb4da87fa... 100% ▕████████████████▏  485 B                         
verifying sha256 digest 
writing manifest 
removing any unused layers 
success [?25h
NAME           	ID          	SIZE  	MODIFIED       
llama3:latest  	365c0bd3c000	4.7 GB	30 seconds ago	
llama3.1:latest	62757c860e01	4.7 GB	11 months ago 	


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

llm = Ollama(
    model="llama3",  
    temperature=0.1  
)

with open("/Users/emma/msc_project/data/eur-lexsum/raw-data/train.source", "r") as f:
    legal_text = f.readline().strip()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", "(?<=\. )", " "]
)

docs = text_splitter.create_documents(
    texts=[legal_text],
    metadatas=[{"source": "train.source"}] 
)

print(f"Created {len(docs)} document chunks")

print("\nSample chunks:")
for i, chunk in enumerate(docs[:3]):  
    print(f"\nChunk {i+1} (Length: {len(chunk.page_content)} chars):")
    print(chunk.page_content[:200] + "..." if len(chunk.page_content) > 200 else chunk.page_content)

embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"} # Or cuda (colab GPU usage ran out while testing)
)

vectorstore = FAISS.from_documents(docs, embeddings) 

  separators=["\n\n", "\n", "(?<=\. )", " "]


Created 21 document chunks

Sample chunks:

Chunk 1 (Length: 997 chars):
L_2008328EN.01005501.xml 6.12.2008 EN Official Journal of the European Union L 328/55 COUNCIL FRAMEWORK DECISION 2008/913/JHA of 28 November 2008 on combating certain forms and expressions of racism a...

Chunk 2 (Length: 997 chars):
on how best to implement the provisions of the Treaty of Amsterdam on an area of freedom, security and justice (2), the Conclusions of the Tampere European Council of 15 and 16 October 1999, the Resol...

Chunk 3 (Length: 993 chars):
as already expressed by the European Council in December 2003. (3) Council Joint Action 96/443/JHA of 15 July 1996 concerning action to combat racism and xenophobia (4) should be followed by further l...


In [14]:
def ask_legal_question(question):
    relevant_docs = vectorstore.similarity_search(
        question, 
        k=3,  
        filter={"source": "train.source"}  
    )
    context = "\n\nDOCUMENT EXCERPTS:\n" + "\n---\n".join([doc.page_content for doc in relevant_docs])
    
    prompt = f"""You are a senior EU legal analyst. Provide a complete response to the question using ONLY the provided legal document excerpts.

{context}

QUESTION: {question}

RESPONSE REQUIREMENTS:
1. Begin with "Under [Legal Instrument]" if cited in documents
2. Answer comprehensively with:
   - Key legal provisions
   - Relevant article references
   - Jurisdictional scope when applicable
3. Structure using bullet points for clarity
4. Never speculate - respond "Not specified in document" for missing information

ADDITIONAL RULES:
- Prioritize direct quotes from text
- Highlight definitions if present"""

    try:
        response = llm.invoke(prompt)
        print(f"debugging: {context}")
        print("----------")
        return response
        
    except Exception as e:
        return f"Error: {str(e)}"

In [15]:
print(ask_legal_question("Which types of offenses does this Framework Decision cover?"))

debugging: 

DOCUMENT EXCERPTS:
colour, religion, descent or national or ethnic origin. (10) This Framework Decision does not prevent a Member State from adopting provisions in national law which extend Article 1(1)(c) and (d) to crimes directed against a group of persons defined by other criteria than race, colour, religion, descent or national or ethnic origin, such as social status or political convictions. (11) It should be ensured that investigations and prosecutions of offences involving racism and xenophobia are not dependent on reports or accusations made by victims, who are often particularly vulnerable and reluctant to initiate legal proceedings. (12) Approximation of criminal law should lead to combating racist and xenophobic offences more effectively, by promoting a full and effective judicial cooperation between Member States. The difficulties which may exist in this field should be taken into account by the Council when reviewing this Framework Decision with a view to con

In [16]:
print(ask_legal_question("Does this Decision apply only within EU countries or also outside?"))

debugging: 

DOCUMENT EXCERPTS:
Member States. The difficulties which may exist in this field should be taken into account by the Council when reviewing this Framework Decision with a view to considering whether further steps in this area are necessary. (13) Since the objective of this Framework Decision, namely ensuring that racist and xenophobic offences are sanctioned in all Member States by at least a minimum level of effective, proportionate and dissuasive criminal penalties, cannot be sufficiently achieved by the Member States individually, since such rules have to be common and compatible and since this objective can therefore be better achieved at the level of the European Union, the Union may adopt measures, in accordance with the principle of subsidiarity as referred to in Article 2 of the Treaty on European Union and as set out in Article 5 of the Treaty establishing the European Community. In accordance with the principle of proportionality, as set out in the latter Article