In [None]:
#!pip install -U datasets
#!pip install python-terrier
#!pip install sentence_transformers pandas

Module 4

In [24]:
import re
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import os

# Sentence-embedding model used to score sentences against the query
model = SentenceTransformer('all-MiniLM-L6-v2')

# The query is read from a text file; surrounding whitespace is discarded
query = ""
with open("input/query.txt", 'r', encoding='utf-8') as f:
    query = f.read().strip()

# Lightweight sentence splitter
# A boundary is whitespace that follows '.', '!' or '?' and precedes an ASCII
# capital letter; compiled once because the splitter runs for every document.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')


def split_into_sentences(text):
    """Split *text* into sentences using a simple punctuation heuristic.

    The text is split wherever whitespace sits between sentence-ending
    punctuation (., ! or ?) and an ASCII uppercase letter. Sentences that
    start with a digit, quote or lowercase letter are therefore not split
    off from the preceding one.

    Args:
        text: The text to split. Anything that is not a ``str`` (for example
            ``None`` or the NaN float pandas produces for an empty CSV cell)
            is treated as having no sentences.

    Returns:
        A list of stripped, non-empty sentence strings; empty when *text* is
        empty or not a string.
    """
    if not isinstance(text, str):
        # Calling .strip() on None / NaN would raise AttributeError.
        return []
    pieces = _SENTENCE_BOUNDARY.split(text.strip())
    return [piece.strip() for piece in pieces if piece]

# Create sentence chunks (1 sentence per chunk by default)
def create_sentence_chunks(sentences, chunk_size=1):
    """Group consecutive sentences into space-joined chunks.

    Args:
        sentences: Sequence of sentence strings; must support ``len()`` and
            slicing.
        chunk_size: Number of sentences per chunk. The last chunk may hold
            fewer sentences when the total is not a multiple of it.

    Returns:
        A list of chunk strings in the original sentence order; empty when
        *sentences* is empty.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        # Without this check a zero step fails inside range() with an
        # unrelated message, and a negative step yields no chunks at all,
        # silently dropping every sentence.
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        " ".join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), chunk_size)
    ]

# Pick the sentence(s) of a text that score highest against the query (SBERT)
def extract_relevant_part(text, query, top_k=1):
    """Return the ``top_k`` sentences of *text* most similar to *query*.

    The text is split into one-sentence chunks, the query and every chunk are
    encoded with the module-level ``model``, and chunks are ranked by the
    score from ``util.pytorch_cos_sim`` in descending order.

    Args:
        text: Document text to search.
        query: Query string to compare each sentence against.
        top_k: How many of the best-ranked sentences to keep.

    Returns:
        The selected sentences joined by single spaces, best match first and
        with repeated sentences collapsed; ``""`` when the text yields no
        sentences.
    """
    chunks = create_sentence_chunks(split_into_sentences(text), chunk_size=1)
    if not chunks:
        return ""

    encoded_query = model.encode(query, convert_to_tensor=True)
    encoded_chunks = model.encode(chunks, convert_to_tensor=True)

    # Row 0 holds the scores of the single query against every chunk.
    scores = util.pytorch_cos_sim(encoded_query, encoded_chunks)[0]
    ranking = scores.argsort(descending=True)

    selected = []
    for position in ranking[:top_k]:
        selected.append(chunks[position])

    # dict.fromkeys drops repeated sentences while keeping rank order.
    unique_in_rank_order = dict.fromkeys(selected)
    return " ".join(unique_in_rank_order)

# 🧠 Main function
def get_retrieved_docs(query, csv_path="input/lawsToBeConsidered.csv",
                       output_path="output/summarized_laws.txt"):
    """Summarize each document in a CSV by its sentence most relevant to *query*.

    Reads the ``title`` and ``text`` columns of the CSV, asks
    ``extract_relevant_part`` for the single best sentence of every row, and
    writes the numbered results to *output_path* as UTF-8 text.

    Args:
        query: Query string each document is matched against.
        csv_path: Path of the CSV file to read.
        output_path: File the summary is written to; its parent directory is
            created when missing.

    Returns:
        The summary text that was written: one entry per row with a relevant
        sentence, formatted as ``"<index + 1>. <title>\\n<sentence>"`` and
        separated by blank lines. The number comes from the DataFrame index,
        so skipped rows leave gaps in the numbering.
    """
    df = pd.read_csv(csv_path)

    output_lines = []
    for i, row in df.iterrows():
        text = row['text']
        if not isinstance(text, str):
            # Empty CSV cells are read as NaN (a float); there is nothing to
            # search in such a row, and passing it on would raise.
            continue

        relevant = extract_relevant_part(text, query, top_k=1)
        if not relevant:
            continue

        title = row['title']
        # A missing title must not abort the whole run on .strip().
        title = "" if pd.isna(title) else str(title).strip()
        output_lines.append(f"{i+1}. {title}\n{relevant.strip()}")

    full_output_string = "\n\n".join(output_lines)

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(full_output_string)
    return full_output_string

# Example run: summarize the laws for the loaded query and display the result
result = get_retrieved_docs(query=query)
print(result)

1. Council Regulation (EU) No 216/2013 of 7 March 2013 on the electronic publication of the Official Journal of the European Union
For the purposes of ensuring the authenticity, integrity and inalterability of the electronic edition of the Official Journal, an advanced electronic signature based on a qualified certificate and created by a secure-signature-creation device in accordance with that Directive provides sufficient guarantees to the public.

2. Council Directive 88/665/EEC of 21 December 1988 amending several Directives concerning the approximation of the laws of Member States where there is provision in those Directives for publication in the Official Journal of the European Communities of attestations and certificates
COUNCIL DIRECTIVE of 21 December 1988 amending several Directives concerning the approximation of the laws of Member States where there is provision in those Directives for publication in the Official, Journal of the European Communities of  attestations and ce