In [None]:
#!pip install -U datasets
!pip install python-terrier
#!pip install sentence_transformers pandas

In [None]:
import pyterrier as pt
import datasets
import pandas as pd
from pathlib import Path
import re
# Load the "jonathanli/eurlex" dataset via `datasets.load_dataset`; the cells
# below read its 'train', 'test' and 'validation' splits.
dataset = datasets.load_dataset("jonathanli/eurlex")

In [2]:
# RRF - Reciprocal Rank Fusion
def rrf(dfs, i=1, K=100):
    """Fuse several ranked result lists with Reciprocal Rank Fusion.

    Every document receives ``1 / (i + rank)`` from each list it appears in;
    the contributions are summed and documents are ordered by the total,
    highest first. Ties keep the order in which documents were first seen.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Ranked lists. Each non-empty frame must provide ``docno`` and ``rank``
        columns.
    i : int or float, default 1
        Smoothing constant added to every rank before taking the reciprocal.
    K : int, default 100
        Maximum number of fused rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``qid``, ``docno``, ``score`` and ``rank`` (0-based position in
        the fused list). ``qid`` is always the constant string ``'1'``. The
        same four columns are present even when there is nothing to fuse.
    """
    scores = {}

    for df in dfs:
        if df.empty:
            # Nothing to contribute; also tolerates empty frames without columns.
            continue
        # Column-wise iteration avoids building a Series per row (iterrows).
        for docno, rank in zip(df["docno"], df["rank"]):
            scores[docno] = scores.get(docno, 0.0) + 1 / (i + rank)

    # sorted() is stable, so equal scores keep first-seen order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    # Passing `columns` keeps the schema stable when `ranked` is empty.
    merged_df = pd.DataFrame(
        [{"qid": '1', "docno": docno, "score": score} for docno, score in ranked],
        columns=["qid", "docno", "score"],
    )
    merged_df["rank"] = list(range(len(merged_df)))

    # Positional slicing already clamps K to the number of available rows.
    return merged_df.iloc[:K]

In [3]:
# Merge the train, test and validation splits into a single DataFrame.
# Rows keep the index labels they had inside their own split (no re-indexing).
ds1, ds2, ds3 = (
    dataset[split_name].to_pandas()
    for split_name in ('train', 'test', 'validation')
)
ds4 = pd.concat([ds1, ds2], axis=0)    # train + test
pd_ds = pd.concat([ds4, ds3], axis=0)  # train + test + validation
pd_ds

Unnamed: 0,celex_id,title,text,eurovoc_concepts
0,32014R0727,Commission Implementing Regulation (EU) No 727...,1.7.2014 EN Official Journal of the European U...,"[1402, 2771, 3191, 5055, 519, 5969, 5971]"
1,31975R2481,Regulation (EEC) No 2481/75 of the Council of ...,REGULATION (EEC) No 2481/75 OF THE COUNCIL of...,"[2319, 2713, 2938, 693]"
2,32010D0008,"2010/8/EU, Euratom: Commission Decision of 22 ...",7.1.2010 EN Official Journal of the European U...,"[3560, 365, 4256, 4261, 4353, 4585]"
3,31982D0211,82/211/EEC: Commission Decision of 17 March 19...,COMMISSION DECISION\nof 17 March 1982\nestabl...,"[1091, 3842, 3874, 4110, 4381, 5287]"
4,31996D0084,"96/84/Euratom, ECSC, EC: Commission Decision o...",COMMISSION DECISION of 8 January 1996 adjustin...,"[1026, 1048, 2300, 3653, 4271, 4390]"
...,...,...,...,...
5995,32007R0522,Commission Regulation (EC) No 522/2007 of 11 M...,12.5.2007 EN Official Journal of the European ...,"[20, 2644, 2664, 2681, 3070, 4860]"
5996,32005R0245,Commission Regulation (EC) No 245/2005 of 11 F...,12.2.2005 EN Official Journal of the European ...,"[1486, 20, 2681, 2958]"
5997,31995D0380,95/380/EC: Commission Decision of 18 September...,COMMISSION DECISION of 18 September 1995 amen...,"[1895, 2711, 4057, 4257, 5962]"
5998,31989R1200,Commission Regulation (EEC) No 1200/89 of 3 Ma...,COMMISSION REGULATION (EEC) No 1200/89\nof 3 ...,"[1667, 2668, 3275, 5096]"


In [None]:
# Create (or reuse) the index over the full document text
index_ref = None
cache_dir = Path("cache/")
index_dir = cache_dir / "indices" / "eur_lex"

# Expose `celex_id` under the name `docno` so each record carries a `docno`
# identifier; the `text` column keeps its name and holds the document body.
pd_ds_rename = pd_ds.rename(columns={'celex_id': 'docno'}, inplace=False)

pd_ds_dict = pd_ds_rename.to_dict(orient='records')

try:
    # Reuse an index written to `index_dir` by a previous run, if it loads.
    index_ref = pt.IndexFactory.of(str(index_dir.absolute()))
except Exception:
    # Loading failed (presumably because no index exists yet): build it.
    # `except Exception` rather than a bare `except` so that KeyboardInterrupt
    # and SystemExit are not mistaken for "index missing".
    indexer = pt.index.IterDictIndexer(str(index_dir.absolute()))
    index_ref = indexer.index(
        pd_ds_dict
    )

In [5]:
# Create (or reuse) the index over the document titles
index_ref_title = None
cache_dir = Path("cache/")
index_dir2 = cache_dir / "indices" / "eur_lex_titles"

# Swap the columns so the title is what sits under `text`: the body is moved
# aside to `not_text`, the title is renamed to `text`, and `celex_id` becomes
# the `docno` identifier.
pd_ds_rename = pd_ds.rename(columns={'celex_id': 'docno', 'text':'not_text', 'title':'text'}, inplace=False)

pd_ds_dict = pd_ds_rename.to_dict(orient='records')

try:
    # Reuse an index written to `index_dir2` by a previous run, if it loads.
    index_ref_title = pt.IndexFactory.of(str(index_dir2.absolute()))
except Exception:
    # Loading failed (presumably because no index exists yet): build it.
    # `except Exception` rather than a bare `except` so that KeyboardInterrupt
    # and SystemExit are not mistaken for "index missing".
    indexer_title = pt.index.IterDictIndexer(str(index_dir2.absolute()))
    index_ref_title = indexer_title.index(
        pd_ds_dict
    )

In [6]:
# BM25 IR models for text and title of dataset documents.
# `bm25_text` searches the index built from the document bodies (`index_ref`),
# `bm25_title` the index built from the titles (`index_ref_title`); both use
# the "BM25" weighting model.
bm25_text = pt.terrier.Retriever(index_ref, wmodel="BM25")
bm25_title = pt.terrier.Retriever(index_ref_title, wmodel="BM25")

In [7]:
# retrieves relevant documents given both text and title
#  performs RRF to give single combined result of top-K documents
def get_text(row):
    """Return the body text of the document whose ``celex_id`` equals ``row['docno']``.

    Looks the document up in the global ``pd_ds`` frame and returns the text of
    the first matching row. Raises ``IndexError`` when no row matches.
    """
    is_match = pd_ds['celex_id'] == row['docno']
    matching_texts = pd_ds.loc[is_match, 'text'].tolist()
    return matching_texts[0]

def get_title(row):
    """Return the title of the document whose ``celex_id`` equals ``row['docno']``.

    Looks the document up in the global ``pd_ds`` frame and returns the title of
    the first matching row. Raises ``IndexError`` when no row matches.
    """
    is_match = pd_ds['celex_id'] == row['docno']
    matching_titles = pd_ds.loc[is_match, 'title'].tolist()
    return matching_titles[0]

def retrieve_docs(query, K=10):
    """Retrieve the top-K documents for ``query`` by fusing text and title BM25 results.

    The query is sanitized, run against both ``bm25_text`` and ``bm25_title``,
    and the two ranked lists are merged with ``rrf``. Each fused row is then
    enriched with the document's ``title`` and ``text``.

    Parameters
    ----------
    query : str
        Free-text query.
    K : int, default 10
        Maximum number of documents to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``qid``, ``docno``, ``score``, ``rank``, ``title`` and ``text``.
        An empty frame with those columns is returned when the sanitized query
        is empty or nothing is retrieved.
    """
    empty_columns = ["qid", "docno", "score", "rank", "title", "text"]

    # Replace (rather than delete) every character that is not an ASCII letter,
    # digit or whitespace. Deleting would glue neighbouring terms together,
    # e.g. "Official-Journal" -> "OfficialJournal", which can never match.
    re_query = re.sub(r'[^A-Za-z0-9\s]', ' ', query).strip()
    if not re_query:
        # Nothing searchable is left; skip the retrievers entirely.
        return pd.DataFrame(columns=empty_columns)

    retr_text = bm25_text.search(re_query)
    retr_title = bm25_title.search(re_query)
    results = rrf([retr_text, retr_title], K=K)
    if results.empty:
        # A row-wise apply on an empty frame does not yield a column-shaped
        # result, so return the expected schema directly.
        return pd.DataFrame(columns=empty_columns)

    results['title'] = results.apply(get_title, axis=1, raw=False)
    results['text'] = results.apply(get_text, axis=1, raw=False)
    return results

In [8]:
# Smoke test: display the fused results for a one-word query (K defaults to 10).
retrieve_docs('Journal')

Unnamed: 0,qid,docno,score,rank,title,text
0,1,32013R0216,2.0,0,Council Regulation (EU) No 216/2013 of 7 March...,13.3.2013 EN Official Journal of the European ...
1,1,31988L0665,0.75,1,Council Directive 88/665/EEC of 21 December 19...,COUNCIL DIRECTIVE of 21 December 1988 amending...
2,1,32012R0623,0.5,2,Commission Regulation (EU) No 623/2012 of 11 J...,12.7.2012 EN Official Journal of the European ...
3,1,32011D0479,0.333333,3,2011/479/: Commission Decision of 27 July 2011...,29.7.2011 EN Official Journal of the European ...
4,1,32006D0178,0.333333,4,2006/178/EC: Commission Decision of 27 Februa...,4.3.2006 EN Official Journal of the European U...
5,1,32006D0514,0.252427,5,2006/514/EC: Commission Decision of 20 July 2...,22.7.2006 EN Official Journal of the European ...
6,1,32011D0196,0.211111,6,2011/196/EU: Commission Decision of 29 March 2...,30.3.2011 EN Official Journal of the European ...
7,1,32005D0718,0.202421,7,2005/718/EC: Commission Decision of 13 October...,15.10.2005 EN Official Journal of the European...
8,1,32005R2156,0.2,8,Commission Regulation (EC) No 2156/2005 of 23...,24.12.2005 EN Official Journal of the European...
9,1,32001D0524,0.167832,9,2001/524/EC: Commission Decision of 28 June 20...,Commission Decision\nof 28 June 2001\nrelating...


Module 4

In [13]:
import re
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load the 'all-MiniLM-L6-v2' SentenceTransformer model; it is used below to
# embed the query and the candidate sentences.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Lightweight sentence splitter
def split_into_sentences(text):
    """Split ``text`` into sentences with a simple regex heuristic.

    A boundary is any run of whitespace that follows ``.``, ``!`` or ``?`` and
    precedes an ASCII uppercase letter. Returns the stripped, non-empty pieces
    in their original order.
    """
    pieces = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text.strip())
    cleaned = []
    for piece in pieces:
        if piece:
            cleaned.append(piece.strip())
    return cleaned

# Create sentence chunks (1 sentence per chunk)
def create_sentence_chunks(sentences, chunk_size=1):
    """Group consecutive sentences into space-joined chunks.

    Each chunk holds up to ``chunk_size`` sentences; the final chunk may be
    shorter. Returns the chunks as a list of strings.
    """
    chunks = []
    for start in range(0, len(sentences), chunk_size):
        window = sentences[start:start + chunk_size]
        chunks.append(" ".join(window))
    return chunks

# Extract most relevant sentence using SBERT
def extract_relevant_part(text, query, top_k=1):
    """Return the sentence(s) of ``text`` most similar to ``query``.

    ``text`` is split into single-sentence chunks, the query and every chunk
    are embedded with the global ``model``, and chunks are ranked by
    ``util.pytorch_cos_sim`` against the query. The ``top_k`` best chunks are
    de-duplicated (keeping rank order) and joined with single spaces.

    Returns an empty string when ``text`` yields no chunks.
    """
    chunks = create_sentence_chunks(split_into_sentences(text), chunk_size=1)
    if len(chunks) == 0:
        return ""

    query_embedding = model.encode(query, convert_to_tensor=True)
    chunk_embeddings = model.encode(chunks, convert_to_tensor=True)

    # Row 0 holds the similarity of the single query against every chunk.
    scores = util.pytorch_cos_sim(query_embedding, chunk_embeddings)[0]

    best_chunks = []
    for position in scores.argsort(descending=True)[:top_k]:
        best_chunks.append(chunks[position])

    # dict.fromkeys drops repeated chunks while preserving their order.
    unique_chunks = dict.fromkeys(best_chunks)
    return " ".join(unique_chunks)

# 🧠 Main function
def get_retrieved_docs(query,query_k=10):
    """Build a printable summary of the documents retrieved for ``query``.

    Retrieves up to ``query_k`` documents with ``retrieve_docs`` and, for each
    one, extracts the single most relevant sentence. Every document with a
    non-empty extract becomes an entry of the form
    ``"<index label + 1>. <title>\\n<sentence>"``; entries are separated by a
    blank line. Documents with no extract are skipped.
    """
    hits = retrieve_docs(query, K=query_k)

    entries = []
    for label, hit in hits.iterrows():
        snippet = extract_relevant_part(hit['text'], query, top_k=1)
        if not snippet:
            continue
        heading = f"{label+1}. {hit['title'].strip()}"
        entries.append(f"{heading}\n{snippet.strip()}")

    return "\n\n".join(entries)

# Example usage: summarise the top 5 documents retrieved for the query
# "Journal" and print the resulting text block.
result = get_retrieved_docs("Journal",query_k=5)
print(result)


1. Council Regulation (EU) No 216/2013 of 7 March 2013 on the electronic publication of the Official Journal of the European Union
For the purposes of ensuring the authenticity, integrity and inalterability of the electronic edition of the Official Journal, an advanced electronic signature based on a qualified certificate and created by a secure-signature-creation device in accordance with that Directive provides sufficient guarantees to the public.

2. Council Directive 88/665/EEC of 21 December 1988 amending several Directives concerning the approximation of the laws of Member States where there is provision in those Directives for publication in the Official Journal of the European Communities of attestations and certificates
COUNCIL DIRECTIVE of 21 December 1988 amending several Directives concerning the approximation of the laws of Member States where there is provision in those Directives for publication in the Official, Journal of the European Communities of  attestations and ce