## 0. Import dependencies

In [1]:
import os
import sys

import pandas as pd


# Resolve the project root so that modules under "<project_root>/src" can be imported.
try:
    # Script execution: __file__ exists and points at this file.
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Notebook execution: __file__ is undefined, so fall back to the working
    # directory and assume it sits two levels below the project root.
    project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
else:
    # The script folder is assumed to be a sibling of "src", i.e. one level
    # below the project root.
    project_root = os.path.abspath(os.path.join(current_dir, ".."))

# Register "src" on the import path exactly once (safe when the cell is re-run).
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# Local application imports

In [2]:
# Sample legal citation used as the input for the extraction request below.
citation = "Art. 12/1, Code Bruxellois de l'Aménagement du Territoire (Titre Ier, Chapitre VI)"

In [4]:
# Instruction prompt sent to the LLM as the user message. It describes the six
# fields to extract from a citation string (Article, Act, Book, Title, Chapter,
# Section), the JSON output schema, three worked examples and a few edge rules.
# The citation to analyse is appended to this text when the payload is built.
# NOTE(review): the OUTPUT section asks for "one line" of JSON while the examples
# show multi-line objects — confirm which form is intended.
prompt = """You are a data‑cleaning assistant specialised in Belgian / French legal citations.

## TASK
For every *single* input string you receive, extract the following fields:

• "Article"  – everything **before** the first comma.  
• "Act"      – everything **after** the first comma but **before** the first "(" .  
• "Book"     – full substring that begins with "Livre" (or "Book") inside the parentheses and ends with the numeral/letter that names the book.  
• "Title"    – full substring that begins with "Titre" (or "Title") inside the parentheses and ends with the numeral/letter that names the title.  
• "Chapter"  – full substring that begins with "Chapitre" (or "Chapter") and ends with the numeral/letter that names the chapter.  
• "Section"  – full substring that begins with "Section" and ends with the numeral/letter that names the section.

If a field is missing in the input, output `null` (not the string "null").

## OUTPUT
Return **one** line of pure JSON, with this exact schema and key order:

{
  "Article": <string>,
  "Act": <string>,
  "Book": <string | null>,
  "Title": <string | null>,
  "Chapter": <string | null>,
  "Section": <string | null>
}

Do not add any surrounding markdown, explanation, or additional keys.

## EXAMPLES
### Example 1  
Input:  
Art. 2043quinquies, Code Civil (Titre XIV, Chapitre V)

Output:
{
  "Article": "Art. 2043quinquies",
  "Act": "Code Civil",
  "Book": null,
  "Title": "Titre XIV",
  "Chapter": "Chapitre V",
  "Section": null
}

### Example 2  
Input:  
Art. 64/1, Code Bruxellois de l'Aménagement du Territoire (Titre II, Chapitre V, Section VII)

Output:
{
  "Article": "Art. 64/1",
  "Act": "Code Bruxellois de l'Aménagement du Territoire",
  "Book": null,
  "Title": "Titre II",
  "Chapter": "Chapitre V",
  "Section": "Section VII"
}

### Example 3  
Input:  
Art. 2.2.2, Code Bruxellois de l'Air, du Climat et de la Maîtrise de l'Energie (Livre 2, Titre 2, Chapitre 1er, Section 2)

Output:
{
  "Article": "Art. 2.2.2",
  "Act": "Code Bruxellois de l'Air, du Climat et de la Maîtrise de l'Energie",
  "Book": "Livre 2",
  "Title": "Titre 2",
  "Chapter": "Chapitre 1er",
  "Section": "Section 2"
}

## EDGE RULES
* Preserve accents and punctuation inside values exactly as they appear.  
* Ignore spaces before/after commas and parentheses when extracting.  
* Numerals can be Roman (XIV) or Arabic (2). Keep whatever form appears.  
* Never invent data that is not present.

Begin when ready – remember: one JSON object, no commentary.
"""



In [5]:
# Names of the fields the model must return. Every field is required, and each
# value may be either a string or null.
_citation_fields = ["Article", "Act", "Book", "Title", "Chapter", "Section"]

# System message: restricts the reply to a single schema-conforming JSON object.
_system_message = {
    "role": "system",
    "content": (
        "You are a data‑cleaning assistant. "
        "Always respond with exactly one JSON object that conforms to the schema "
        "provided in the `format` parameter. Do not output anything else."
    ),
}

# User message: the multiline `prompt` defined earlier, followed by the citation to analyse.
_user_message = {
    "role": "user",
    "content": prompt + (f"## INPUT to analyze: {citation} "),
}

# Request body for the chat endpoint: model, conversation, non-streaming reply,
# and the JSON schema the answer has to satisfy.
payload = {
    "model": "gemma3:1b-it-fp16",
    "messages": [_system_message, _user_message],
    "stream": False,
    "format": {
        "type": "object",
        "properties": {
            field: {"type": ["string", "null"]} for field in _citation_fields
        },
        "required": list(_citation_fields),
    },
}


In [11]:
# Send the extraction request to the chat endpoint of the local LLM server
# (base address: http://localhost:11434).

import requests
from pydantic import BaseModel

# NOTE(review): BaseModel is not used in this cell; the stub below suggests a
# validation model was planned — confirm before removing the import.
# class ArticleData(BaseModel):

url = "http://localhost:11434/api/chat"

# Upper bound (seconds) for the HTTP call. Without a timeout, requests waits
# indefinitely if the server never answers.
timeout_s = 120

# Reset first so that a failed call cannot leave a stale `response` from an
# earlier run of this cell for the following cells to pick up.
response = None
try:
    response = requests.post(url, json=payload, timeout=timeout_s)
    response.raise_for_status()  # Raise an error for HTTP request failures
except requests.exceptions.RequestException as e:
    # Best effort: report the failure instead of aborting the notebook run.
    print(f"Error calling chat endpoint: {e}")

In [None]:
"Art. 12/1, Code Bruxellois de l'Aménagement du Territoire (Titre Ier, Chapitre VI)"

In [12]:
# Decode the JSON body of the HTTP response from the chat request and display it.
response.json()

{'model': 'gemma3:1b-it-fp16',
 'created_at': '2025-04-18T17:46:51.691135082Z',
 'message': {'role': 'assistant',
  'content': '{"Article": "Art. 12/1", "Act": "Code Bruxellois de l\'Aménagement du Territore", "Book": "Livre Ier", "Title": "Titre Ier", "Chapter": "Chapitre VI", "Section": null}'},
 'done_reason': 'stop',
 'done': True,
 'total_duration': 861403175,
 'load_duration': 29834429,
 'prompt_eval_count': 870,
 'prompt_eval_duration': 134140925,
 'eval_count': 64,
 'eval_duration': 402537894}

## 1. Load data

In [2]:
# Folder that holds the BSARD dataset files, resolved from the project root.
BSARD_data_path = os.path.join(project_root, "data", "BSARD_dataset")

# Read the corpus CSV from that folder into a DataFrame.
_bsard_corpus_csv = os.path.join(BSARD_data_path, "bsard_corpus.csv")
bsard_corpus = pd.read_csv(_bsard_corpus_csv)

In [5]:
# Preview the first three rows of the corpus.
bsard_corpus.head(3)

Unnamed: 0,id,reference,article,law_type,code,book,part,act,chapter,section,subsection,description
0,1,"Art. 1.1.1, Code Bruxellois de l'Air, du Clima...",Le présent Code règle une matière visée à l'ar...,regional,"Code Bruxellois de l'Air, du Climat et de la M...",Dispositions communes,,Généralités,,,,"Dispositions communes, Généralités"
1,2,"Art. 1.1.2, Code Bruxellois de l'Air, du Clima...",Le présent Code transpose en Région de Bruxell...,regional,"Code Bruxellois de l'Air, du Climat et de la M...",Dispositions communes,,Généralités,,,,"Dispositions communes, Généralités"
2,3,"Art. 1.2.1, Code Bruxellois de l'Air, du Clima...",Le présent Code poursuit les objectifs suivant...,regional,"Code Bruxellois de l'Air, du Climat et de la M...",Dispositions communes,,Objectifs,,,,"Dispositions communes, Objectifs"


In [7]:
# Count how many rows carry each distinct value of the `code` column.
bsard_corpus["code"].value_counts()

code
Code Réglementaire Wallon de l'Action sociale et de la Santé                 2618
Code Judiciaire                                                              2285
Code de Droit Economique                                                     2032
Code Civil                                                                   1961
Code du Bien-être au Travail                                                 1287
Code des Sociétés et des Associations                                        1194
Code de la Démocratie Locale et de la Décentralisation                       1159
Code Wallon de l'Action sociale et de la Santé                               1032
Code de la Navigation                                                         977
Code de l'Eau intégré au Code Wallon de l'Environnement                       902
Code Wallon du Développement Territorial                                      796
Code d'Instruction Criminelle                                                 719
Code Pénal 

In [15]:
# Inspect a single row: the one at positional index 1 (the second row).
bsard_corpus.iloc[1]

id                                                             2
reference      Art. 1.1.2, Code Bruxellois de l'Air, du Clima...
article        Le présent Code transpose en Région de Bruxell...
law_type                                                regional
code           Code Bruxellois de l'Air, du Climat et de la M...
book                                       Dispositions communes
part                                                         NaN
act                                                  Généralités
chapter                                                      NaN
section                                                      NaN
subsection                                                   NaN
description                   Dispositions communes, Généralités
Name: 1, dtype: object

In [31]:
# Look up the `reference` value of the row at positional index 9999.
bsard_corpus.iloc[9999]['reference']

"Art. 47, Code Wallon de l'Action sociale et de la Santé (Livre IV, Titre IV)"

In [9]:
# Count how many rows carry each distinct value of the `chapter` column.
bsard_corpus["chapter"].value_counts()

chapter
Agrément                                                                                                          256
Dispositions générales                                                                                            228
Hôpitaux psychiatriques                                                                                           206
Subventionnement                                                                                                  198
Les preuves.                                                                                                      155
                                                                                                                 ... 
De la mutation temporaire.                                                                                          1
De la permutation                                                                                                   1
De la mutation.                                 

## 2.

In [None]:
# Display the corpus DataFrame.
bsard_corpus

## 3.