# AALM — Data Preparation with semchunk
Build an Administrative Law–focused dataset from the Open Australian Legal Corpus using `semchunk`.

This notebook filters the corpus for Administrative Law material by keywords (tribunals, judicial review concepts, FOI, etc.),
chunks the texts with `semchunk`, and saves a dataset for SFT/QLoRA with a `text` column plus `citation`, `jurisdiction`, `type` and `url` metadata for each chunk.

In [None]:
# Install (or upgrade, via -U) the packages listed below; -q keeps pip's output quiet.
%pip -q install -U datasets semchunk transformers tiktoken

In [None]:
import os, re, math, itertools
from typing import Dict, Any, List
from datasets import load_dataset, Dataset
import semchunk
from transformers import AutoTokenizer

# Notebook configuration. Every setting falls back to the default shown here
# unless an environment variable of the same name is set.

# Source corpus and where the chunked dataset is written.
CORPUS_DATASET = os.getenv('CORPUS_DATASET', 'isaacus/open-australian-legal-corpus')
CORPUS_SPLIT = os.getenv('CORPUS_SPLIT', 'corpus')
OUTPUT_DIR = os.getenv('OUTPUT_DIR', 'data/aalm-adminlaw-semchunk')

# Tokenizer used for token counting while chunking.
BASE_TOKENIZER = os.getenv('BASE_TOKENIZER', 'openai/gpt-oss-20b')

# Chunking parameters: requested chunk size and the `overlap` value handed to the chunker.
CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', '1024'))
OVERLAP = float(os.getenv('OVERLAP', '0.2'))

# Cap on matched documents; 0 means no explicit cap (handy for quick dry runs).
DOC_LIMIT = int(os.getenv('DOC_LIMIT', '0'))

# Broad Administrative Law signal via keywords (matched case-insensitively).
ADMIN_KEYWORDS = [
    'administrative appeals tribunal', 'aat', 'administrative decisions tribunal',
    'civil and administrative tribunal', 'ncat', 'vcat', 'qcat', 'acat',
    # Longer codes that contain 'aat' / 'qcat' (they appear to be tribunal citation
    # codes); listed explicitly because acronyms are now matched as whole words.
    'aata', 'qcata',
    'merits review', 'judicial review', 'procedural fairness', 'natural justice',
    'jurisdictional error', 'wednesbury', 'unreasonableness',
    'freedom of information', 'foi', 'ombudsman',
    'delegate', 'delegated legislation', 'minister', 'review of decision',
    'administrative arrangement'
]

# Acronyms must match as whole words: as bare substrings they fire on unrelated
# text ('ncat' in "truncate", 'acat' in "vacate", 'foi' in "foil").
ADMIN_ACRONYMS = frozenset({'aat', 'aata', 'ncat', 'vcat', 'qcat', 'qcata', 'acat', 'foi'})


def _keyword_pattern(keyword: str) -> str:
    """Return the regex fragment for one keyword, anchored on word boundaries."""
    escaped = re.escape(keyword)
    if keyword in ADMIN_ACRONYMS:
        return rf'\b{escaped}\b'
    # Leading boundary only: stops 'minister' matching inside "administer" while
    # still accepting inflections such as "ministerial" or "delegates".
    return rf'\b{escaped}'


KW = re.compile('|'.join(_keyword_pattern(k) for k in ADMIN_KEYWORDS), flags=re.I)


def is_admin_law(record: Dict[str, Any]) -> bool:
    """Return True if the record's citation or text contains an admin-law keyword.

    Args:
        record: A corpus row; only the 'citation' and 'text' keys are read.
            Missing or None values are treated as empty strings.

    Returns:
        True when `KW` matches in either field, otherwise False.
    """
    citation = record.get('citation') or ''
    text = record.get('text') or ''
    # Search the fields separately: no keyword spans a newline, so this is
    # equivalent to searching the concatenation without copying the full text.
    return KW.search(citation) is not None or KW.search(text) is not None

# Tokenizer that the chunker uses to measure text length.
tokenizer = AutoTokenizer.from_pretrained(BASE_TOKENIZER, use_fast=True)

# Clamp the requested size to the tokenizer's `model_max_length`; fall back to
# CHUNK_SIZE when that attribute is missing or unset so min() always gets an int.
EFFECTIVE_CHUNK_SIZE = min(CHUNK_SIZE, getattr(tokenizer, 'model_max_length', None) or CHUNK_SIZE)
chunker = semchunk.chunkerify(tokenizer, chunk_size=EFFECTIVE_CHUNK_SIZE)

print('Using tokenizer:', BASE_TOKENIZER)
# Report the size actually in effect, not the requested one, so the log matches the output.
print('Chunk size:', EFFECTIVE_CHUNK_SIZE, 'Overlap:', OVERLAP)
if EFFECTIVE_CHUNK_SIZE != CHUNK_SIZE:
    print('Requested chunk size', CHUNK_SIZE, 'was reduced to the tokenizer limit.')


## Load and filter the Corpus

In [None]:
# Load the configured split of the corpus.
corpus = load_dataset(CORPUS_DATASET, split=CORPUS_SPLIT, keep_in_memory=False)
print('Total documents in split:', len(corpus))

# Lazily yield records that have non-empty text and match the admin-law keywords.
matching_docs = (ex for ex in corpus if ex.get('text') and is_admin_law(ex))

# A positive DOC_LIMIT stops the scan as soon as that many matches are collected;
# zero or a negative value means "no cap" (islice with None consumes everything).
admin_docs = list(itertools.islice(matching_docs, DOC_LIMIT if DOC_LIMIT > 0 else None))
print('Matched Administrative Law docs:', len(admin_docs))


## Chunk texts with semchunk

In [None]:
# Column-wise accumulators: each chunk contributes exactly one entry to every list,
# so all five stay the same length.
texts: List[str] = []
citations: List[str] = []
jurisdictions: List[str] = []
types: List[str] = []
urls: List[str] = []

# Source metadata field -> the column that receives its value.
metadata_columns = {
    'citation': citations,
    'jurisdiction': jurisdictions,
    'type': types,
    'url': urls,
}

for doc in admin_docs:
    pieces = chunker(doc['text'], overlap=OVERLAP)
    texts.extend(pieces)
    # Repeat the document's metadata once per chunk; missing or None values become ''.
    for field, column in metadata_columns.items():
        column.extend([doc.get(field) or ''] * len(pieces))

print('Total chunks:', len(texts))


## Save dataset to disk

In [None]:
# Assemble the per-chunk columns built by the chunking cell into a Dataset.
columns = {
    'text': texts,
    'citation': citations,
    'jurisdiction': jurisdictions,
    'type': types,
    'url': urls,
}
out = Dataset.from_dict(columns)

# Make sure the target directory exists, then write the dataset there.
os.makedirs(OUTPUT_DIR, exist_ok=True)
out.save_to_disk(OUTPUT_DIR)
print('Saved to', OUTPUT_DIR)

# Last expression of the cell: show the first two rows as a quick sanity check.
out[:2]
