In [2]:
import json
import os
import nltk
from nltk import sent_tokenize

In [3]:
# sent_tokenize needs the "punkt_tab" tokenizer data. Look for a local copy
# first so that re-running the notebook does not contact the download server
# every time; only fetch the package when it is actually missing.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [21]:
# File locations, resolved against the notebook's working directory.
# INPUT_FILE is the JSON that gets loaded and chunked; OUTPUT_FILE is where
# the chunked result is written.
INPUT_FILE: str = "iso27002_extracted.json"
OUTPUT_FILE: str = "iso27002_chunked_100.json"

In [22]:
# Fail early, with a message that names the file, when the input is absent.
input_is_missing = not os.path.exists(INPUT_FILE)
if input_is_missing:
    raise FileNotFoundError(f"Input file '{INPUT_FILE}' not found in current directory.")

# Parse the whole JSON document into memory; the chunking loop iterates over
# it with .items(), so a top-level JSON object is expected here.
with open(INPUT_FILE, mode="r", encoding="utf-8") as source:
    iso_data = json.load(source)

In [23]:
# Result accumulator: one entry per clause id, filled in by the chunking loop.
chunked_data = dict()

In [24]:
# Upper bound on the number of chunks produced for a single clause.
MAX_CHUNKS = 100

# Split every clause's text into sentence-based chunks.
#
# For each clause the "Control", "Purpose" and "Guidance" texts are joined,
# split into sentences, and grouped into at most MAX_CHUNKS chunks of
# consecutive sentences. Clauses with MAX_CHUNKS sentences or fewer get one
# sentence per chunk; clauses without any text get an empty chunk list.
for clause_id, content in iso_data.items():
    # `or ""` also covers fields that are present but null in the JSON,
    # where .get() returns None and .strip() would otherwise fail.
    clause_name = (content.get("Clause Name") or "").strip()
    sections = [
        (content.get(key) or "").strip()
        for key in ("Control", "Purpose", "Guidance")
    ]

    # Skip empty sections so the joined text has no doubled spaces.
    full_text = " ".join(section for section in sections if section)

    sentences = sent_tokenize(full_text)

    # Ceiling division keeps the chunk count at or below MAX_CHUNKS (floor
    # division would let it overshoot), and max(1, ...) keeps the step valid
    # when there are no sentences at all, so range() simply yields nothing.
    chunk_size = max(1, (len(sentences) + MAX_CHUNKS - 1) // MAX_CHUNKS)

    chunks = [
        " ".join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), chunk_size)
    ]

    chunked_data[clause_id] = {
        "Clause Name": clause_name,
        "Chunks": chunks,
    }


In [26]:
# Persist the chunked clauses as pretty-printed JSON (4-space indent),
# then confirm where the result was written.
with open(OUTPUT_FILE, mode="w", encoding="utf-8") as out_file:
    json.dump(chunked_data, fp=out_file, indent=4)

print(f"\n✅ Saved chunked control data to: {OUTPUT_FILE}")



✅ Saved chunked control data to: iso27002_chunked_100.json
