In [5]:
import os

# Show the directory that relative paths in this notebook resolve against.
current_dir = os.getcwd()
print(" Current working directory:", current_dir)

 Current working directory: C:\Users\DELL\Desktop\Horizon17\sopcompliance


In [6]:
import fitz  # PyMuPDF
import re
import json
from collections import Counter
import nltk
from nltk.tokenize import sent_tokenize

In [7]:
# Fetch the "punkt_tab" NLTK resource (the call reports when it is already present).
# NOTE(review): presumably this is the data sent_tokenize relies on — confirm
# against the installed NLTK version.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
def _repeated_margin_line(candidates):
    """Return the most frequent line in ``candidates`` if it occurs more than once, else None."""
    counts = Counter(candidates)
    if not counts:
        return None
    line, hits = counts.most_common(1)[0]
    # A line seen on a single page is ordinary content, not a running header/footer.
    return line if hits > 1 else None


def extract_text_from_pdf(pdf_path):
    """Extract the cleaned text of the controls section from a PDF.

    The first and last text line of every page are treated as header/footer
    candidates; the most frequent candidate of each kind is stripped from the
    pages where it appears, but only when it repeats on more than one page.
    The text between the "5 Organizational controls" marker and the following
    "9 Annex" / "9 Reference control set" marker is then isolated, characters
    outside word characters, whitespace and basic punctuation are dropped, and
    runs of whitespace are collapsed to single spaces.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The cleaned section text, or "" when the document has no pages or the
        section markers are not found.
    """
    doc = fitz.open(pdf_path)
    try:
        # str.split always yields at least one element, so every page has a
        # first and a last line (possibly the empty string).
        page_lines = [page.get_text().split('\n') for page in doc]
    finally:
        # Release the file handle even if text extraction fails.
        doc.close()

    if not page_lines:
        return ""

    header_to_remove = _repeated_margin_line(lines[0] for lines in page_lines)
    footer_to_remove = _repeated_margin_line(lines[-1] for lines in page_lines)

    cleaned_pages = []
    for lines in page_lines:
        if header_to_remove is not None and lines[0] == header_to_remove:
            lines = lines[1:]
        # A one-line page may be empty after header removal.
        if footer_to_remove is not None and lines and lines[-1] == footer_to_remove:
            lines = lines[:-1]
        cleaned_pages.append('\n'.join(lines))

    full_clean_text = '\n'.join(cleaned_pages)
    match = re.search(
        r"5\s+Organizational controls(.*?)9\s+(Annex|Reference control set)",
        full_clean_text,
        re.DOTALL | re.IGNORECASE
    )
    if not match:
        return ""

    extracted_text = match.group(1)
    # Keep only word characters, whitespace and basic punctuation.
    extracted_text = re.sub(r"[^\w\s.,:;!?()\"'-]", "", extracted_text)
    extracted_text = re.sub(r'\s+', ' ', extracted_text).strip()

    return extracted_text



In [9]:
def extract_controls_with_attributes(text):
    """Split section text into numbered entries with control/purpose/guidance fields.

    An entry starts at a section number of the form ``N.N`` followed by
    whitespace and runs until the next such number or the end of ``text``.

    The title is the text after the number up to the first "Control",
    "Purpose" or "Guidance" heading word (or the end of the entry). Stopping
    only at "Purpose"/"Guidance" would make the title swallow the control
    statement and leave the "control" field empty.
    NOTE(review): assumes each entry introduces its control statement with a
    capitalised "Control" heading — confirm against the source document.

    Args:
        text: Whitespace-normalised section text.

    Returns:
        dict keyed by "<number>: <title>". Each value has:
          - "control": list of sentences from the text that is neither title,
            purpose nor guidance, or ["Not provided"].
          - "purpose": text after "Purpose" up to "Guidance", or "Not provided".
          - "guidance": text after "Guidance", or "Not provided".
        A repeated "<number>: <title>" key overwrites the earlier entry.
    """
    entries = re.finditer(
        r"(?P<section>\d{1,2}\.\d{1,2})\s(.+?)(?=(?:\d{1,2}\.\d{1,2})\s|$)",
        text,
        re.DOTALL,
    )
    controls = {}

    for entry in entries:
        block = entry.group(0)
        number = entry.group("section")

        # The end-of-entry alternative must not require whitespace before it,
        # otherwise the final entry (which has no trailing space) never matches.
        title_match = re.match(
            r"\d{1,2}\.\d{1,2}\s+(.+?)(?=\s+(?:Control|Purpose|Guidance)\b|\s*$)",
            block,
            re.DOTALL,
        )
        if title_match:
            title = title_match.group(1).strip() or "Unknown"
            # Everything after the heading; slicing avoids rebuilding a regex
            # from the (unescaped) section number and title.
            body = block[title_match.end():]
        else:
            title = "Unknown"
            body = block

        purpose_match = re.search(r"Purpose\s(.*?)(?=Guidance|$)", body, re.DOTALL)
        guidance_match = re.search(r"Guidance\s(.*?)(?=\d{1,2}\.\d{1,2}\s|$)", body, re.DOTALL)

        # What is left after dropping the purpose and guidance parts is the control text.
        control_text = re.sub(r"Purpose\s.*?(?=Guidance|$)", "", body, flags=re.DOTALL)
        control_text = re.sub(r"Guidance\s.*?(?=\d{1,2}\.\d{1,2}\s|$)", "", control_text, flags=re.DOTALL)
        control_text = control_text.strip()

        controls[f"{number}: {title}"] = {
            "control": sent_tokenize(control_text) if control_text else ["Not provided"],
            "purpose": purpose_match.group(1).strip() if purpose_match else "Not provided",
            "guidance": guidance_match.group(1).strip() if guidance_match else "Not provided"
        }

    return controls


In [10]:
def save_to_json(data,output_file):
    """Write ``data`` to ``output_file`` as UTF-8 JSON, indented by two spaces.

    Non-ASCII characters are written as-is rather than escaped.
    """
    encoder = json.JSONEncoder(indent=2, ensure_ascii=False)
    with open(output_file, mode="w", encoding="utf-8") as handle:
        # Stream the encoded chunks straight into the file.
        handle.writelines(encoder.iterencode(data))

In [11]:
if __name__ == "__main__":
    pdf_path = "C:/Users/DELL/Desktop/Horizon17/ISO 27002.pdf"  # Replace with your actual file path
    output_json = "controlsv2.json"

    raw_text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf returns "" when the section markers are not found;
    # skip parsing in that case so `controls` is still defined for later cells.
    controls = extract_controls_with_attributes(raw_text) if raw_text else {}

    if controls:
        # Reuse the notebook's JSON writer instead of repeating its body here.
        save_to_json(controls, output_json)
        print(f" JSON saved to {output_json}")
    else:
        # Do not overwrite an existing output file with an empty result.
        print(f" No controls extracted from {pdf_path}; {output_json} was not written")

 JSON saved to controlsv2.json


In [13]:
import json

# Reload the saved controls from disk and print them in a readable outline.
with open("C:/Users/DELL/Desktop/Horizon17/sopcompliance/controlsv2.json", "r", encoding="utf-8") as f:
    data = json.load(f)

for key, value in data.items():
    print(f"\n {key}")
    print("• Control:")
    # "control" is stored as a list of sentences.
    for sentence in value["control"]:
        print(f"  - {sentence}")
    # A real newline escape (the original "/n" was printed literally).
    print(f"• Purpose: {value['purpose']}\n")
    print(f"• Guidance: {value['guidance']}")