In [8]:
import pymupdf
import re

In [None]:
import pymupdf
import re

# Extract the policy PDF to plain text, dropping page-number footer lines.
# A line is skipped when, after stripping, it consists solely of
# "Page X of Y", "X of Y" or "X / Y" (case-insensitive). Every other line is
# written unchanged, followed by a newline.
page_marker = re.compile(
    r"^(?:(?:page\s+)?\d+\s+of\s+\d+|\d+\s*/\s*\d+)$",
    re.IGNORECASE,
)

# NOTE(review): `doc` is left open here in case later cells reuse it; call
# doc.close() once it is no longer needed.
doc = pymupdf.open("Information-Security-Policy_230209_EN.pdf")

# The context manager flushes and closes output.txt even if extraction
# raises part-way through the document.
with open("output.txt", "w", encoding="utf8") as out:
    for page in doc:
        for line in page.get_text().splitlines():
            if not page_marker.match(line.strip()):
                out.write(line + "\n")


In [None]:
import re, json
# Read the text file written by the PDF extraction step.
with open('output.txt', 'r', encoding='utf8') as f:
    lines = f.read().splitlines()

# Group the text into controls keyed by identifier. A control starts on a line
# that begins with an ID shaped like "A.5.1" (capital letter, then two
# dot-separated numbers) followed by whitespace; the remainder of that line is
# the start of its description. A line holding only the ID, with nothing after
# it, does not match and is treated as ordinary text.
control_header = re.compile(r'^([A-Z]\.[0-9]+\.[0-9]+)\s+(.*)')
controls = {}
current_id = None
for raw_line in lines:
    header = control_header.match(raw_line)
    if header is not None:
        # New control: a repeated ID replaces the description stored earlier.
        current_id = header.group(1)
        controls[current_id] = header.group(2).strip()
        continue
    fragment = raw_line.strip()
    if current_id and fragment:
        # Non-blank text continues the most recent control; anything that
        # appears before the first control header is discarded.
        controls[current_id] = controls[current_id] + ' ' + fragment

# Persist the ID -> description mapping for the matching step.
with open('sop_controls.json', 'w', encoding='utf8') as f:
    json.dump(controls, f, indent=2)
print(f"Extracted {len(controls)} controls from SOP")

In [None]:
# Map every SOP control to its most similar ISO 27002 control by comparing
# sentence embeddings of their descriptions, then save the result as JSON.
import json

import torch
from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity

# Load controls. sop_controls.json maps control ID -> description;
# iso27002.json is assumed to use the same layout (TODO confirm).
with open('sop_controls.json', 'r', encoding='utf8') as f:
    sop_controls = json.load(f)
with open('iso27002.json', 'r', encoding='utf8') as f:
    iso_controls = json.load(f)

# Fail early with a clear message instead of an opaque tensor error later.
if sop_controls and not iso_controls:
    raise ValueError("iso27002.json contains no controls to match against")

# Prepare texts for embedding. The lists follow dict order, so row i of each
# embedding tensor lines up with the i-th key of the matching dict.
sop_texts = list(sop_controls.values())
iso_texts = list(iso_controls.values())
iso_keys = list(iso_controls.keys())

# Generate embeddings
model = SentenceTransformer('all-roberta-large-v1')

sop_emb = model.encode(sop_texts, convert_to_tensor=True)
iso_emb = model.encode(iso_texts, convert_to_tensor=True)

# Compute similarities and mappings
mappings = {}
for i, key in enumerate(sop_controls):
    # Broadcasting the single SOP row against every ISO row and reducing over
    # dim=1 yields a 1-D tensor holding one score per ISO control.
    sims = cosine_similarity(sop_emb[i].unsqueeze(0), iso_emb, dim=1)
    best_idx = torch.argmax(sims).item()
    mappings[key] = {
        'sop_desc': sop_controls[key],
        'matched_iso': iso_keys[best_idx],
        'iso_desc': iso_controls[iso_keys[best_idx]],
        # `sims` is 1-D, so index it directly; `sims[0][best_idx]` would
        # index into a 0-dim scalar and raise IndexError.
        'similarity': sims[best_idx].item()
    }

# Save mappings
with open('sop_iso_mapping.json', 'w', encoding='utf8') as f:
    json.dump(mappings, f, indent=2)

print(f"Mapped {len(mappings)} SOP controls to ISO 27002 standards.")