In [1]:
pip install python-docx PyPDF2 scikit-learn

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Downloading lxml-5.4.0-cp313-cp313-win_amd64.whl.metadata (3.6 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Downloading lxml-5.4.0-cp313-cp313-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   --------------------------- ------------ 2.6/3.8 MB 12.4 MB/s eta 0:00:01
   ---------------------------------------- 3.8/3.8 MB 11.9 MB/s eta 0:00:00
Installing collected packages: PyPDF2, lxml, python-docx

   ---------------------------------------- 0/3 [PyPDF2]
   ---------------------------------------- 0/3 [PyPDF2]
   ---------------------------------------- 0/3 [PyPDF2]
   ---------------------------------------- 0/3 [PyPDF2]
   ---------------------------------------- 0

In [None]:
import os
import PyPDF2
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- PDF extraction ---
def extract_text_from_pdf(file_path, min_length=30):
    """Collect the non-trivial text lines of a PDF file.

    Every page is read with ``PyPDF2.PdfReader``; the extracted page text is
    split on newlines, each line is stripped, and only lines strictly longer
    than ``min_length`` characters are kept.

    Args:
        file_path: Path of the PDF file to open.
        min_length: A stripped line must be longer than this many characters
            to be kept. Defaults to 30, the value previously hard-coded.

    Returns:
        list[str]: The kept lines, in page order and then line order.
    """
    kept_lines = []
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            page_text = page.extract_text()
            # Pages that yield no text (None or empty string) are skipped.
            if not page_text:
                continue
            for line in page_text.split('\n'):
                stripped = line.strip()
                if len(stripped) > min_length:
                    kept_lines.append(stripped)
    return kept_lines

# --- DOCX extraction ---
def extract_text_from_docx(file_path, min_length=30):
    """Collect the non-trivial paragraphs of a DOCX file.

    The file is opened with ``docx.Document``; each entry of its
    ``paragraphs`` is stripped, and only paragraphs strictly longer than
    ``min_length`` characters are kept.

    Args:
        file_path: Path of the DOCX file to open.
        min_length: A stripped paragraph must be longer than this many
            characters to be kept. Defaults to 30, the value previously
            hard-coded.

    Returns:
        list[str]: The kept paragraph texts, in document order.
    """
    doc = Document(file_path)
    kept_paragraphs = []
    for para in doc.paragraphs:
        # Strip once and reuse the result for both the length test and output.
        stripped = para.text.strip()
        if len(stripped) > min_length:
            kept_paragraphs.append(stripped)
    return kept_paragraphs

# --- Determine file type and extract text ---
def load_document(file_path):
    """Extract text from a .pdf or .docx file, dispatching on its extension.

    The extension is compared case-insensitively, so names such as
    ``REPORT.PDF`` or ``Policy.Docx`` are accepted.

    Args:
        file_path: Path of the document, as a string.

    Returns:
        The result of ``extract_text_from_docx`` for a .docx path, or of
        ``extract_text_from_pdf`` for a .pdf path.

    Raises:
        ValueError: If the path ends in neither ``.pdf`` nor ``.docx``
            (this includes an empty path).
    """
    # Only the comparison uses the lower-cased name; the extractor receives
    # the path exactly as given.
    lowered = file_path.lower()
    if lowered.endswith(".docx"):
        return extract_text_from_docx(file_path)
    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    raise ValueError("Unsupported file type. Please use a .pdf or .docx file.")


Extracted PDF Text:
Information security, cybersecurity
and privacy protection  — Information
Sécurité de l'information, cybersécurité et protection de la vie
privée  — Mesures de sécurité de l'informationINTERNATIONAL
All rights reserved. Unless otherwise specified, or required in the context of its implementation, no part of this publication may
be reproduced or utilized otherwise in any form or by any means, electronic or mechanical, including photocopying, or posting on
the internet or an intranet, without prior written permission. Permission can be requested from either ISO at the address below
or ISO’s member body in the country of the requester.
ISO copyright officeCP 401 • Ch. de Blandonnet 8CH-1214 Vernier, GenevaPhone: +41 22 749 01 11
© I SO/IEC 2022 – All rights reserved
Foreword  ....................................................................................................................................................................................................

In [None]:
import tkinter as tk
from tkinter import filedialog

# Initialize hidden root window; it is destroyed again once both dialogs close
# so no Tk instance is left behind in the kernel.
root = tk.Tk()
root.withdraw()

try:
    # Each dialog carries its own title: the print() output appears in the
    # notebook, not next to the dialog, so the title is what tells the user
    # which file is being requested.
    print("📄 Select ISO 27002 DOCX or PDF file:")
    iso_file = filedialog.askopenfilename(
        title="Select ISO 27002 DOCX or PDF file",
        filetypes=[("PDF files", "*.pdf"), ("DOCX files", "*.docx")],
    )

    print("📄 Select SOP DOCX or PDF file:")
    sop_file = filedialog.askopenfilename(
        title="Select SOP DOCX or PDF file",
        filetypes=[("PDF files", "*.pdf"), ("DOCX files", "*.docx")],
    )
finally:
    root.destroy()

# A cancelled dialog yields an empty selection; fail here with a clear message
# rather than letting load_document report an unsupported file type.
if not iso_file or not sop_file:
    raise ValueError("File selection cancelled: both an ISO file and an SOP file are required.")

print("✅ ISO file selected:", iso_file)
print("✅ SOP file selected:", sop_file)

# --- Extract content and assign IDs ---
iso_clauses = [{"id": f"ISO-{i+1}", "text": text} for i, text in enumerate(load_document(iso_file))]
sop_clauses = [{"id": f"SOP-{i+1}", "text": text} for i, text in enumerate(load_document(sop_file))]

# Guard the samples: a document may yield no text segments at all.
print("\n📌 Sample ISO Clause:", iso_clauses[0] if iso_clauses else "(no clauses extracted)")
print("📌 Sample SOP Clause:", sop_clauses[0] if sop_clauses else "(no clauses extracted)")

📄 Select ISO 27002 DOCX or PDF file:
📄 Select SOP DOCX or PDF file:
✅ ISO file selected: C:/Users/hp/Desktop/SOPs/InformationSecurityPolicy-godfreyphillips.pdf
✅ SOP file selected: C:/Users/hp/Downloads/ISO 220072.pdf


In [None]:
# --- Clause Similarity Comparison ---
def compare_clauses(iso_clauses, sop_clauses, threshold=0.5):
    """Find, for every ISO clause, the most similar SOP clause.

    Both clause lists are vectorised with one ``TfidfVectorizer`` fitted on
    the combined texts, and compared with cosine similarity. For each ISO
    clause the SOP clause with the highest score is selected; it counts as a
    match only when that score is at least ``threshold``.

    Args:
        iso_clauses: List of dicts with ``'id'`` and ``'text'`` keys.
        sop_clauses: List of dicts with ``'id'`` and ``'text'`` keys.
        threshold: Minimum best score for a match to be reported.

    Returns:
        list[dict]: One dict per ISO clause, in input order, with keys
        ``'iso_id'``, ``'iso_text'``, ``'sop_id'``, ``'sop_text'`` and
        ``'similarity'`` (a plain ``float``). ``'sop_id'`` and ``'sop_text'``
        are ``None`` when the best score is below ``threshold``. If
        ``sop_clauses`` is empty, every ISO clause is reported unmatched with
        similarity 0.0; if ``iso_clauses`` is empty, the result is ``[]``.
    """
    def build_result(iso_clause, sop_clause, score):
        # Single place that defines the shape of a result row.
        return {
            'iso_id': iso_clause['id'],
            'iso_text': iso_clause['text'],
            'sop_id': sop_clause['id'] if sop_clause is not None else None,
            'sop_text': sop_clause['text'] if sop_clause is not None else None,
            'similarity': score
        }

    # Empty inputs are handled before vectorising: there is nothing to fit or
    # to take an argmax over.
    if not iso_clauses:
        return []
    if not sop_clauses:
        return [build_result(clause, None, 0.0) for clause in iso_clauses]

    iso_texts = [c['text'] for c in iso_clauses]
    sop_texts = [c['text'] for c in sop_clauses]
    # Fit on both corpora so ISO and SOP vectors share one vocabulary.
    vectorizer = TfidfVectorizer().fit(iso_texts + sop_texts)
    sim_matrix = cosine_similarity(
        vectorizer.transform(iso_texts),
        vectorizer.transform(sop_texts),
    )

    # Best SOP column for every ISO row, computed in one call.
    best_indices = np.argmax(sim_matrix, axis=1)

    results = []
    for row, iso_clause in enumerate(iso_clauses):
        best_idx = int(best_indices[row])
        best_score = float(sim_matrix[row, best_idx])
        matched = sop_clauses[best_idx] if best_score >= threshold else None
        results.append(build_result(iso_clause, matched, best_score))
    return results

# --- Run comparison and print summary ---
comparison = compare_clauses(iso_clauses, sop_clauses, threshold=0.5)

# The header and the rows use the same column widths (10 / 12 / 10) so the
# "|" separators line up; 'Best SOP ID' alone is 11 characters wide.
header = f"{'ISO ID':<10} | {'Best SOP ID':<12} | {'Similarity':<10}"
print(f"\n{header}")
print("-" * len(header))
for row in comparison:
    print(f"{row['iso_id']:<10} | {str(row['sop_id']):<12} | {row['similarity']:.2f}")

# Optionally, print unmatched ISO clauses
gaps = [row for row in comparison if row['sop_id'] is None]
if gaps:
    print(f"\nUnmatched ISO Clauses ({len(gaps)}):")
    for gap in gaps:
        clause_text = gap['iso_text']
        # Add the ellipsis only when the text was actually cut at 80 characters.
        preview = clause_text if len(clause_text) <= 80 else clause_text[:80] + "..."
        print(f"{gap['iso_id']}: {preview}")
else:
    print("\nAll ISO clauses have a matching SOP clause above the threshold.")