In [9]:
!pip install python-docx PyPDF2 --quiet

In [10]:
import os
from docx import Document
import PyPDF2
from google.colab import files

def extract_text_from_docx(file_path, min_length=30):
    """Return the non-trivial paragraphs of a DOCX file as a list of strings.

    Args:
        file_path: Path (or whatever ``docx.Document`` accepts) of the document.
        min_length: A paragraph is kept only when its stripped text is strictly
            longer than this many characters. Defaults to 30, the threshold the
            notebook has always used.

    Returns:
        Stripped paragraph texts, in document order.
    """
    doc = Document(file_path)
    # Strip each paragraph once, then filter on the stripped length.
    stripped_paragraphs = (para.text.strip() for para in doc.paragraphs)
    return [text for text in stripped_paragraphs if len(text) > min_length]

def extract_text_from_pdf(file_path, min_length=30):
    """Return the non-trivial text lines of a PDF file as a list of strings.

    Each page's extracted text is split on newlines; a line is kept only when
    its stripped form is strictly longer than ``min_length`` characters.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.
        min_length: Length threshold for keeping a line. Defaults to 30, the
            threshold the notebook has always used.

    Returns:
        Stripped lines from all pages, in page order.
    """
    text_list = []
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            raw_text = page.extract_text()
            # Pages that yield no text (None or empty string) are skipped.
            if not raw_text:
                continue
            stripped_lines = (line.strip() for line in raw_text.split('\n'))
            text_list.extend(line for line in stripped_lines if len(line) > min_length)
    return text_list

In [11]:
def _save_first_upload(uploaded):
    """Persist the first uploaded file under its own name and return that name.

    Args:
        uploaded: The mapping returned by ``files.upload()`` (file name -> content).

    Returns:
        The file name of the first uploaded entry.

    Raises:
        ValueError: If nothing was uploaded (e.g. the dialog was cancelled).
    """
    if not uploaded:
        raise ValueError("No file was uploaded. Re-run this cell and choose a .docx or .pdf file.")
    file_name, content = next(iter(uploaded.items()))
    # Colab can store a re-uploaded file under a de-duplicated name such as
    # "name (1).pdf". NOTE(review): the returned key appears to be the original
    # name - confirm against the Colab docs. Writing the returned content to
    # `file_name` ensures the path used downstream holds this upload either way.
    with open(file_name, "wb") as out_file:
        out_file.write(content)
    return file_name

print("📄 Upload ISO 27002 DOCX or PDF file:")
iso_upload = files.upload()
iso_file = _save_first_upload(iso_upload)

print("📄 Upload SOP DOCX or PDF file:")
sop_upload = files.upload()
sop_file = _save_first_upload(sop_upload)


📄 Upload ISO 27002 DOCX or PDF file:


Saving ISO 220072.pdf to ISO 220072 (1).pdf
📄 Upload SOP DOCX or PDF file:


Saving InformationSecurityPolicy-godfreyphillips.pdf to InformationSecurityPolicy-godfreyphillips (1).pdf


In [12]:
# Determine file type and extract text
def load_document(file_path):
    """Extract the text of a document, choosing the extractor by file extension.

    The extension check is case-insensitive, so names such as ``REPORT.PDF``
    or ``Policy.Docx`` are accepted.

    Args:
        file_path: Path of a ``.docx`` or ``.pdf`` file.

    Returns:
        Whatever the matching extractor returns for ``file_path``.

    Raises:
        ValueError: If the file name ends in neither ``.docx`` nor ``.pdf``.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    if lowered_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    raise ValueError("Unsupported file type. Please upload a .docx or .pdf file.")

def _build_clauses(prefix, file_path):
    """Load a document and wrap each extracted text in an ``{"id", "text"}`` record.

    Ids are ``"<prefix>-1"``, ``"<prefix>-2"``, ... in extraction order.

    Raises:
        ValueError: If the document yields no text at all, which would
            otherwise surface later as a bare IndexError.
    """
    clauses = [
        {"id": f"{prefix}-{number}", "text": text}
        for number, text in enumerate(load_document(file_path), start=1)
    ]
    if not clauses:
        raise ValueError(
            f"No usable text was extracted from {file_path!r}; "
            "check that the file contains selectable text."
        )
    return clauses

iso_clauses = _build_clauses("ISO", iso_file)
sop_clauses = _build_clauses("SOP", sop_file)

# View sample data
print("\n📌 Sample ISO Clause:", iso_clauses[0])
print("📌 Sample SOP Clause:", sop_clauses[0])


📌 Sample ISO Clause: {'id': 'ISO-1', 'text': 'Information security, cybersecurity'}
📌 Sample SOP Clause: {'id': 'SOP-1', 'text': 'Information Security Policy                                            Internal                                                        Page 1 of 41'}


In [13]:
!pip install sentence-transformers scikit-learn --quiet

In [7]:
from huggingface_hub import login

import os
from getpass import getpass

# Never hard-code the access token in the notebook: anyone who can read the
# file (or its history) can use it. Any token that was previously stored in
# this cell should be treated as leaked and revoked.
# The token is taken from the HF_TOKEN environment variable when set;
# otherwise it is requested interactively without echoing it to the output.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [14]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pre-trained sentence-embedding model shared by both documents
# ('paraphrase-MiniLM-L6-v2' is an alternative when speed matters more).
model = SentenceTransformer('all-mpnet-base-v2')

# Pull the plain clause text out of every record, keeping clause order.
iso_texts, sop_texts = (
    [entry['text'] for entry in clause_list]
    for clause_list in (iso_clauses, sop_clauses)
)

# Encode ISO controls first, then SOP clauses; tensor output is not requested.
iso_embeddings, sop_embeddings = (
    model.encode(text_batch, convert_to_tensor=False)
    for text_batch in (iso_texts, sop_texts)
)

In [None]:
# Compare each ISO clause to all SOP clauses
# Without any SOP embeddings there is nothing to match against; fail with a
# clear message instead of an IndexError from indexing an empty list.
if len(iso_embeddings) > 0 and len(sop_embeddings) == 0:
    raise ValueError("Cannot match ISO clauses: no SOP clauses are available.")

matches = []
if len(iso_embeddings) > 0:
    # One call computes the full (ISO x SOP) similarity matrix instead of
    # invoking cosine_similarity once per clause pair.
    similarity_matrix = cosine_similarity(iso_embeddings, sop_embeddings)

    # Take the true row-wise maximum. Starting the search from a score of 0
    # would report an arbitrary SOP clause whenever every similarity is <= 0.
    # argmax returns the first maximum, so ties still go to the earliest SOP clause.
    best_sop_indices = similarity_matrix.argmax(axis=1)
    best_scores = similarity_matrix.max(axis=1)

    matches = [
        {
            "ISO ID": iso_clause["id"],
            "ISO Text": iso_clause["text"],
            "Best SOP ID": sop_clauses[int(sop_index)]["id"],
            "Best SOP Text": sop_clauses[int(sop_index)]["text"],
            "Similarity Score": round(float(score), 4),
        }
        for iso_clause, sop_index, score in zip(iso_clauses, best_sop_indices, best_scores)
    ]

In [None]:
import pandas as pd

# One row per ISO clause, columns taken from the keys of each match record.
match_df = pd.DataFrame(data=matches)

# Preview the first five rows (rows are in ISO clause order, not ranked by score).
display(match_df.head())