In [1]:
# Show the kernel's working directory: the relative paths used by this notebook
# ("./SOPs", "cleaned_sopsv2.json") are resolved against it.
import os
print(" Current working directory:", os.getcwd())

 Current working directory: C:\Users\DELL\Desktop\Horizon17\sopcompliance


In [27]:
import os
import re 
import json 
from collections import Counter 
import fitz
from nltk.tokenize import sent_tokenize

In [28]:
# The imports cell only does `from nltk.tokenize import sent_tokenize`, which does
# not bind the name `nltk`; import the package here so this cell does not raise
# NameError when the notebook is run top-to-bottom on a fresh kernel.
import nltk

# Fetch the Punkt sentence-tokenizer data (a no-op when it is already up to date).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at ``path`` as one string.

    The document is opened with ``fitz.open`` and closed again when the
    ``with`` block exits. Each page contributes ``page.get_text()``; the
    per-page strings are joined with a single newline.
    """
    with fitz.open(path) as pdf:
        page_texts = [pg.get_text() for pg in pdf]
    return "\n".join(page_texts)

In [29]:
def clean_text(text):
    """Normalise raw document text into one cleaned sentence per line.

    Steps:
      1. Every character that is not an ASCII letter, a digit, one of
         ``. , : ; ( ) -`` or whitespace is replaced by a space. Note that this
         also replaces accented / non-ASCII letters and quotes.
      2. Runs of newlines, carriage returns and tabs are replaced by one space,
         so the original line structure is discarded.
      3. The text is split with ``sent_tokenize``; each sentence is stripped and
         kept only if it is longer than 10 characters.

    Returns the kept sentences joined with a newline (an empty string when
    nothing is kept).
    """
    # Whitelist filter: anything outside the allowed set becomes a space.
    text = re.sub(r"[^a-zA-Z0-9.,:;()\-\n\s]", " ", text)
    # Flatten line breaks and tabs. A later "collapse repeated newlines" pass
    # used to follow this line, but no newline survives this substitution, so
    # that pass could never match and has been removed.
    text = re.sub(r"[\n\r\t]+", " ", text)

    kept = []
    for sentence in sent_tokenize(text):
        sentence = sentence.strip()
        # Drop fragments of 10 characters or fewer.
        if len(sentence) > 10:
            kept.append(sentence)
    return "\n".join(kept)

In [20]:
def remove_h_f(text,threshold = 2):
    """Drop lines that repeat too often, plus blank lines.

    ``text`` is split on newlines and each distinct line is counted (exact
    string match). A line is kept only if it occurs at most ``threshold``
    times and is not empty after stripping whitespace. The surviving lines
    are returned joined with a newline, in their original order.
    """
    rows = text.split("\n")
    frequency = Counter(rows)

    def keep(row):
        # Same test as before: frequency check first, then the blank-line check.
        return frequency[row] <= threshold and row.strip() != ''

    return "\n".join(filter(keep, rows))

In [21]:
def extract_and_clean(filepath):
    """Extract the text of one document and return its cleaned form.

    Only PDF files are supported. The extension is taken from the file name
    itself (case-insensitively), so dots in directory names such as ``./SOPs``
    are not mistaken for an extension.

    Args:
        filepath: Path to the document.

    Returns:
        The text after ``clean_text`` followed by ``remove_h_f``.

    Raises:
        ValueError: If the file is not a PDF (including files with no
            extension).
    """
    # Look only at the last path component; splitting the whole path on "."
    # would turn "./SOPs/README" into the bogus extension "/sops/readme".
    base = os.path.basename(filepath)
    ext = base.rsplit('.', 1)[1].lower() if '.' in base else ''

    if ext != 'pdf':
        raise ValueError(f"Unsupported file format: {ext or '(none)'}")

    raw_text = extract_text_from_pdf(filepath)
    cleaned = clean_text(raw_text)
    # NOTE(review): clean_text returns one sentence per line, so remove_h_f here
    # filters repeated *sentences*, not repeated raw page lines - confirm that is
    # the intended header/footer behaviour.
    return remove_h_f(cleaned)

In [22]:
def process_and_save_sops_to_json(folder_path, output_json_path):
    """Clean every candidate document in a folder and dump the results as JSON.

    Files whose names end in ``.pdf``, ``.docx`` or ``.txt`` (case-insensitive)
    are passed to ``extract_and_clean``. Each success is stored as
    ``{"filename": ..., "text": ...}``; a failure is reported with an
    ``[ERROR]`` line and the file is skipped, so one bad document does not
    abort the batch.

    Args:
        folder_path: Directory to scan (not recursive).
        output_json_path: Destination file; written as UTF-8 JSON with
            non-ASCII characters left unescaped.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sort so the records in the
    # output file come out in the same order on every run and platform.
    for fname in sorted(os.listdir(folder_path)):
        # NOTE(review): extract_and_clean currently rejects .docx/.txt, so those
        # files always end up in the [ERROR] branch below.
        if not fname.lower().endswith(('.pdf', '.docx', '.txt')):
            continue
        path = os.path.join(folder_path, fname)
        try:
            text = extract_and_clean(path)
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next file.
            print(f"[ERROR] Could not process {fname}: {e}")
            continue
        results.append({
            "filename": fname,
            "text": text
        })
        print(f"[INFO] Processed {fname}")

    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"[INFO] Saved cleaned SOPs to {output_json_path}")


In [30]:
# Driver cell: clean every SOP found in the folder below and write all results
# to a single JSON file. The recorded output shows this guard passes when the
# cell is run in the notebook.
if __name__ == "__main__":
    folder = "./SOPs"  # Folder with SOP files (relative to the working directory)
    output_json = "cleaned_sopsv2.json"  # Output file (relative to the working directory)
    process_and_save_sops_to_json(folder, output_json)

[INFO] Processed 300-POL-001_EISP Policy.pdf
[INFO] Processed 31_Política de Seguridad de la Información_EN.pdf
[INFO] Processed FIVE_Information-Secuirty-Policy.pdf
[INFO] Processed ICS_Policy_2023-NPS_Trust.pdf
[INFO] Processed Information security and DP incident reporting 1.5.pdf
[INFO] Processed Information-Security-Policy-godrej.pdf
[INFO] Processed Information-Security-Policy_230209_EN.pdf
[INFO] Processed Information-Security_Policy-Statement.pdf
[INFO] Processed InformationSecurityPolicy-godfreyphillips.pdf
[INFO] Processed informationsecuritypolicy-uni-of-liverpool.pdf
[INFO] Processed Information_Security_Policy_paytm.pdf
[INFO] Processed PDS-Information-Security-Policy-v2.0.pdf
[INFO] Processed rti-information_security_policy_2023.docx_1.pdf
[INFO] Processed SOP1.pdf
[INFO] Processed SOP2.pdf
[INFO] Processed SOP3.pdf
[INFO] Saved cleaned SOPs to cleaned_sopsv2.json
