### 1. Convert the DPTM-Checklist PDF to a tabular structure using chain-of-thought (CoT) prompting

In [1]:
import os
import openai
import json
import tiktoken
import pandas as pd
from pypdf import PdfReader
# File name of the DPTM checklist PDF whose text is extracted further down.
regulatory_doc_filename = "DPTM-Checklist.pdf"
from pypdf import PdfReader
# Never hardcode the key in the notebook: keep a key that is already exported in the
# environment, and otherwise prompt for it so the secret is not saved with the file.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass

    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

from openai import OpenAI
# OpenAI API client used by call_chatgpt4; built without arguments, so it presumably
# relies on the OPENAI_API_KEY environment variable configured above.
client = OpenAI()
# cl100k_base tokenizer; presumably kept for token counting — no active code in the
# cells shown here uses it.
encoding = tiktoken.get_encoding("cl100k_base")

In [None]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import  VectorStoreIndex
from llama_index.core import Settings

# Model names: EMBEDDING_MODEL is used for the "Description" embeddings below,
# GENERATION_MODEL for the llama_index LLM created in this cell.
EMBEDDING_MODEL  = "text-embedding-3-small"
GENERATION_MODEL = "gpt-4"

# NOTE: `OpenAI` here is the llama_index LLM class imported in this cell; that import
# rebinds the `OpenAI` name the first cell imported from the `openai` package.
llm = OpenAI(model=GENERATION_MODEL)

# Assign the LLM to llama_index's global Settings object.
Settings.llm = llm

In [None]:
def call_chatgpt4(prompt, model="gpt-4-turbo"):
    """Send ``prompt`` to an OpenAI chat model and return the reply parsed as JSON.

    The request is made in JSON mode (``response_format={"type": "json_object"}``)
    with fixed sampling settings (temperature 0.0, seed 42, top_p 0.98) and a
    4096-token completion limit.

    Args:
        prompt: Text sent as the user message, after the fixed auditor system message.
        model: Name of the chat model. Defaults to "gpt-4-turbo", the value that
            was previously hard-coded, so existing one-argument calls are unchanged.

    Returns:
        The JSON document decoded from the first choice's message content.

    Raises:
        ValueError: If the first choice carries no message content.
        json.JSONDecodeError: If the content is not valid JSON (for example when
            the reply was cut off at the ``max_tokens`` limit).
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "I am an auditor. My job is to perform regulatory assessment and compliance evaluation."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,
        seed=42,
        top_p=0.98,
        max_tokens=4096,
        response_format={"type": "json_object"},
    )

    first_choice = completion.choices[0]
    content = first_choice.message.content
    if content is None:
        # Fail with an explicit message instead of the opaque TypeError that
        # json.loads(None) would raise.
        finish_reason = getattr(first_choice, "finish_reason", None)
        raise ValueError(
            f"Model returned no message content (finish_reason={finish_reason!r})"
        )

    return json.loads(content)

In [None]:
# Checklist categories; requirements are extracted for one category at a time.
list_of_categories = [
    "Governance and Transparency",
    "Management of Personal Data",
    "Care of Personal Data",
    "Individual’s Rights",
]
def text_from_pdf(filename, page_separator="\n"):
    """Extract the text of every page of a PDF and return it as a single string.

    Args:
        filename: Path of the PDF file, passed straight to ``PdfReader``.
        page_separator: String inserted between the text of consecutive pages.
            Defaults to a newline so that the last token of one page does not
            fuse with the first token of the next.

    Returns:
        The text of all pages, in page order, joined by ``page_separator``.
    """
    reader = PdfReader(filename)
    # `or ""` keeps the join safe if the extractor yields nothing for a page.
    return page_separator.join((page.extract_text() or "") for page in reader.pages)
# Full text of the checklist PDF; it is passed into the prompt builder for every category.
regulatory_text = text_from_pdf(regulatory_doc_filename)

In [None]:
def regulatory_requirement_extraction_forcategory(regulatory_text_fromfile, category_of_interest):
    """Build the extraction prompt for one checklist category.

    The prompt embeds the full checklist text between ``<data>`` tags and asks for
    the requirements of ``category_of_interest`` only, as a JSON object with a
    ``"Checklist_requirements"`` list.

    Args:
        regulatory_text_fromfile: Text of the regulatory checklist document.
        category_of_interest: Name of the category whose requirements should be extracted.

    Returns:
        The fully formatted prompt string.
    """
    # Doubled braces ({{ }}) are literal braces in the rendered prompt; only
    # {category_of_interest} and {regulatory_text} are substituted by str.format.
    analysis_prompt = """
    You are an auditor in charge of assessing privacy regulations in Singapore. The  Data Protection Trustmark (DPTM) checklist applicable for Singapore is provided between the <data> XML like tags.
    Your objective is to extract all regulatory requirements from the document. 
    Follow the detailed instructions below to analyze regulatory document:
    
    1. Read through the entire regulatory document . 
    2. For each checklist item, identify the category of the checklist requirement, the subcategory and finally who it applies to and the corresponding policy document on which details needs to be found.  
    3. Use only the information provided in the regulatory document
    4. For each item on the checklist corresponding to ONLY the category "{category_of_interest}" extract the following information:
        Category: The principle on which the checklist requirement is based.  
        Title: The subcategory to which the checklist requirement belongs.
        Description: The exact checklist requirement from the "checklist" column of the given text. Do not summarise or combine the checklist description from multiple rows. If the same checklist is applicable to different stakeholders make sure to include each as a separate requirement with the information of the stakeholder. If there are different items that need to be checked, list each requirement as a separate row.  
        Stakeholder: The stakeholder to whom the checklist requirement applies
        Mapped Policy Document: An educated guess the policy document that would have details for the checklist requirement. If not known, specify UNKNOWN
        
        The regulatory document is provided below:

        <data>
        {regulatory_text}
        </data>

        Provide the output in the following strictly in serializable JSON format with correct indent all over the output.    
    
        {{
            "Checklist_requirements": [
            {{
            "Category" : The principle on which the checklist requirement is based as a string,
            "Title": A short 5-6 word subcategory to which the checklist requirement belongs to as a string,
            "Description" : The complete description of the checklist requirement as string,
            "Stakeholder" : The stakeholder to whom the checklist requirement applies to as a string,
            "Mapped_Policy_Document" : "The policy document name (5-6 words) to check for this requirement"
            }},
            ....
            ]
        }}
   

    """
    return analysis_prompt.format(regulatory_text=regulatory_text_fromfile,category_of_interest=category_of_interest)

In [None]:
# Run the extraction once per category and stack the per-category tables.
list_of_categories = ["Governance and Transparency","Management of Personal Data", "Care of Personal Data", "Individual’s Rights"]

# Collect one DataFrame per category and concatenate once after the loop, instead of
# growing the result with pd.concat on every iteration behind an index-based branch.
category_frames = []
for category in list_of_categories:
    prompt_withcategory = regulatory_requirement_extraction_forcategory(regulatory_text, category)
    response_json_chatgpt4 = call_chatgpt4(prompt_withcategory)

    requirements = response_json_chatgpt4["Checklist_requirements"]
    print("Num of requirements extracted")
    print(len(requirements))

    df = pd.DataFrame(requirements)
    category_frames.append(df)

# ignore_index=True renumbers the rows 0..n-1 across all categories.
df_checklist_req = pd.concat(category_frames, ignore_index=True)

df_checklist_req.shape

In [None]:
# Preview the first rows of the combined requirements table.
df_checklist_req.head()

In [None]:
# Persist the extracted requirements. The row index is written too; it is the
# "Unnamed: 0" column that the reload cell drops again.
df_checklist_req.to_csv("DPTM_Checklist_Full.csv")

### 2. Read DPTM checklist and create embeddings of "Descriptions"

In this work, we treat each checklist "Description" as a regulatory requirement.

In [None]:
# Reload the extracted checklist from disk and discard the index column that
# to_csv stored as "Unnamed: 0".
DPTM_file = "DPTM_Checklist_Full.csv"
df_reg = pd.read_csv(DPTM_file, index_col=None).drop(columns=["Unnamed: 0"])
print(df_reg.shape)
df_reg.head()

In [None]:
# Build the embeddings client from the `openai` package explicitly. By this point the
# bare name `OpenAI` has been rebound to the llama_index LLM class (imported in the
# llama_index cell), which is an LLM wrapper rather than the API client that
# get_embedding expects.
openai_client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def get_embedding(client, text, model):
    """Request an embedding for ``text`` and return the first result's vector.

    Args:
        client: Object exposing ``embeddings.create(input=..., model=...)``.
        text: Input to embed.
        model: Name of the embedding model.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    request_kwargs = {"input": text, "model": model}
    result = client.embeddings.create(**request_kwargs)
    first_item = result.data[0]
    return first_item.embedding
def call_embedding_func(row):
    """Return the embedding of ``row["Description"]``.

    Uses the module-level ``openai_client`` and ``EMBEDDING_MODEL``; intended as
    the row-wise callback for ``DataFrame.apply(..., axis=1)``.
    """
    return get_embedding(openai_client, row["Description"], EMBEDDING_MODEL)

In [None]:
# Embed every row's "Description"; call_embedding_func issues one embeddings
# request per row, so this cell makes as many API calls as there are rows.
df_reg["Description Embedding"] = df_reg.apply(call_embedding_func,axis=1)

In [None]:
# Save the checklist together with its embeddings, then preview the result.
df_reg.to_csv("DPTM_Checklist_Full-withEmbedding.csv")
df_reg.head()

In [None]:
# Sanity check: (rows, columns) of the checklist table.
df_reg.shape

In [6]:
# Reload the saved checklist with embeddings; the first CSV column is used as the index.
# NOTE(review): the "Description Embedding" values are presumably read back as text,
# not as lists of floats — confirm and parse them before any numeric use.
df_reg = pd.read_csv("DPTM_Checklist_Full-withEmbedding.csv",index_col=0)
df_reg.head()

Unnamed: 0,Category,Title,Description,Policy Type,Policy Documents,Description Embedding
0,Governance and Transparency,Establish data protection policies and practices,Organisation shall have data protection polici...,Internal,,"[0.00333080324344337, 0.02203933335840702, 0.0..."
1,Governance and Transparency,Establish data protection policies and practices,Organisation shall have data protection polici...,External,,"[0.006035863421857357, 0.01567075029015541, 0...."
2,Governance and Transparency,Establish data protection policies and practices,Organisation shall have data protection polici...,Internal,,"[-0.00540529889985919, 0.01313601341098547, 0...."
3,Governance and Transparency,Establish data protection policies and practices,Organisation shall publish and communicate the...,External,,"[0.025840099900960922, 0.014451069757342339, 0..."
4,Governance and Transparency,Establish data protection policies and practices,Organisation shall publish and communicate the...,Internal,,"[0.024895088747143745, 0.03457169979810715, 0...."
