### 1. Clean up the NIST Privacy Framework Excel workbook

In [2]:
import pandas as pd
# Relative path of the NIST Privacy Framework core workbook that the cleaning cell reads.
framework_file = "NIST-Privacy-Framework-V1.0-Core.xlsx"

In [None]:
# Read the sheet into its own variable so that re-running this cell always
# starts from the workbook as loaded, never from a half-cleaned frame.
df_fw_raw = pd.read_excel(framework_file, sheet_name="Privacy Framework Core")

# Skip the first two rows and keep the sheet columns at positions 1, 3 and 5.
# These are exactly the columns the earlier chain of positional drops left
# behind (every drop shifted the positions used by the next one).
df_fw = df_fw_raw.iloc[2:, [1, 3, 5]].copy()
df_fw.columns = ["Function", "Category", "Framework Requirement"]

# Blank cells inherit the value above them (presumably merged cells in the
# workbook -- confirm). DataFrame.ffill() replaces fillna(method="ffill"),
# which is deprecated since pandas 2.1.
df_fw = df_fw.ffill()

# Every kept column is split at its first colon into "<name> Code" and
# "<name> Description"; only the description is stripped of whitespace.
# The source column is dropped right after its split so the final column
# order is: Function, Category, Framework Requirement (code then description).
for source_column in ["Function", "Category", "Framework Requirement"]:
    parts = df_fw[source_column].str.split(":", n=1, expand=True)
    df_fw[f"{source_column} Code"] = parts[0]
    df_fw[f"{source_column} Description"] = parts[1].str.strip()
    df_fw = df_fw.drop(columns=source_column)

df_fw = df_fw.reset_index(drop=True)
print(df_fw.shape)
# write cleaned up dataframe (the index is written as the first, unnamed column)
df_fw.to_csv("NIST-Privacy-Framework-structured.csv")
df_fw.head()

### 2. Generate Contextualized Framework Requirements

In [None]:
import os
import openai
import json
import tiktoken
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# NOTE(review): pandas documents None as the "unlimited" value for
# display.max_colwidth; 0 is kept unchanged here -- confirm it gives the
# intended (untruncated) display.
pd.set_option('max_colwidth', 0)

from pypdf import PdfReader

# Never write a key into the notebook. Use the key already exported in the
# environment and only prompt (without echo) when none is set, instead of
# overwriting the variable with a hard-coded value.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

from openai import OpenAI
# No key is passed explicitly: the client is expected to pick up OPENAI_API_KEY
# from the environment prepared above.
client = OpenAI()
encoding = tiktoken.get_encoding("cl100k_base")

def call_chatgpt4(prompt):
    """Send ``prompt`` to the chat model and return the text of its reply.

    The request is a two-message conversation: a fixed system message that
    sets the compliance-mapping persona, followed by ``prompt`` as the user
    message. Sampling settings are pinned (temperature 0.0, seed 42,
    top_p 0.98) and the reply is capped at 4096 tokens.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The ``content`` of the first choice's message.
    """
    system_message = {
        "role": "system",
        "content": "I am compliance manager and interested in mapping framework requirements from NIST frameworks to corresponding regulations.",
    }
    user_message = {"role": "user", "content": prompt}

    request_options = dict(
        model="gpt-4-turbo",
        messages=[system_message, user_message],
        temperature=0.0,
        seed=42,
        top_p=0.98,
        max_tokens=4096,
    )
    # Uses the module-level ``client`` created in the setup cell.
    completion = client.chat.completions.create(**request_options)

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def get_contextualized_frameworkrequirements(framework_requirements, functional_objective, category):
    """Build the prompt that asks for a contextualized framework requirement.

    The three arguments are substituted into a fixed template with
    ``str.format``. The template's leading indentation and trailing spaces
    are part of the string literal, so they are part of the returned prompt.

    Args:
        framework_requirements: Text of the framework requirement.
        functional_objective: Text of the function the requirement is tagged under.
        category: Text of the category the requirement is tagged under.

    Returns:
        The formatted prompt string.
    """
    template = """An item framework requirement from the NIST Privacy Framework is provided below:
    {framework_requirements}
    
    This framework requirement is tagged under the function objective:
    {functional_objective} 
    
    and  category:
    {category} 

    Please give a short succinct contextualized framework requirements that captures the requirement within the context of function 
    and category. Make sure to include all details from the requirement. 
    The contextualized framework requirement will be used for the purposes of improved embedding to enable better mapping with regulations. 
    Answer only with the succinct contextualized framework requirement and nothing else. 
    """
    substitutions = {
        "framework_requirements": framework_requirements,
        "functional_objective": functional_objective,
        "category": category,
    }
    return template.format(**substitutions)

def call_contextfw_func(row):
    """Return the model's contextualized requirement for one framework row.

    Reads the requirement, function and category descriptions from ``row``,
    builds the prompt with ``get_contextualized_frameworkrequirements`` and
    returns whatever ``call_chatgpt4`` answers for it.

    Args:
        row: Mapping-like row exposing the keys
            "Framework Requirement Description", "Function Description"
            and "Category Description".

    Returns:
        The value returned by ``call_chatgpt4`` for the generated prompt.
    """
    prompt = get_contextualized_frameworkrequirements(
        row["Framework Requirement Description"],
        row["Function Description"],
        row["Category Description"],
    )
    return call_chatgpt4(prompt)

In [None]:
# One call_contextfw_func call (and therefore one chat request) per row of df_fw.
contextualized_descriptions = df_fw.apply(call_contextfw_func, axis=1)
df_fw["Contextualized Framework Requirement Description"] = contextualized_descriptions

In [None]:
# Checkpoint the frame, now including the contextualized descriptions, to CSV.
contextualized_csv_path = "NIST-Privacy-Framework-structured-withcontextFWReq.csv"
df_fw.to_csv(contextualized_csv_path)
df_fw.head()

In [None]:
# Display (rows, columns) of the frame as a quick sanity check.
df_fw.shape

### 3. Embeddings of the contextualized framework requirements

In [None]:
# The LlamaIndex LLM wrapper has the same class name, ``OpenAI``, as the
# ``openai`` SDK client imported elsewhere in this notebook. Import it under an
# alias so running this cell never rebinds ``OpenAI`` to a different class.
from llama_index.llms.openai import OpenAI as LlamaIndexOpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import  VectorStoreIndex
from llama_index.core import Settings

EMBEDDING_MODEL  = "text-embedding-3-small"
# NOTE(review): call_chatgpt4 requests "gpt-4-turbo" while this constant is
# "gpt-4" -- confirm the difference is intentional.
GENERATION_MODEL = "gpt-4"

llm = LlamaIndexOpenAI(model=GENERATION_MODEL)

# Register the LLM as LlamaIndex's global default.
Settings.llm = llm

In [None]:
# Make sure ``OpenAI`` refers to the ``openai`` SDK client class in this cell.
from openai import OpenAI

# Client used for the embedding requests; the key is read explicitly from the
# environment and a missing variable raises KeyError here.
openai_client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)
def get_embedding(client, text, model):
    """Request an embedding for ``text`` and return the first result's vector.

    Args:
        client: Object exposing ``embeddings.create(input=..., model=...)``.
        text: Input passed to the embeddings endpoint as ``input``.
        model: Embedding model name passed as ``model``.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    request = {"input": text, "model": model}
    result = client.embeddings.create(**request)
    first_item = result.data[0]
    return first_item.embedding
def call_embedding_func(row):
    """Return the embedding of one row's contextualized requirement text.

    Uses the module-level ``openai_client`` and ``EMBEDDING_MODEL``.

    Args:
        row: Mapping-like row exposing the key
            "Contextualized Framework Requirement Description".

    Returns:
        Whatever ``get_embedding`` returns for that text.
    """
    contextualized_text = row["Contextualized Framework Requirement Description"]
    return get_embedding(openai_client, contextualized_text, EMBEDDING_MODEL)

In [None]:
# One call_embedding_func call (and therefore one embeddings request) per row of df_fw.
requirement_embeddings = df_fw.apply(call_embedding_func, axis=1)
df_fw["Contextualized Framework Requirement Embedding"] = requirement_embeddings

In [None]:
# Checkpoint the frame, now including the embedding column, to CSV.
embedded_csv_path = "NIST-Privacy-Framework-structured-withcontextFWReq-withEmbedding.csv"
df_fw.to_csv(embedded_csv_path)
df_fw.head()

In [None]:
# Display (rows, columns) of the frame as a quick sanity check.
df_fw.shape

In [4]:
# Reload the saved checkpoint; the first CSV column is the index written by to_csv.
# NOTE(review): CSV stores the embedding lists as text, so this column is read
# back as strings -- parse it before any numeric use.
embedded_csv_path = "NIST-Privacy-Framework-structured-withcontextFWReq-withEmbedding.csv"
df_fw = pd.read_csv(embedded_csv_path, index_col=0)
df_fw.head()

Unnamed: 0,Function Code,Function Description,Category Code,Category Description,Framework Requirement Code,Framework Requirement Description,Contextualized Framework Requirement Description,Contextualized Framework Requirement Embedding
0,IDENTIFY-P (ID-P),Develop the organizational understanding to ma...,Inventory and Mapping (ID.IM-P),"Data processing by systems, products, or servi...",ID.IM-P1,Systems/products/services that process data ar...,Maintain a comprehensive inventory of all syst...,"[0.014797082170844078, 0.03422795608639717, 0...."
1,IDENTIFY-P (ID-P),Develop the organizational understanding to ma...,Inventory and Mapping (ID.IM-P),"Data processing by systems, products, or servi...",ID.IM-P2,"Owners or operators (e.g., the organization or...",Maintain a comprehensive inventory of all owne...,"[0.017082873731851578, 0.04562059044837952, 0...."
2,IDENTIFY-P (ID-P),Develop the organizational understanding to ma...,Inventory and Mapping (ID.IM-P),"Data processing by systems, products, or servi...",ID.IM-P3,"Categories of individuals (e.g., customers, em...",Maintain an inventory of categories of individ...,"[0.013009565882384777, 0.01754865236580372, 0...."
3,IDENTIFY-P (ID-P),Develop the organizational understanding to ma...,Inventory and Mapping (ID.IM-P),"Data processing by systems, products, or servi...",ID.IM-P4,Data actions of the systems/products/services ...,**Contextualized Framework Requirement:**\nEns...,"[-0.0032425932586193085, 0.049907196313142776,..."
4,IDENTIFY-P (ID-P),Develop the organizational understanding to ma...,Inventory and Mapping (ID.IM-P),"Data processing by systems, products, or servi...",ID.IM-P5,The purposes for the data actions are inventor...,Inventory the purposes of data actions to enha...,"[0.010523432865738869, 0.03671571612358093, 0...."
