### 1. Read the DPTM and NIST processed files

In [2]:
import ast

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Input CSVs read into DataFrames by the next cell. Going by the file names,
# these hold the NIST Privacy Framework requirements and the DPTM checklist,
# each with precomputed embedding columns.
fw_file = "NIST-Privacy-Framework-structured-withcontextFWReq-withEmbedding.csv"
reg_file = "DPTM_Checklist_Full-withEmbedding.csv"

In [None]:
# Load both pre-processed tables. "Unnamed: 0" is the column pandas produces
# when a CSV was written with its default index; drop it when present.
# errors="ignore" keeps this cell working for files saved without that index
# column, and chaining (instead of inplace=True) makes each frame the result of
# a single expression.
df_fw = pd.read_csv(fw_file).drop(columns=["Unnamed: 0"], errors="ignore")
df_reg = pd.read_csv(reg_file).drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
df_fw.shape  # (rows, columns); each row is one framework requirement

In [None]:
# Preview the first two framework rows.
df_fw.head(2)

In [None]:
df_reg.shape  # (rows, columns); each row is one regulation (DPTM checklist item)

In [None]:
# Preview the first two DPTM rows.
df_reg.head(2)

### 2. Calculate cosine similarity of the embeddings

In [None]:
# The embedding columns come back from the CSV as strings holding a Python list
# literal; parse every cell back into a real list. `ast.literal_eval` only
# evaluates literals, so (unlike `eval`) it cannot execute code found in the
# file. `ast` is imported in the notebook's import cell.
dptm_vectors = [
    ast.literal_eval(cell) for cell in df_reg["Description Embedding"]
]
nist_vectors = [
    ast.literal_eval(cell)
    for cell in df_fw["Contextualized Framework Requirement Embedding"]
]

In [None]:
# Pairwise cosine similarity between the two embedding sets: row i corresponds
# to dptm_vectors[i] and column j to nist_vectors[j], so the shape shown below
# is (len(dptm_vectors), len(nist_vectors)).
similarity_matrix = cosine_similarity(dptm_vectors,nist_vectors)
similarity_matrix.shape

In [None]:
# Repeat every DPTM row 5 times, back to back, so rows i*5 .. i*5+4 all carry
# DPTM row i; the mapping step fills each copy with one of its top-5 NIST
# matches. Repeating by position with .iloc keeps the original column dtypes,
# whereas going through `df_reg.values` coerces every column of a mixed-type
# frame to `object`. reset_index gives the 0..N-1 labels the mapping step
# addresses rows by.
row_positions = np.repeat(np.arange(len(df_reg)), 5)
df_new = df_reg.iloc[row_positions].reset_index(drop=True)
df_new.head(1)

### 3. Map each DPTM requirement to its top 5 most similar NIST framework requirements

In [None]:
# Number of NIST matches kept per DPTM requirement. This must equal the repeat
# factor used when df_new was built (5 consecutive rows per DPTM requirement).
top_k = 5

# Framework columns copied onto every mapping row, in output order.
fw_columns = [
    'Framework Requirement Description',
    'Contextualized Framework Requirement Description',
    'Category Code',
    'Function Code',
]

# For every DPTM row, the column indices of its top_k largest similarities,
# ordered from most to least similar (argsort is ascending, hence the reversal).
top_fw_idx = np.argsort(similarity_matrix, axis=1)[:, -top_k:][:, ::-1]
top_scores = np.take_along_axis(similarity_matrix, top_fw_idx, axis=1)

# Row-major flattening matches df_new's layout: position i*top_k + rank holds
# the rank-th best match of DPTM row i.
flat_fw_idx = top_fw_idx.ravel()
if len(flat_fw_idx) != len(df_new):
    # Happens when df_fw has fewer than top_k rows, or df_new was built with a
    # different repeat factor; fail loudly instead of leaving rows unfilled.
    raise ValueError(
        f"expected {len(df_new)} matches to fill df_new, got {len(flat_fw_idx)}"
    )

# Whole-column assignment replaces the per-row scalar .loc writes.
df_new['Reg-FW-Similarity-Score'] = top_scores.ravel()
for column in fw_columns:
    df_new[column] = df_fw[column].iloc[flat_fw_idx].to_numpy()

In [None]:
# Spot-check: the first five rows are the five matches written for DPTM row 0.
df_new.head()

In [None]:
# Optional, currently disabled: uncomment the next line to leave these columns
# out of the exported file.
#df_new.drop(labels=["Policy Type", "Policy Documents","Description Embedding"],axis=1,inplace=True)
# Persist the mapping; to_csv also writes the row index as the first column.
df_new.to_csv("DPTM_to_NIST_Mapping.csv")
df_new.head()

In [3]:
# Reload the exported mapping from disk. index_col=0 turns the first CSV column
# (the row index written by to_csv) back into the index instead of a data column.
# NOTE(review): this rebinds df_new, replacing any in-memory result from the
# cells above — presumably so the mapping can be inspected without recomputing.
df_new = pd.read_csv("DPTM_to_NIST_Mapping.csv", index_col=0)
df_new.head()

Unnamed: 0,Category,Title,Description,Policy Type,Policy Documents,Description Embedding,Reg-FW-Similarity-Score,Framework Requirement Description,Contextualized Framework Requirement Description,Category Code,Function Code
0,Governance and Transparency,Establish data protection policies and practices,Organisation shall have data protection polici...,Internal,,"[0.00333080324344337, 0.02203933335840702, 0.0...",0.606939,"Policies, processes, and procedures for enabli...","Establish and maintain policies, processes, an...","Data Processing Policies, Processes, and Proce...",CONTROL-P (CT-P)
1,Governance and Transparency,Establish data protection policies and practices,Organisation shall have data protection polici...,Internal,,"[0.00333080324344337, 0.02203933335840702, 0.0...",0.601434,Policy and regulations regarding the physical ...,Ensure organizational policies and regulations...,"Data Protection Policies, Processes, and Proce...",PROTECT-P (PR-P)
2,Governance and Transparency,Establish data protection policies and practices,Organisation shall have data protection polici...,Internal,,"[0.00333080324344337, 0.02203933335840702, 0.0...",0.597835,Organizational privacy values and policies (e....,Establish and communicate organizational priva...,"Governance Policies, Processes, and Procedures...",GOVERN-P (GV-P)
3,Governance and Transparency,Establish data protection policies and practices,Organisation shall have data protection polici...,Internal,,"[0.00333080324344337, 0.02203933335840702, 0.0...",0.590087,Protection processes are improved.,Enhance protection processes to align with sec...,"Data Protection Policies, Processes, and Proce...",PROTECT-P (PR-P)
4,Governance and Transparency,Establish data protection policies and practices,Organisation shall have data protection polici...,Internal,,"[0.00333080324344337, 0.02203933335840702, 0.0...",0.584986,Privacy procedures are included in human resou...,Integrate privacy procedures within human reso...,"Data Protection Policies, Processes, and Proce...",PROTECT-P (PR-P)
