In [2]:
from dotenv import load_dotenv
load_dotenv()
import os

In [88]:
import json
with open('data_index.json', 'r') as f:
    data_index = json.load(f)
data_index

{'1017': '1017_sci-news.com.html',
 '1028': '1028_redorbit.com.html',
 '1034': '1034_aol.com.html',
 '1050': '1050_honda.com.html',
 '105': '105_amazon.com.html',
 '1070': '1070_wnep.com.html',
 '1083': '1083_highgearmedia.com.html',
 '1089': '1089_freep.com.html',
 '1099': '1099_enthusiastnetwork.com.html',
 '1106': '1106_allstate.com.html',
 '1164': '1164_acbj.com.html',
 '1205': '1205_opensecrets.org.html',
 '1206': '1206_dcccd.edu.html',
 '1221': '1221_gwdocs.com.html',
 '1224': '1224_austincc.edu.html',
 '1252': '1252_cincymuseum.org.html',
 '1259': '1259_fool.com.html',
 '1261': '1261_zacks.com.html',
 '1264': '1264_citizen.org.html',
 '1300': '1300_bankofamerica.com.html',
 '1306': '1306_chasepaymentech.com.html',
 '133': '133_fortune.com.html',
 '135': '135_instagram.com.html',
 '1360': '1360_thehill.com.html',
 '1361': '1361_yahoo.com.html',
 '1419': '1419_miaminewtimes.com.html',
 '144': '144_style.com.html',
 '1468': '1468_rockstargames.com.html',
 '1470': '1470_steampowered

In [92]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

from langchain.llms import OpenAI
embeddings = OpenAIEmbeddings()
llm = OpenAI(temperature=0)


def make_data_path(index_number):
    return os.path.join(
        'data/OPP-115/sanitized_policies/', 
        data_index[str(index_number)],
    )

def make_db_path(input_file):
    return os.path.join('db', f'{input_file}.faiss')
    

def make_faiss_db(input_file):
    raw_documents = TextLoader(input_file).load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator='<br>')
    documents = text_splitter.split_documents(raw_documents)

    db = FAISS.from_documents(documents, embeddings)
    db.save_local(make_db_path(input_file))
    return db

# make_faiss_db('sample_terms/baseline.txt')
# make_faiss_db('sample_terms/malicious.txt')

In [15]:
input_file = 'sample_terms/baseline.txt'
db1 = FAISS.load_local(os.path.join('db', f'{input_file}.faiss'), embeddings)
input_file = 'sample_terms/malicious.txt'
db2 = FAISS.load_local(os.path.join('db', f'{input_file}.faiss'), embeddings)

In [66]:
def query_data_collected(doc_db):
    query = "What personal data is collected?"
    docs = doc_db.similarity_search(query)
    search_result = '\n'.join(d.page_content for d in docs)

    return llm.predict(f'''
   Here are fragments of a privacy policy.

    For each of the item below, check if it is collected according to the framents:           
    - email address 
    - first and last name 
    - phone number 
    - postal address 
    - zip code 
    - IP address 
    - browser used 
    - device identifier 
    - social media info 
    - contacts or connections 
    - phone book 
    - photos or videos 
    - credit card or bank account info 
    - gender 
    - date and time of visit 
    - social security number 
    - drivers license number 
    - location info 

    Each of the above should be assigned a boolean value.

    Return a YAML

    In addition, add an extra field:
    other: a string under 15 words describing what is collected other than the above
                
    fragments of a privacy policy: {search_result}
    list of personal data items:

    ''')

# query_data_collected(db1), query_data_collected(db2), 

(' email address: true\n     first and last name: true\n     phone number: true\n     postal address: true\n     zip code: true\n     IP address: true\n     browser used: true\n     device identifier: true\n     social media info: true\n     contacts or connections: true\n     phone book: true\n     photos or videos: true\n     credit card or bank account info: false\n     gender: false\n     date and time of visit: true\n     social security number: false\n     drivers license number: false\n     location info: true\n     other: Usage Data, Third-Party Social Media Service info',
 ' email address: true\n     first and last name: true\n     phone number: true\n     postal address: true\n     zip code: true\n     IP address: true\n     browser used: true\n     device identifier: true\n     social media info: true\n     contacts or connections: true\n     phone book: true\n     photos or videos: true\n     credit card or bank account info: true\n     gender: true\n     date and time of v

In [96]:
subset_data = [1017, 1028, 1034]

In [97]:
for i in subset_data:
    p = make_data_path(i)
    make_faiss_db(p)

Created a chunk of size 1066, which is longer than the specified 1000
Created a chunk of size 1010, which is longer than the specified 1000
Created a chunk of size 1249, which is longer than the specified 1000
Created a chunk of size 1267, which is longer than the specified 1000
Created a chunk of size 1419, which is longer than the specified 1000


In [98]:
for i in subset_data:
    p = make_data_path(i)
    db = FAISS.load_local(make_db_path(p), embeddings)
    r = query_data_collected(db)
    print(i, r)

data/OPP-115/sanitized_policies/1017_sci-news.com.html  email address: true
     first and last name: true
     phone number: true
     postal address: true
     zip code: true
     IP address: true
     browser used: true
     device identifier: true
     social media info: true
     contacts or connections: true
     phone book: false
     photos or videos: false
     credit card or bank account info: false
     gender: true
     date and time of visit: true
     social security number: false
     drivers license number: false
     location info: true
     other: IP address, device identifier
data/OPP-115/sanitized_policies/1028_redorbit.com.html  email address: true
     first and last name: true
     phone number: true
     postal address: true
     zip code: true
     IP address: true
     browser used: true
     device identifier: true
     social media info: true
     contacts or connections: true
     phone book: false
     photos or videos: false
     credit card or bank accou