In [None]:
# Install the third-party packages used below (langchain is imported directly;
# openai is presumably the backend needed by ChatOpenAI — confirm).
# NOTE(review): no versions are pinned, so a fresh install may pull releases whose
# APIs differ from the ones the cells below were written against.
%pip install langchain
%pip install openai

In [1]:
import datetime
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.chat_models import ChatOpenAI
# Import Taylor's retriever
from src.OfCounselRetriever import SimilarityOfCounselRetriever
from src.tracking_inputs_outputs import DbInputsOutputs

{
"answer": "Businesses that collect personal information have several general duties. They must inform consumers at or before the point of collection about the business or commercial purpose for collecting, selling, or sharing personal information. They also need to inform consumers about the categories of personal information that the business collected about them.",
"sources": ["CIV1798.100.20204", "CIV1798.110.20207_a_3", "CIV1798.110.20207_c_3", "CIV1798.100.20204_a", "CIV1798.115.20208_a_1"]
}


In [7]:
import os

# The relative "data/..." paths used throughout this notebook are resolved
# against the working directory, so switch to the project root first.
# The root used to be hard-coded to one developer's machine. It can now be
# overridden with the OFCOUNSEL_PROJECT_ROOT environment variable; when the
# variable is not set, the original location is used, so existing setups keep working.
PROJECT_ROOT = os.environ.get('OFCOUNSEL_PROJECT_ROOT', '/Users/gr703z/launchable/gleb')
os.chdir(PROJECT_ROOT)

In [8]:
# Open the object for tracking inputs and outputs
# (DbInputsOutputs is imported from src.tracking_inputs_outputs and is given the path of a
# CSV file — presumably the file that backs the log; confirm in that module).
db_inputs_outputs = DbInputsOutputs("data/tracking_inputs_outputs.csv")
# Print the runs recorded so far (query, policy_descr, llm, retriever, answer, notes, ...)
db_inputs_outputs.print_data()

                                               query        policy_descr  \
0  ACT as a corporate attorney whose job is to ch...  Section 9 original   
1  ACT as a corporate attorney whose job is to ch...  Section 9 original   
2  ACT as a corporate attorney whose job is to ch...  Section 9 original   
3  ACT as a corporate attorney whose job is to ch...  Section 9 redacted   
4  ACT as a corporate attorney whose job is to ch...  Section 9 redacted   

                     llm                    retriever  \
0  gpt-4, temperature .2              Basic retriever   
1   gpt-4, temperature 0              Basic retriever   
2   gpt-4, temperature 0  Taylor retriever Similarity   
3   gpt-4, temperature 0              Basic retriever   
4   gpt-4, temperature 0  Taylor retriever Similarity   

                                              answer  notes  \
0  {\n  "answer": "The provided policy appears to...    NaN   
1  {\n  "answer": "The policy section does not co...    NaN   
2  {\n"answ

In [9]:
# Function to split a markdown file into sections by header of 1st level (e.g. # Header)

def split_md_sections(filename, encoding="utf-8"):
    """Split a markdown file into sections, one per first-level header.

    A new section starts at every line that begins with "# " (hash followed by
    a space). Deeper headers ("## ", "### ", ...) do not start a new section
    and stay inside the section they appear in. Any text that precedes the
    first "# " header becomes a leading section of its own.

    The check is purely line-based: a line starting with "# " inside a fenced
    code block is treated as a header too.

    Args:
        filename: Path of the markdown file to read.
        encoding: Text encoding used to read the file. Defaults to UTF-8 so
            that non-ASCII characters are decoded the same way on every
            platform instead of depending on the locale's default encoding.

    Returns:
        A list of strings, one per section, in file order. Each section
        includes its header line and keeps the original line endings.
        An empty file yields an empty list.
    """
    sections = []
    current_lines = []

    # Stream the file line by line and collect the lines of the current
    # section in a list; joining once per section avoids repeated string
    # concatenation on long documents.
    with open(filename, 'r', encoding=encoding) as f:
        for line in f:
            if line.startswith("# ") and current_lines:
                # A new top-level header closes the section collected so far.
                sections.append("".join(current_lines))
                current_lines = []
            current_lines.append(line)

    # Flush the last section (the file does not end with a header).
    if current_lines:
        sections.append("".join(current_lines))

    return sections

# Example usage
sections = split_md_sections("data/RecRoom_complete.md")
# Bare last expression: the notebook displays the first section returned by the splitter
sections[0]

'# Privacy Policy\n\nEffective: July 25, 2023\n\nRec Room Inc., a Delaware corporation ("Company", "we", "our", and their derivatives) provides the websites, [http://www.recroom.com](http://www.recroom.com/), [http://www.rec.net](http://www.rec.net/), and the subdomains of each of the foregoing (collectively, the "Website") and the Rec Room® video game (the "Game" and, with the Website, the "Services").\n\nThis Policy sets forth how we collect, use, protect, store, and otherwise process your Personal Information (defined below). This Policy does NOT apply to information we collect offline or you provide to or is collected by any third party (except as otherwise provided below).\n\nFor our practices regarding children, please see the Children\'s section in Section 2 below.\n\n'

# Create a RAG

In [10]:
# create a chat model using openAI's gpt-4 model
# temperature is pinned to 0; this model is handed to create_qa_with_sources_chain below
chat = ChatOpenAI(model_name='gpt-4', temperature=0)

In [11]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.chains import create_qa_with_sources_chain
from langchain.chains import RetrievalQA

# LLM chain that answers a question and reports its sources, built on the gpt-4 chat model
qa_chain = create_qa_with_sources_chain(chat)


# How each retrieved document is rendered before it is handed to the LLM chain:
# its text (page_content) followed by its `source` field
# (presumably `source` comes from the document metadata — confirm in the retriever).
doc_prompt = PromptTemplate(
    template="Relevant piece from the legal code: \n {page_content} \nSource: {source}",
    input_variables=["page_content", "source"],
)

# Combines the rendered documents and passes them to qa_chain under the
# prompt variable named "context"
final_qa_chain = StuffDocumentsChain(
    llm_chain=qa_chain,
    document_variable_name="context",
    document_prompt=doc_prompt,
)

# Full RAG pipeline: SimilarityOfCounselRetriever (from src.OfCounselRetriever) supplies
# the documents, final_qa_chain turns them plus the query into an answer
retrieval_qa = RetrievalQA(
    retriever=SimilarityOfCounselRetriever(),
    combine_documents_chain=final_qa_chain
)

In [12]:
def run_section_through_chain(pre_prompt: str, policy_section: str, retrieval_qa: RetrievalQA):
    """Send one policy section, preceded by the instructions, through the retrieval QA chain.

    Args:
        pre_prompt: Instruction text placed at the very start of the query.
        policy_section: Text of the policy section to assess; it is placed
            under a "*** Policy section ***" marker line.
        retrieval_qa: Chain whose ``run`` method receives the assembled query.

    Returns:
        Whatever ``retrieval_qa.run`` returns for the assembled query.
    """
    # The four-space indents and the final whitespace-only line are part of the
    # query text exactly as it is sent to the chain, so they are spelled out here.
    query_lines = (
        f"{pre_prompt}",
        "",
        "    *** Policy section ***",
        f"    {policy_section}",
        "",
        "    ",
    )
    return retrieval_qa.run("\n".join(query_lines))


# Test the RAG on a single section

In [15]:
# Instruction text that is placed in front of every policy section sent to the chain.
# The same text is what gets stored under 'query' when a run is logged to the tracking
# object, so any edit here changes both the model input and the logged value.
pre_prompt_analysis_only = """ACT as a corporate attorney whose job is to check the compliance of the company's policy with the state's new laws.

Your task is to identify if the provided policy section complies with the new law.

If the policy section complies with the new law, then say so and do nothing more.

Otherwise, if the policy section does not comply with the law, make a general assessment how bad is the violation, and make a numbered list of what is missing from the policy or what is wrong with the policy, prioritizing the most important issues first. Make sure to mention the name and/or the number of the policy section you are assessing.

Note on time limits: if the policy promises shorter times to react or respond to something than the law requires, eg the company responds within 30 days while 45 days is required by law, it is NOT a violation since sooner is better, and 30 is less than 45.

For example, if the policy is "The company's address is 123 Main Street, New York, NY 10001. The company's phone number is 212-555-1234." and the new law is "You have to specify the company's address, phone number, and email address.", then the missing part is "company's email", and so the policy is not fully compliant.

As another example, if the entire policy is "The company may collect the following types of personal information: name, address, phone number, email address, and credit card number." and the new law is "You have to specify a mechanism for consumers to delete their personal information.", then the missing part is "mechanism for consumers to delete their personal information", and so the policy is not fully compliant.

As another example, if the policy is "the company will respond to a verifiable consumer request within 30 days of its receipt" and the law is "the company is required to respond within 45 days of receiving a verifiable consumer request", then the policy is fully compliant because 30 is less than 45.
 """


In [13]:
# Take one section from the recroom policy
RecRoom_policy_splits = split_md_sections("data/RecRoom_complete.md")
# Element 0 is the "# Privacy Policy" preamble, so index 9 is the section whose
# header reads "# 9. Additional Notice for California Residents"
policy_section = RecRoom_policy_splits[9]
print(policy_section)

# 9. Additional Notice for California Residents

The following applies to California residents pursuant to the California Consumer Privacy Act of 2018 ("CCPA"):

- In the preceding 12 months, the Company may have disclosed the categories of Personal Information listed in Section 2 above to our service providers listed in Section 6 above for business purposes.
- In the preceding 12 months, the Company has not sold Personal Information. The Company only discloses Personal Information to service providers.
- You have the right to request that the Company disclose certain information to you about our collection and use of your Personal Information over the past 12 months. Once we receive and confirm your verifiable consumer request, we will disclose to you, to the extent retained by us:
  - The categories of Personal Information we collected about you.
  - The categories of sources for the Personal Information we collected about you.
  - Our business or commercial purpose for collecting or

In [16]:
# a9: the chain's answer for the section currently held in `policy_section`
# (section 9 of the policy when the cells are run in the intended order).
# NOTE(review): this depends on `pre_prompt_analysis_only`, `policy_section` and
# `retrieval_qa` already being defined by the cells above.
a9 = run_section_through_chain(pre_prompt_analysis_only, policy_section, retrieval_qa)
print(a9)

{
  "answer": "The policy section '# 9. Additional Notice for California Residents' is generally compliant with the new law. However, there are a few areas that need to be addressed to ensure full compliance:

1. The policy does not explicitly mention the obligation to inform all individuals responsible for handling consumer inquiries about the business’s privacy practices or the business’s compliance with the title, as required by Sections 1798.130.202012_a_3_B_iii_5_C_ii_6 and 1798.135.202013_c_3 of the legal code. This is a significant omission and should be addressed immediately.

2. The policy does not clearly state that the company will disclose and deliver the required information to a consumer free of charge, correct inaccurate personal information, or delete a consumer’s personal information, based on the consumer’s request, within 45 days of receiving a verifiable consumer request from the consumer, as required by Section 1798.130.202012_a_2 of the legal code. Although the po

In [17]:
# save inputs and outputs to disk
# NOTE(review): whether add_data persists immediately is decided inside DbInputsOutputs,
# which is not visible here — confirm that the CSV is actually written at this point.
db_inputs_outputs.add_data({'query': pre_prompt_analysis_only,
                            'policy_descr': "Section 9 original",
                            'llm': "gpt-4, temperature 0",
                            # Spelling fixed ("retirever" -> "retriever") so this label matches
                            # the rows already recorded as "Taylor retriever Similarity".
                            'retriever': "Taylor retriever Similarity",
                            'answer': a9,
                            'notes': "",
                            "timestamp":  datetime.datetime.now()
                            })

# All sections of policy

Run each section of the policy through the chain and save the answers together with their sources. Then condense the per-section answers into a single overall summary.

In [21]:
import pickle
import time

In [18]:
# get recroom policy splits
RecRoom_policy_splits = split_md_sections("data/RecRoom_complete.md")
# number of sections returned by the splitter
print(len(RecRoom_policy_splits))
# escape problematic markdown characters
# NOTE(review): the two lines below are disabled experiments; `escape_markdown` is not
# defined anywhere in the visible part of this notebook.
# RecRoom_policy_splits = [escape_markdown(x) for x in RecRoom_policy_splits]
# RecRoom_policy_splits = RecRoom_policy_splits[9:11]
# sanity check: show the first section
print(RecRoom_policy_splits[0])

12
# Privacy Policy

Effective: July 25, 2023

Rec Room Inc., a Delaware corporation ("Company", "we", "our", and their derivatives) provides the websites, [http://www.recroom.com](http://www.recroom.com/), [http://www.rec.net](http://www.rec.net/), and the subdomains of each of the foregoing (collectively, the "Website") and the Rec Room® video game (the "Game" and, with the Website, the "Services").

This Policy sets forth how we collect, use, protect, store, and otherwise process your Personal Information (defined below). This Policy does NOT apply to information we collect offline or you provide to or is collected by any third party (except as otherwise provided below).

For our practices regarding children, please see the Children's section in Section 2 below.




In [None]:
# Loop over the splits: run every section of the policy through the chain and
# collect the answers in order.

# pause to avoid hitting the TPM limit. I've got the limit of 10KTPM-200RPM
PAUSE_BETWEEN_SECTIONS_S = 10

all_answers_list = []

# The loop variable is deliberately NOT named `policy_section`: that global holds the
# single section picked for the one-off test, and overwriting it here would make a
# later re-run of the single-section cells silently analyse the last section of the
# policy while still logging it as "Section 9".
for i, section_text in enumerate(RecRoom_policy_splits):
    print(f"Processing chunk {i}...")
    answer = run_section_through_chain(pre_prompt_analysis_only, section_text, retrieval_qa)
    # append to all_answers
    all_answers_list.append(answer)
    time.sleep(PAUSE_BETWEEN_SECTIONS_S)


# save the list to the disk
with open("data/all_answers_list.pkl", "wb") as f:
    pickle.dump(all_answers_list, f)


In [22]:
# Build one text blob out of the per-section answers in all_answers_list

# The condition is a manual switch that is currently always on: the answers are
# re-read from the pickle file instead of relying on the in-memory list.
# NOTE: pickle.load can execute code embedded in the file — only load pickles
# that this notebook wrote itself.
if True:
    with open("data/all_answers_list.pkl", "rb") as f:
        all_answers_list = pickle.load(f)

# One titled block per chunk, numbered by position, separated by newlines
all_answers = "\n".join(
    f"__Assessment of chunk {chunk_no}__\n{chunk_answer}\n"
    for chunk_no, chunk_answer in enumerate(all_answers_list)
)

print(all_answers)

__Assessment of chunk 0__
{
  "answer": "The provided policy section does not comply with the new law. The violation is significant as it lacks several key elements required by the California Consumer Privacy Act of 2018. Here are the main issues, prioritized by importance:

1. The policy does not specify two or more designated methods for consumers to submit requests for information disclosure, deletion, or correction (Section 1798.130 (a)(1)).
2. The policy does not mention any measures to protect the fundamental privacy rights of natural persons with respect to the use of their personal information (Section 1798.185 (c)).
3. The policy does not provide guidance to consumers regarding their rights under this title (Section 1798.185 (e)).
4. The policy does not establish a mechanism for persons doing business in California to voluntarily certify that they are in compliance with this title (Section 1798.185 (j)).
5. The policy does not mention any measures to promote public awareness a

In [23]:
# create summarization model
# NOTE(review): no temperature is passed here, so the library default applies, whereas
# the `chat` model used for the per-section analysis pins temperature=0 — confirm this
# difference is intended.
llm_for_summary = ChatOpenAI(model_name='gpt-4')

In [26]:
# Summarize the sections
# Two-message conversation: a system message that sets the attorney role, and a human
# message that carries the instructions plus every per-section assessment (all_answers).
# NOTE(review): all assessments are concatenated into a single message — check that the
# total still fits the model's context window for longer policies.
messages = [
    SystemMessage(content="ACT as a corporate attorney whose job is to check the compliance of the company's policy with the state's new laws.  "),
    HumanMessage(content="Summarize the assessments of different sections of the policy into a single assessment, referencing individual sections of the policy as necessary. Make sure to include the relevant numbers of legal code sections for the human attorney to be able to verify. The assessment of individual sections are listed below as __Assessment of chunk x__: \n" + all_answers + "\n *** End of chunks *** \n Summarize the assessment of of indivudual chunks. ")
]
# Calling the chat model with the message list returns a message object;
# its text is read from `.content` in the next cell.
summary_answer = llm_for_summary(messages)

In [27]:
# Show the text of the summary message returned by llm_for_summary
print(summary_answer.content)

In summary, the overall assessment reveals that the company's policy fails to fully comply with the new law in multiple sections. Here are the main issues that need immediate attention:

1. The policy does not specify two or more designated methods for consumers to submit requests for information disclosure, deletion, or correction. This is a clear violation of Section 1798.130 (a)(1) of the California Consumer Privacy Act of 2018. 

2. The policy does not provide proper guidance to consumers regarding their rights under this title (Section 1798.185 (e) and lacks a mechanism for businesses operating in California to voluntarily certify their compliance with this title (Section 1798.185 (j)).

3. The policy does not specify the length of time the business intends to retain each category of personal information, as required by Section 1798.100 (3) and does not provide a clear mechanism for consumers to delete their personal information as required by Section 1798.105.

4. The policy does