### Package imports

In [1]:
from langchain_chroma import Chroma
from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.documents import Document
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQA
from utils.utils import pdf_bytes_to_text, clean_json_string, write_to_db, count_tokens
from dotenv import load_dotenv
import boto3
import json
import os
import uuid

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


### Configurations

In [2]:
# Load environment variables before any client is built: the cells below read
# their Azure OpenAI settings with os.getenv.
load_dotenv()

True

### Create a collection

In [3]:
# Embedding client handed to the vector store.
# Every connection setting is read from the environment.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
)

In [4]:
# Chroma collection that holds the embedded documents.
# persist_directory is where the data is saved locally; remove it if
# saving to disk is not necessary.
vector_store = Chroma(
    persist_directory="./chroma_langchain_db",
    collection_name="test_collection",
    embedding_function=embeddings,
)

### Store data in the collection

In [5]:
# S3 resource handle, then the bucket whose objects get indexed
s3 = boto3.resource("s3")
bucket_name = "thecrewo6docskeeper"
bucket = s3.Bucket(bucket_name)

In [6]:
# Load the system instructions used for metadata tagging.
# The encoding is explicit because open() otherwise falls back to the platform's
# locale encoding, which can mis-decode non-ASCII characters in the prompt.
# NOTE(review): assumes the prompt file is saved as UTF-8 - confirm.
with open("system_instructions/s3_metadata_tagging.txt", "r", encoding="utf-8") as f:
    system_instructions = f.read()

In [7]:
# Chat model client for Azure OpenAI; it is invoked once per document for
# metadata tagging and again as the LLM of the QA chain.
# Every connection setting is read from the environment.
model = AzureChatOpenAI(
    deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

In [8]:
# Go through each document in the bucket: extract its text, ask the LLM for
# metadata tags, then store text + tags in the vector store.
# Documents that cannot be processed are reported and skipped so that one bad
# object does not abort the whole run.
for obj in bucket.objects.all():
    print(f"Now working on: {obj.key}")
    raw_bytes = obj.get()['Body'].read()

    # Case-insensitive extension check so ".PDF" keys are parsed as PDFs too
    if obj.key.lower().endswith('.pdf'):
        obj_contents = pdf_bytes_to_text(raw_bytes)
    else:
        # str() on bytes returns the "b'...'" repr (escape sequences included),
        # not the text itself, so decode instead. Undecodable bytes are replaced
        # rather than raising.
        # NOTE(review): assumes non-PDF objects are UTF-8 text - confirm.
        obj_contents = raw_bytes.decode("utf-8", errors="replace")

    # Nothing to tag or embed (e.g. a zero-byte object)
    if not obj_contents or not obj_contents.strip():
        print(f"Skipping {obj.key}: no text content")
        continue

    # Create the message for the LLM
    messages = [
        ("system", system_instructions),
        ("human", obj_contents),
    ]

    # Calculate the number of tokens the request will use
    tokens = count_tokens(system_instructions + obj_contents)

    # Skip if the request is 128000 tokens or more
    # NOTE(review): presumably the model's context window - confirm.
    if tokens >= 128000:
        print(f"Skipping {obj.key}: {tokens} tokens exceeds the limit")
        continue

    # Invoke the conversation
    response = model.invoke(messages)

    # Clean and parse the JSON response, then record where the document lives
    response_clean = clean_json_string(response.content)
    try:
        response_json = json.loads(response_clean)
    except json.JSONDecodeError as exc:
        print(f"Skipping {obj.key}: LLM response is not valid JSON ({exc})")
        continue
    response_json["path"] = obj.key
    print(response_json)

    # Create a document object
    document = Document(
        page_content=obj_contents,
        metadata=response_json,
    )

    # Store in the db under an ID derived from the object's location, so a
    # re-run reuses the same ID instead of adding a copy under a new random one.
    # NOTE(review): relies on the store upserting on an existing ID - confirm.
    doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"s3://{obj.bucket_name}/{obj.key}"))
    vector_store.add_documents([document], ids=[doc_id])

    # Write data to a JSON file just for easy viewing
    write_to_db(response_json)

Now working on: CompliSpaceAssurance/HR/Internal Grievance Register.json
{'title': 'Internal Grievance Register', 'summary': 'This document provides a structured procedure for workers to lodge grievances, ensuring swift and fair resolution. It excludes specific issues covered by separate policies.', 'price': 78.45, 'has_checklist': True, 'has_policy': True, 'has_safe_operating_procedure': False, 'has_report': False, 'has_template': True, 'has_manual': False, 'path': 'CompliSpaceAssurance/HR/Internal Grievance Register.json'}
WRITING INTO DB: {'title': 'Internal Grievance Register', 'summary': 'This document provides a structured procedure for workers to lodge grievances, ensuring swift and fair resolution. It excludes specific issues covered by separate policies.', 'price': 78.45, 'has_checklist': True, 'has_policy': True, 'has_safe_operating_procedure': False, 'has_report': False, 'has_template': True, 'has_manual': False, 'path': 'CompliSpaceAssurance/HR/Internal Grievance Register.j

### Perform similarity search

In [9]:
# Define the prompt
# The same string is the similarity-search query and the question for the QA chain.
prompt = "What constitutes a cyber security risk?"

In [10]:
# Perform vector search: the 5 closest documents with their scores
results = vector_store.similarity_search_with_score(prompt, k=5)

# NOTE(review): the score looks like a distance (lower = closer match) rather
# than a similarity, despite the "SIM" label - confirm against the store's docs.
for result, score in results:
    # ".3f" = three decimals; the previous "3f" only set a minimum width of 3
    # and printed the default six decimals.
    print(f"* [SIM={score:.3f}] [{result.metadata}]")

* [SIM=1.150778] [{'has_checklist': False, 'has_manual': True, 'has_policy': True, 'has_report': True, 'has_safe_operating_procedure': False, 'has_template': True, 'path': 'CompliSpaceAssurance/Risk Register/Macro Risk Register.csv', 'price': 85.0, 'summary': 'This document contains risk descriptions, potential consequences, control policies, strategies, and responsible reporting measures for various organizational risks, including governance, compliance, workplace safety, and cyber security.', 'title': 'Risk Management Overview and Control Strategies'}]
* [SIM=1.232305] [{'has_checklist': True, 'has_manual': False, 'has_policy': True, 'has_report': True, 'has_safe_operating_procedure': False, 'has_template': False, 'path': 'CompliSpaceAssurance/Risk Register/Hazard Register.csv', 'price': 83.45, 'summary': 'Detailed documentation of workplace health and safety risks, their potential consequences, policies, and control strategies.', 'title': 'Comprehensive Workplace Health & Safety Ris

### Prompt a model with the files as context

In [17]:
# Retriever over the vector store: asks for up to 5 documents and applies
# `similarity_threshold` as the score threshold.
similarity_threshold = 0.0
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(k=5, score_threshold=similarity_threshold),
)

In [18]:
# Question-answering chain: documents come from `retriever`, the answer from
# `model`, and the source documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=model,
    chain_type="stuff",
    return_source_documents=True,
)

In [19]:
# Invoke the model for RAG
answer = qa.invoke({"query": prompt})
string_answer = answer['result']

# Path of every retrieved source document, in the order the chain returned them
chunks = [doc.metadata['path'] for doc in answer['source_documents']]
# De-duplicate while keeping that order. list(set(...)) also de-duplicates, but
# string hashing is randomized per process, so its order changes between runs.
chunks_unique = list(dict.fromkeys(chunks))

print(f"Answer: {string_answer}\n")
print(f"Source documents: {chunks_unique}")

  self.vectorstore.similarity_search_with_relevance_scores(


Answer: A **cyber security risk** constitutes any potential threat or vulnerability that could lead to unauthorized access, disruption, misuse, or damage to information systems and the data they protect. According to the provided context, cyber security risks can include:

- **Failure to Implement Effective Policies, Systems, and Practices:** Inadequate measures to identify, prevent, detect, and respond to cyber security incidents.
- **Potential Consequences of Cyber Security Risks:** Breaches of law, data loss, system failure, business disruption, civil litigation, and reputational harm.

These risks typically involve threats to the confidentiality, integrity, and availability of information and systems.

Source documents: ['CompliSpaceAssurance/Risk Register/Whistleblower Macro Risk Register upload spreadsheet (UPGRADE ONLY).csv', 'Devonway/Electrical Safety/bp 192 Risk-Assessment-Best-Practice192.pdf', 'CompliSpaceAssurance/Risk Register/Macro Risk Register.csv', 'CompliSpaceAssuran