### Package imports

In [1]:
from langchain_chroma import Chroma
from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.documents import Document
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQA
from utils.utils import pdf_bytes_to_text, clean_json_string, write_to_db
from dotenv import load_dotenv
import boto3
import json
import os
import uuid

### Configurations

In [2]:
load_dotenv()

True

### Create a collection

In [3]:
# Define the embedding model
embeddings = AzureOpenAIEmbeddings(
    model=os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION")
)

In [4]:
# Create the Chroma vector store
vector_store = Chroma(
    collection_name="test_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

### Store data in the collection

In [5]:
# Connect to S3
bucket_name = "thecrewo6newdocs"
s3 = boto3.resource("s3")
bucket = s3.Bucket(bucket_name)

In [6]:
# Define the system instructions for metadata tagging
with open("system_instructions/s3_metadata_tagging.txt", "r") as f:
    system_instructions = f.read()

In [7]:
# Connect to the Azure OpenAI GPT model
model = AzureChatOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
)

In [8]:
# Go through each document in the bucket
for obj in bucket.objects.all():
    print(f"Now working on: {obj.key}")
    obj_contents = obj.get()['Body'].read()

    if obj.key.split('.')[-1] == 'pdf':
        obj_contents = pdf_bytes_to_text(obj_contents)
    else:
        obj_contents = str(obj_contents)

    # Create the message for the LLM
    messages = [
        ("system", system_instructions),
        ("human", obj_contents),
    ]

    # Invoke the conversation
    response = model.invoke(messages)
    
    # Clean and update the JSON response
    response_clean = clean_json_string(response.content)
    response_json = json.loads(response_clean)
    response_json["path"] = obj.key
    print(response_json)

    # Create a document object
    document = Document(
        page_content=obj_contents,
        metadata=response_json,
    )

    # Store in the db
    doc_id = str(uuid.uuid4())
    vector_store.add_documents([document], ids=[doc_id])

    # Write data to a JSON file just for easy viewing
    write_to_db(response_json)

Now working on: CompliSpaceAssurance/Cyber Security/Cyber Security/Cyber Security Audit Checklist.json
{'title': 'Cyber Security Audit Checklist', 'summary': "A comprehensive checklist designed to assess and improve an organization's cyber security framework, including policies, risk management, incident response, and compliance.", 'price': 86.5, 'category': ['checklist'], 'path': 'CompliSpaceAssurance/Cyber Security/Cyber Security/Cyber Security Audit Checklist.json'}


ValueError: Expected metadata value to be a str, int, float or bool, got ['checklist'] which is a list in upsert.

Try filtering complex metadata from the document using langchain_community.vectorstores.utils.filter_complex_metadata.

### Perform similarity search

In [None]:
# Define the prompt
prompt = "What constitutes a cyber security risk?"

In [None]:
# Perform vector search
results = vector_store.similarity_search_with_score(prompt, k=5)

for result, score in results:
    print(f"* [SIM={score:3f}] [{result.metadata}]")

* [SIM=0.314717] [{'path': 'CompliSpaceAssurance/Cyber Security/Cyber Security/Risk Register/4 Cyber Security Risk Register Upload Spreadsheet.csv', 'price': 78.99, 'summary': 'A comprehensive guide to understanding various cyber security risks, their potential consequences, and effective management strategies.', 'title': 'Cyber Security Risks and Management'}]
* [SIM=0.434820] [{'path': 'CompliSpaceAssurance/Cyber Security/Cyber Security/Risk Register/3 Cyber Security Risk Register Order of Fields.pdf', 'price': 83.49, 'summary': 'This document outlines key risks, their descriptions, potential consequences, classifications, and control strategies within an organization. It also details responsible persons and business units involved in managing these risks.', 'title': 'Risk Management Document'}]
* [SIM=0.514152] [{'path': 'CompliSpaceAssurance/Cyber Security/Cyber Security/Risk Register/Cyber Security Possible Risk Controls and Treatments (1).docx', 'price': 72.99, 'summary': 'This i

### Prompt a model with the files as context

In [None]:
# Define the retriever function
similarity_threshold = 0.5
retriever = vector_store.as_retriever(search_type="similarity_score_threshold",
                                      search_kwargs={"k": 5, "score_threshold": similarity_threshold})

In [None]:
qa = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff", 
    retriever=retriever, 
    return_source_documents=True 
)

In [None]:
# Invoke the model for RAG
answer = qa.invoke({"query": prompt})
string_answer = answer['result']

chunks= []
for i in range(len(answer['source_documents'])):
    chunks.append(answer['source_documents'][i].metadata['path'])
chunks_unique = list(set(chunks))

print(f"Answer: {string_answer}\n")
print(f"Source documents: {chunks_unique}")

Answer: A cyber security risk generally refers to the potential for unauthorized access, exposure, manipulation, or destruction of information and systems within an organization. Here are some key components that constitute a cyber security risk:

1. **Unauthorized Access:** Gaining access to systems, networks, or data without permission, which can lead to data breaches and theft of sensitive information.

2. **Data Exposure:** Sensitive or personal information is accessed, disclosed, or exposed to unauthorized people, leading to data breaches and potentially compromising intellectual property or personal information.

3. **Data Manipulation:** Unauthorized changes or corruption of data that can affect the integrity and reliability of the information systems.

4. **Data Destruction:** Intentional deletion or destruction of data, which can disrupt business operations and lead to significant financial and reputational damage.

5. **Malicious Software (Malware):** Installation of maliciou