In [None]:
# IPython line magic supplied by Kedro's notebook extension.
# NOTE(review): presumably refreshes the Kedro session/catalog variables for this kernel — confirm against the Kedro docs.
%reload_kedro

In [6]:
import os
import random
from datetime import datetime
from pprint import pprint

import chromadb
import openai
import pandas as pd
import tiktoken
from chromadb.utils import embedding_functions
from IPython.display import Markdown, display
from llama_index import (VectorStoreIndex, ServiceContext, Document, SummaryIndex,load_index_from_storage)
from llama_index import set_global_service_context
from llama_index.callbacks import CallbackManager, LlamaDebugHandler, TokenCountingHandler
from llama_index.embeddings import OpenAIEmbedding
from llama_index.indices.postprocessor import (MetadataReplacementPostProcessor, LongContextReorder, LLMRerank, FixedRecencyPostprocessor, TimeWeightedPostprocessor, AutoPrevNextNodePostprocessor)
from llama_index.indices.query.schema import QueryBundle
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser, SentenceWindowNodeParser
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.retrievers import VectorIndexRetriever, VectorIndexAutoRetriever
from llama_index.retrievers import VectorIndexRetriever
from llama_index.schema import MetadataMode
from llama_index.storage.docstore import MongoDocumentStore
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore, MongoIndexStore
from llama_index.storage.storage_context import StorageContext
from llama_index.text_splitter import TokenTextSplitter, SentenceSplitter
from llama_index.vector_stores import ChromaVectorStore
from llama_index.vector_stores.types import MetadataInfo, VectorStoreInfo
from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters

from kedro_workbench.extras.datasets.MongoDataset import MongoDBDocs
# Callback handlers shared by the ServiceContext objects built later in this notebook:
#   - token_counter counts tokens with the gpt-3.5-turbo tokenizer
#   - llama_debug prints a trace when an operation ends (the "Trace: ..." lines in the outputs)
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode,
)
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug, token_counter])

In [9]:
# Names of the document collections; used as dictionary keys by every per-collection
# structure built in this notebook (docstores, index stores, Chroma collections, indices).
collection_names = [
    "msrc_security_update",
    "windows_10",
    "windows_11",
    "windows_update",
    "stable_channel_notes",
    "security_update_notes",
    "mobile_stable_channel_notes",
    "beta_channel_notes",
    "archive_stable_channel_notes",
    "patch_management",
]

In [10]:
# Two independent lookup tables (one per index type), each seeded with an empty-string
# placeholder for every collection. The following cells overwrite the placeholders.
index_infos = {
    index_type: dict.fromkeys(collection_names, '')
    for index_type in ('vector_index', 'summary_index')
}

In [11]:
# Free-text description of each collection's Summary Index, stored under
# index_infos['summary_index'][<collection>] (one entry per name in collection_names).
# NOTE(review): presumably these are shown to an LLM when choosing an index — confirm. If so, the
# spelling mistakes inside the strings ('secruity', 'Annoucements', 'annoucenments', 'signitures')
# may be worth correcting; they are left untouched here because they are runtime text.
index_infos['summary_index']["msrc_security_update"] = "Summary Index (sequential list of documents) version of Important announcements, updates, and fixes for microsoft secruity updates. A high priority, frequently updated collection of resources for diagnosing and resolving security vulnerabilities."
index_infos['summary_index']["windows_10"] = "Summary Index (sequential list of documents) version of Annoucements and updates related to Windows 10 issues, products and services. Contains announcements for bugs and errors as well as workarounds related to Windows 10."
index_infos['summary_index']["windows_11"] = "Summary Index (sequential list of documents) version of Annoucements and updates related to Windows 11 issues, products and services. Contains announcements for bugs and errors as well as workarounds related to Windows 11."
index_infos['summary_index']["windows_update"] = "Summary Index (sequential list of documents) version of A small collection of issues, bugs, errors, and solutions published by microsoft support. Not as in depth as other collections."
index_infos['summary_index']["stable_channel_notes"] = "Summary Index (sequential list of documents) version of Microsoft Edge Release Notes for the Stable version channel of annoucenments and updates."
index_infos['summary_index']["security_update_notes"] = "Summary Index (sequential list of documents) version of Microsoft Edge Release Notes specifically covering security updates and fixes."
index_infos['summary_index']["mobile_stable_channel_notes"] = "Summary Index (sequential list of documents) version of Microsoft Edge Release Notes for the Mobile Stable version channel of annoucenments and updates."
index_infos['summary_index']["beta_channel_notes"] = "Summary Index (sequential list of documents) version of Microsoft Edge Release Notes for the Beta version (not yet in the stable version) channel. Features and capabilities in development and open to the public for testing with known limitations or issues."
index_infos['summary_index']["archive_stable_channel_notes"] = "Summary Index (sequential list of documents) version of Microsoft Edge Release Notes for the Archive channel. Once notes are 6-12 months old or they are resolved, they get recategorized to the archive channel. Search here when 'published' date is older than 12 months."
index_infos['summary_index']["patch_management"] = "Summary Index (sequential list of documents) version of The conversations between expert microsoft system admins and architects. Text is noisy, unstructured and often contains email signitures unrelated to the topic. source material covers the widest range of topics. May contain fixes to security issues or bugs."

In [12]:
# VectorStoreInfo for the 'msrc_security_update' collection: a description of the store's content
# plus one MetadataInfo (name, type, description) per metadata field. All fields are typed "str".
# NOTE(review): the sample node metadata printed near the end of this notebook also carries keys
# such as 'category', 'keywords', 'cve_fixes', 'cve_mentions' and 'tags' that are not described
# here — confirm whether they should be listed.
index_infos['vector_index']["msrc_security_update"] = VectorStoreInfo(
        content_info="Vector Index (semantic search) for posts published by Microsoft Security Response Center (msrc) for all security vulnerabilities affecting Microsoft products and services, and provides a guide to help manage and mitigate security risks. A high priority, frequently updated collection of resources for diagnosing and resolving security vulnerabilities.",
        metadata_info=[
            MetadataInfo(
                name="source",
                type="str",
                description="The source URL of the document. A unique URL for each document.",
            ),
            MetadataInfo(
                name="published",
                type="str",
                description="The date that the document or revision was officially published at the source URL. format 'dd-mm-yyyy'",
            ),
            MetadataInfo(
                name="revision",
                type="str",
                description="These documents are updated after publication date and the revision number indicates the version of the document. All revision numbers start at 1.",
            ),
            MetadataInfo(
                name="post_id",
                type="str",
                description="All posts have a unique post_id of the format 'CVE-2023-12345' extracted from the source url. The post title and source url contain the post_id.",
            ),
            MetadataInfo(
                name="title",
                type="str",
                description="The title of the post. The title often contains the post_id",
            ),
            MetadataInfo(
                name="description",
                type="str",
                description="Some posts contain a short description.",
            ),
            MetadataInfo(
                name="collection",
                type="str",
                description="This field is a constant value that is the same for all documents in this vector store, 'msrc_security_update'.",
            ),
        ],)
# VectorStoreInfo for the 'windows_10' collection: content description plus the metadata fields
# source, published, post_id, title, description and collection (all typed "str").
index_infos['vector_index']["windows_10"] = VectorStoreInfo(
    content_info="Vector Index (semantic search) for documents published by Microsoft Support for annoucements and updates related to Windows 10 issues, products and services. Contains announcements for bugs and errors as well as workarounds related to Windows 10.",
    metadata_info=[
        MetadataInfo(
            name="source",
            type="str",
            description="The source URL of the document. A unique URL for each document. There are no tracking of revisions in this collection.",
        ),
        MetadataInfo(
            name="published",
            type="str",
            description="The date that the document was officially published at the source URL. format 'dd-mm-yyyy'",
        ),
        MetadataInfo(
            name="post_id",
            type="str",
            description="All posts have a unique post_id of the format 'kb4520412' extracted from the source url.",
        ),
        MetadataInfo(
            name="title",
            type="str",
            description="The title of the post.",
        ),
        MetadataInfo(
            name="description",
            type="str",
            description="Some posts contain a short description.",
        ),
        MetadataInfo(
            name="collection",
            type="str",
            description="This field is a constant value that is the same for all documents in this vector store, 'windows_10'.",
        ),
    ],)
# VectorStoreInfo for the 'windows_11' collection: content description plus the metadata fields
# source, published, post_id, title, description and collection (all typed "str").
# NOTE(review): post_id is described as a UUID, yet the title description says the title often
# contains the post_id — possibly copied from the MSRC entry; confirm.
index_infos['vector_index']["windows_11"] = VectorStoreInfo(
    content_info="Vector Index (semantic search) for documents published by Microsoft Support for annoucements and updates related to Windows 11 issues, products and services. Contains announcements for bugs and errors as well as workarounds related to Windows 11.",
    metadata_info=[
        MetadataInfo(
            name="source",
            type="str",
            description="The source URL of the document. A unique URL for each document. There are no tracking of revisions in this collection.",
        ),
        MetadataInfo(
            name="published",
            type="str",
            description="The date that the document was officially published at the source URL. format 'dd-mm-yyyy'",
        ),
        MetadataInfo(
            name="post_id",
            type="str",
            description="All posts have a unique post_id of the format of a UUID '87a81f8a-78fa-456e-b521-ac0560e32338' extracted from the source url.",
        ),
        MetadataInfo(
            name="title",
            type="str",
            description="The title of the post. The title often contains the post_id",
        ),
        MetadataInfo(
            name="description",
            type="str",
            description="Some posts contain a short description.",
        ),
        MetadataInfo(
            name="collection",
            type="str",
            description="This field is a constant value that is the same for all documents in this vector store, 'windows_11'.",
        ),
    ],)                                                            
# VectorStoreInfo for the 'windows_update' collection: content description plus the metadata fields
# source, published, post_id, title, description and collection (all typed "str").
# NOTE(review): post_id is described as a UUID, yet the title description says the title often
# contains the post_id — confirm. The content_info text also reads "for of a small collection".
index_infos['vector_index']["windows_update"] = VectorStoreInfo(
    content_info="Vector Index (semantic search) for documents published by Microsoft Support for of a small collection of issues, bugs, errors, and solutions. Not as in depth or comprehensive as other collections.",
    metadata_info=[
        MetadataInfo(
            name="source",
            type="str",
            description="The source URL of the document. A unique URL for each document. There are no tracking of revisions in this collection.",
        ),
        MetadataInfo(
            name="published",
            type="str",
            description="The date that the document was officially published at the source URL. format 'dd-mm-yyyy'",
        ),
        MetadataInfo(
            name="post_id",
            type="str",
            description="All posts have a unique post_id in the format of a UUID '87a81f8a-78fa-456e-b521-ac0560e32338' extracted from the source url.",
        ),
        MetadataInfo(
            name="title",
            type="str",
            description="The title of the post. The title often contains the post_id",
        ),
        MetadataInfo(
            name="description",
            type="str",
            description="Some posts contain a short description.",
        ),
        MetadataInfo(
            name="collection",
            type="str",
            description="This field is a constant value that is the same for all documents in this vector store, 'windows_update'.",
        ),
    ],)
# VectorStoreInfo for the 'stable_channel_notes' collection (Microsoft Edge Stable release notes):
# content description plus the metadata fields source, published, subject and collection.
index_infos['vector_index']["stable_channel_notes"] = VectorStoreInfo(
    content_info="Vector Index (semantic similarity between documents and query) for documents published by Microsoft specifically for Microsoft Edge Release Notes for the Stable version These notes provide information about new features and non-security updates. All the Microsoft Edge security updates are listed in Release notes for Microsoft Edge Security Updates.",
    metadata_info=[
        MetadataInfo(
            name="source",
            type="str",
            description="The source URL of the document. A unique URL for each document. There are no tracking of revisions in this collection.",
        ),
        MetadataInfo(
            name="published",
            type="str",
            description="The date that the document was officially published at the source URL. format 'dd-mm-yyyy'",
        ),
        MetadataInfo(
            name="subject",
            type="str",
            description="These posts are updates therefore their subjects are a date and version number. eg., 'Version 118.0.2088.61: October 20, 2023'.",
        ),
        MetadataInfo(
            name="collection",
            type="str",
            description="This field is a static value that is the same for all documents in this vector store, 'stable_channel_notes'.",
        ),
    ],)
# VectorStoreInfo for the 'security_update_notes' collection (Microsoft Edge security release notes):
# content description plus the metadata fields source, published, subject and collection.
index_infos['vector_index']["security_update_notes"] = VectorStoreInfo(
    content_info="Vector Index (semantic similarity between documents and query) for documents published by Microsoft for Microsoft Edge Release Notes specifically covering security updates and fixes that are mentioned in the Stable Channel documents.",
    metadata_info=[
        MetadataInfo(
            name="source",
            type="str",
            description="The source URL of the document. A unique URL for each document. There are no tracking of revisions in this collection.",
        ),
        MetadataInfo(
            name="published",
            type="str",
            description="The date that the document was officially published at the source URL. format 'dd-mm-yyyy'",
        ),
        MetadataInfo(
            name="subject",
            type="str",
            description="The subject's of these posts tend to be date strings 'October 20, 2023'.",
        ),
        MetadataInfo(
            name="collection",
            type="str",
            description="This field is a static value that is the same for all documents in this vector store, 'security_update_notes'.",
        ),
    ],)
# VectorStoreInfo for the 'mobile_stable_channel_notes' collection (Edge for Mobile Stable channel):
# content description plus the metadata fields source, published and collection.
index_infos['vector_index']["mobile_stable_channel_notes"] = VectorStoreInfo(
    content_info="Vector Index (semantic search) for documents published by Microsoft specifically for information about new features that are available to work or school accounts, and non-security updates that are included in the Microsoft Edge for Mobile Stable Channel.",
    metadata_info=[
        MetadataInfo(
            name="source",
            type="str",
            description="The source URL of the document. A unique URL for each document. There are no tracking of revisions in this collection.",
        ),
        MetadataInfo(
            name="published",
            type="str",
            description="The date that the document was officially published at the source URL. format 'dd-mm-yyyy'",
        ),
        MetadataInfo(
            name="collection",
            type="str",
            description="This field is a static value that is the same for all documents in this vector store, 'mobile_stable_channel_notes'.",
        ),
    ],)
# VectorStoreInfo for the 'beta_channel_notes' collection (Edge Beta channel notes):
# content description plus the metadata fields source, published and collection.
index_infos['vector_index']["beta_channel_notes"] = VectorStoreInfo(
    content_info="Vector Index (semantic similarity between documents and query) for documents published by Microsoft specifically for the Beta version (not yet in the stable version) channel. New Features and non-secruity updates.",
    metadata_info=[
        MetadataInfo(
            name="source",
            type="str",
            description="The source URL of the document. A unique URL for each document. There are no tracking of revisions in this collection.",
        ),
        MetadataInfo(
            name="published",
            type="str",
            description="The date that the document was officially published at the source URL. format 'dd-mm-yyyy'",
        ),
        MetadataInfo(
            name="collection",
            type="str",
            description="This field is a static value that is the same for all documents in this vector store, 'beta_channel_notes'.",
        ),
    ],)
# VectorStoreInfo for the 'archive_stable_channel_notes' collection (archived Edge release notes):
# content description plus the metadata fields source, published and collection.
index_infos['vector_index']["archive_stable_channel_notes"] = VectorStoreInfo(
    content_info="Vector Index (semantic search) for all Microsoft Edge Release Notes that have been archived. If a Microsoft Edge Stable Channel note can't be found in that collection, check this collection.",
    metadata_info=[
        MetadataInfo(
            name="source",
            type="str",
            description="The source URL of the document. A unique URL for each document. There are no tracking of revisions in this collection.",
        ),
        MetadataInfo(
            name="published",
            type="str",
            description="The date that the document was officially published at the source URL. format 'dd-mm-yyyy'",
        ),
        MetadataInfo(
            name="collection",
            type="str",
            description="This field is a static value that is the same for all documents in this vector store, 'archive_stable_channel_notes'.",
        ),
    ],)
# VectorStoreInfo for the 'patch_management' collection (google group messages): content
# description plus the metadata fields receivedDateTime, subject, topic, published and collection.
# Gotcha: unlike the other collections, 'published' is described here as the ingestion date;
# the message's own timestamp is 'receivedDateTime'.
index_infos['vector_index']["patch_management"] = VectorStoreInfo(
    content_info="Vector Index (semantic search) of the conversations posted by microsoft system administrators to the google group 'patch management'. Text is noisy, unstructured and often contains email signitures unrelated to the topic. source material covers the widest range of topics. May contain fixes or workarounds to security issues or problems with patching systems.",
    metadata_info=[
        MetadataInfo(
            name="receivedDateTime",
            type="str",
            description="The date & time that the message was published to the google group.",
        ),
        MetadataInfo(
            name="subject",
            type="str",
            description="The subject of the message posted to the google group. Many posts share the same subject to indicate they are part of the same thread. Use receivedDateTime to differentiate posts.",
        ),
        MetadataInfo(
            name="topic",
            type="str",
            description="Text extracted from the subject without message and thread tags. Example, subject = 'AW: [patchmanagement] October patches' and topic = 'October patches' ",
        ),
        MetadataInfo(
            name="published",
            type="str",
            description="The date that the document was ingested into the vector store index. format 'dd-mm-yyyy'",
        ),
        MetadataInfo(
            name="collection",
            type="str",
            description="This field is a static value that is the same for all documents in this vector store, 'patch_management'.",
        ),
    ],)

In [6]:
# Read the OpenAI API key from the environment so that no secret is stored in the notebook.
# Raises KeyError if OPENAI_API_KEY is not set, which fails fast instead of sending an empty key.
# SECURITY: a key that was ever committed in this cell must be treated as leaked and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [7]:
# Chat model configuration (gpt-3.5-turbo, temperature 0.1, max_tokens 2500).
# NOTE(review): `llm` is not passed to either ServiceContext.from_defaults call visible in this
# notebook — confirm whether it is meant to be used.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    max_tokens=2500,
)

In [8]:
# OpenAI embedding model handed to the ServiceContext; configured with embed_batch_size=50.
embed_model = OpenAIEmbedding(
    embed_batch_size=50,
)

In [9]:
# Splitter used for the Summary Index. That index is just a sequential list of documents,
# so neighbouring chunks do not need to overlap (chunk_overlap=0).
# NOTE(review): chunk sizes are measured with the "text-davinci-002" tokenizer while the token
# counter defined earlier uses "gpt-3.5-turbo" — confirm the mismatch is intended.
sentence_text_splitter = SentenceSplitter(
    chunk_size=2048,
    chunk_overlap=0,
    separator=" ",
    paragraph_separator="\n\n",
    secondary_chunking_regex="[^,.;。]+[,.;。]?",
    tokenizer=tiktoken.encoding_for_model("text-davinci-002").encode,
)

In [10]:
# Node parser built on the sentence splitter; nodes keep their metadata and prev/next links.
node_parser = SimpleNodeParser.from_defaults(
    text_splitter=sentence_text_splitter,
    include_metadata=True,
    include_prev_next_rel=True,
)

## Begin Load Summary Indices

In [11]:
# Open one MongoDB-backed document store per collection in the "report_docstore" database,
# using the namespace "summary_index_docstore_<collection>".
# The connection string (which contains the database user and password) is read from the
# MONGODB_URI environment variable instead of being written into the notebook; a KeyError is
# raised if it is missing.
# SECURITY: credentials that were ever committed in this cell must be treated as leaked and rotated.
mongo_uri = os.environ["MONGODB_URI"]
summary_index_docstores = {
    name: MongoDocumentStore.from_uri(
        uri=mongo_uri,
        db_name="report_docstore",
        namespace=f"summary_index_docstore_{name}",
    )
    for name in collection_names
}

print(f"{len(summary_index_docstores.keys())} docstores of type {type(summary_index_docstores['windows_10'])}")

10 docstores of type <class 'llama_index.storage.docstore.mongo_docstore.MongoDocumentStore'>


In [12]:
# Open one MongoDB-backed index store per collection in the "report_docstore" database,
# using the namespace "summary_index_store_<collection>".
# The connection string (which contains the database user and password) is read from the
# MONGODB_URI environment variable instead of being written into the notebook; a KeyError is
# raised if it is missing.
# SECURITY: credentials that were ever committed in this cell must be treated as leaked and rotated.
mongo_uri = os.environ["MONGODB_URI"]
summary_index_stores = {
    name: MongoIndexStore.from_uri(
        uri=mongo_uri,
        db_name="report_docstore",
        namespace=f"summary_index_store_{name}",
    )
    for name in collection_names
}

print(f"{len(summary_index_stores.keys())} index stores of type {type(summary_index_stores['windows_10'])}")

10 index stores of type <class 'llama_index.storage.index_store.mongo_index_store.MongoIndexStore'>


In [13]:
# Pair each collection's docstore with its index store in a StorageContext.
summary_index_storage_contexts = {
    name: StorageContext.from_defaults(
        docstore=summary_index_docstores[name],
        index_store=summary_index_stores[name],
    )
    for name in collection_names
}
print(f"{len(summary_index_storage_contexts.keys())} storage_contexts of type {type(summary_index_storage_contexts['windows_10'])}")

10 storage_contexts of type <class 'llama_index.storage.storage_context.StorageContext'>


In [14]:
# Service context shared by the indices: embedding model, node parser and callback handlers.
# No `llm` argument is given, so the library default LLM is used.
service_context = ServiceContext.from_defaults(
    node_parser=node_parser,
    embed_model=embed_model,
    callback_manager=callback_manager,
)

In [15]:
# Load the persisted index for every collection from its storage context.
summary_indicies = {
    name: load_index_from_storage(
        storage_context=summary_index_storage_contexts[name],
        service_context=service_context,
    )
    for name in collection_names
}

**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********


## Begin load Vector Stores

In [None]:
# Chroma embedding function backed by OpenAI's "text-embedding-ada-002" model.
# Reuses the key already assigned to openai.api_key.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-ada-002",
    api_key=openai.api_key,
)

In [94]:
# Persistent Chroma client. The database location can be overridden with the CHROMA_DB_PATH
# environment variable so the notebook runs on other machines; when the variable is unset the
# original author's local path is used, so existing behaviour is unchanged.
vectordb = chromadb.PersistentClient(
    path=os.environ.get(
        "CHROMA_DB_PATH",
        "C:/projects/technical-notes-llm-report/data/06_models/chroma_db",
    )
)

In [95]:
# Fetch (or create if absent) one Chroma collection per collection name, all sharing the
# OpenAI embedding function.
chroma_collections = {
    name: vectordb.get_or_create_collection(name=name, embedding_function=openai_ef)
    for name in collection_names
}
print(f"{len(chroma_collections.keys())} collections of type {type(chroma_collections['windows_10'])}")

10 collections of type <class 'chromadb.api.models.Collection.Collection'>


In [96]:
# Wrap every Chroma collection in a ChromaVectorStore.
vector_stores = {
    name: ChromaVectorStore(chroma_collection=chroma_collections[name])
    for name in collection_names
}
print(f"{len(vector_stores.keys())} vector_stores of type {type(vector_stores['windows_10'])}")

10 vector_stores of type <class 'llama_index.vector_stores.chroma.ChromaVectorStore'>


In [97]:
# Only needed when the Summary Index section was skipped: this builds the same
# service_context as that section does (same embed model, node parser and callbacks).
service_context = ServiceContext.from_defaults(
    node_parser=node_parser,
    embed_model=embed_model,
    callback_manager=callback_manager,
)

In [98]:
# One StorageContext per vector store. These are required when creating the indices;
# it is not clear whether they are needed when only loading them.
vector_store_storage_contexts = {
    name: StorageContext.from_defaults(vector_store=vector_stores[name])
    for name in collection_names
}
print(f"{len(vector_store_storage_contexts.keys())} storage_contexts of type {type(vector_store_storage_contexts['windows_10'])}")

10 storage_contexts of type <class 'llama_index.storage.storage_context.StorageContext'>


In [99]:
# Build a VectorStoreIndex on top of each existing vector store.
vector_store_indicies = {
    name: VectorStoreIndex.from_vector_store(
        vector_store=vector_stores[name],
        service_context=service_context,
    )
    for name in collection_names
}

**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********
**********
Trace: index_construction
**********


## Node Postprocessors

In [18]:
# Postprocessor keyed on the "window" metadata entry (the sample node metadata printed later
# in this notebook contains a 'window' key).
metadata_replace = MetadataReplacementPostProcessor(
    target_metadata_key="window",
)

In [19]:
# LongContextReorder postprocessor with default settings.
# Not referenced by any query engine in the visible cells.
long_reorder = LongContextReorder()

In [20]:
# LLM-based reranker configured with top_n=5.
# Not referenced by any query engine in the visible cells.
llm_rerank = LLMRerank(
    service_context=service_context,
    top_n=5,
)

In [21]:
# Recency postprocessor configured with top_k=3; dates are read from the "published" metadata key.
# NOTE(review): 'published' values are 'dd-mm-yyyy' strings (e.g. '11-07-2023' in the sample
# output later in this notebook) — confirm they are parsed day-first, not month-first.
fixed_recency = FixedRecencyPostprocessor(
    service_context=service_context,
    date_key="published",
    top_k=3,
)

In [22]:
# Time-weighted postprocessor (time_decay=0.99, top_k=1).
# Not referenced by any query engine in the visible cells.
time_weight = TimeWeightedPostprocessor(
    top_k=1,
    time_decay=0.99,
)

In [None]:
# Postprocessor that fetches neighbouring nodes from a docstore.
# NOTE(review): `index` is not assigned in any cell visible here, so this cell raises NameError
# on a fresh kernel unless it is defined elsewhere — confirm which index's docstore is intended.
auto_prev_next = AutoPrevNextNodePostprocessor(
  docstore=index.docstore,
  service_context=service_context,
  num_nodes=1,  # number of nodes to fetch when looking forwards or backwards
)

In [None]:
# List of postprocessors to apply; currently only the metadata replacement step.
# The query engine cell later in this notebook builds its own list instead of using this one.
node_postprocessors = [metadata_replace]

In [35]:
# Exact-match filter on the "source" metadata field, pinned to the CVE-2023-4351 page.
filters = MetadataFilters(
    filters=[
        ExactMatchFilter(
            key="source",
            value="https://msrc.microsoft.com/update-guide/vulnerability/CVE-2023-4351",
        )
    ]
)

In [69]:
# Same embedding function as the one built in the "Begin load Vector Stores" section;
# repeated here so this experiment can run without that section.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-ada-002",
    api_key=openai.api_key,
)

In [70]:
# Fetch the MSRC collection straight from the Chroma client (bypassing the
# chroma_collections dict) so it can be queried with the raw Chroma API.
collection = vectordb.get_collection(
    name='msrc_security_update',
    embedding_function=openai_ef,
)

In [71]:
# Raw Chroma query: up to 5 nearest results for the query text, restricted to documents whose
# text contains 'CVE-2023-4351'; only metadatas and distances are requested.
data = collection.query(
    query_texts='Chromium: CVE-2023-4351 Use after free in Network',
    n_results=5,
    where_document={'$contains': 'CVE-2023-4351'},
    include=['metadatas', 'distances'],
)

In [None]:
# Show the raw result of the Chroma query above.
print(data)

## Begin node postprocessor experiments
`MetadataReplacementPostProcessor` is used in every query engine because the documents were chunked with `SentenceWindowNodeParser`.

In [89]:
# Exact-match filter on the "source" metadata field, pinned to the CVE-2023-36898 page.
# The retriever defined in the next cell does not apply it.
filters = MetadataFilters(
    filters=[
        ExactMatchFilter(
            key="source",
            value="https://msrc.microsoft.com/update-guide/vulnerability/CVE-2023-36898",
        )
    ]
)

In [101]:
# Top-5 similarity retriever over the MSRC vector index, without metadata filtering.
# To restrict results to a single source URL, pass `filters=filters` as an extra argument.
retriever = VectorIndexRetriever(
    similarity_top_k=5,
    index=vector_store_indicies['msrc_security_update'],
)

In [102]:
# Query engine that runs the retriever and then applies the metadata replacement step.
query_engine = RetrieverQueryEngine(
    node_postprocessors=[metadata_replace],
    retriever=retriever,
)

In [105]:
# Ask about CVE-2023-4351. In the recorded run, the only source node shown is for
# CVE-2023-33171 and the answer reports that the CVE was not found in the context.
response = query_engine.query(
    "explain in 2 paragraphs and a bullet list 'CVE-2023-4351' including the source URL and if there is a fix available"
)

In [106]:
# Print the answer, then the metadata and text of every source node used to produce it.
print(response)
for source_node in response.source_nodes:
    retrieved = source_node.node
    print(retrieved.metadata)
    print(retrieved.get_text())

I'm sorry, but I couldn't find any information about "CVE-2023-4351" in the provided context. It is possible that the given context does not include any information about this specific CVE. I recommend checking other reliable sources or security databases for more information about CVE-2023-4351 and its fix availability.
{'revision': '1.0000000000', 'published': '11-07-2023', 'source': 'https://msrc.microsoft.com/update-guide/vulnerability/CVE-2023-33171', 'category': 'CVE', 'collection': 'msrc_security_update', 'added_to_vector_store': True, 'keywords': 'the Confidentiality, Integrity and Authentication, CVE-2023-33171 Microsoft Dynamics 365, CVSS:3.1 8.2 / 7.1 \ue946 Base, Microsoft Dynamics 365, CVE-2023-33171, Specialized, Network, \ue712 ', 'cve_fixes': '', 'cve_mentions': 'CVE-2023-33171', 'tags': '', 'window': 'The vulnerability may later be corroborated by research which suggests where the vulnerability may lie, though the research may not be certain.  Finally, a vulnerability 

In [None]:
#pprint(f"metamode embed\n{nodes[0].get_content(metadata_mode=MetadataMode.EMBED)}\n")

## VectorStoreAutoRetriever
Auto Retrievers use the LLM to decide which data to use within a vector store to improve the quality of the response

A `VectorStoreInfo` instance must be provided to the auto retriever.

In [None]:
# Auto-retriever over the MSRC vector index.
# Fixes: the class is imported as `VectorIndexAutoRetriever` (capital R); the previous spelling
# raised NameError. `index` and `vector_store_info` were never assigned in the visible cells, so
# the MSRC index and its VectorStoreInfo built earlier in this notebook are used instead.
# NOTE(review): swap the collection key if a different collection was intended.
retriever = VectorIndexAutoRetriever(
    vector_store_indicies['msrc_security_update'],
    vector_store_info=index_infos['vector_index']['msrc_security_update'],
)