In [None]:
!pip install langchain langchain_community langchain_milvus sentence-transformers

In [3]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import CSVLoader

# Input file and chunking parameters, named here so they are easy to find and tune
CVE_CSV_PATH = "/data/cve_data/csv/originalallitems.csv"
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200

# Create a CSVLoader instance to load documents from the local CVE CSV file
loader = CSVLoader(CVE_CSV_PATH)

# Load the CSV contents as LangChain documents
documents = loader.load()

# Initialize a RecursiveCharacterTextSplitter for splitting text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

# Split the documents into chunks using the text_splitter
docs = text_splitter.split_documents(documents)

# Let's take a look at the second chunk (index 1)
docs[1]

Document(metadata={'source': '/data/cve_data/csv/originalallitems.csv', 'row': 1}, page_content='Name: CVE-1999-0002\nStatus: Entry\nDescription: Buffer overflow in NFS mountd gives root access to remote attackers, mostly in Linux systems.\nReferences: BID:121   |   URL:http://www.securityfocus.com/bid/121   |   CERT:CA-98.12.mountd   |   CIAC:J-006   |   URL:http://www.ciac.org/ciac/bulletins/j-006.shtml   |   SGI:19981006-01-I   |   URL:ftp://patches.sgi.com/support/free/security/advisories/19981006-01-I   |   XF:linux-mountd-bo\nPhase: \nVotes: \nComments:')

In [4]:
from langchain_milvus import Milvus, Zilliz
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Embedding model applied to every chunk before it is stored.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Index all chunks into the Milvus database at ./milvus_demo.db.
# drop_old=True discards a previously existing collection, so re-running this cell
# rebuilds the index from scratch.
vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embedding_function,
    connection_args={"uri": "./milvus_demo.db"},
    drop_old=True,
)


In [6]:
from langchain_milvus import Milvus, Zilliz
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# Connect to ./milvus_demo.db through the Milvus wrapper. No documents are passed,
# so nothing is inserted by this cell.
# NOTE(review): the embedding function is rebuilt here as well — presumably so this cell
# can be run on its own against an already-populated database; confirm that is the intent.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = Milvus(
    embedding_function=embedding_function,
    connection_args={"uri": "./milvus_demo.db"},
)



In [7]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama

# Prompt
# The template embeds chat special tokens (<|start_header_id|>, <|eot_id|>, ...) and has
# exactly two placeholders, {question} and {context}; input_variables must list those
# same two names.
# NOTE(review): "<question>" in the system message is literal text, not a template
# variable — confirm whether "{question}" was intended there.
prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Provide the list of vulnerabilities associated with the CPEs: <question>.
If you don't know, just say that you don't know, don't try to make up an answer. <|eot_id|><|start_header_id|>user<|end_header_id|>
    
    Question: {question} 
    Context: {context} 
    
    The response should be specific and use statistics or numbers when possible.
    
    Answer: <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question", "context"],
)

# Chat model: the "llama3" model through ChatOllama, with temperature set to 0
local_llm = "llama3"
llm = ChatOllama(model=local_llm, temperature=0)


# Post-processing
def format_docs(docs):
    """Concatenate the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line ("\\n\\n"); an empty
        string when ``docs`` yields nothing.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


# Expose the vector store through the retriever interface consumed by the RAG chain.
# (The chat model `llm` is already created earlier in this cell; the stray, indented
# duplicate assignment that used to follow has been removed.)
retriever = vectorstore.as_retriever()


In [8]:
# Define the RAG (Retrieval-Augmented Generation) chain for AI response generation
from langchain_core.runnables import RunnablePassthrough
# Assemble the retrieval-augmented generation pipeline:
#   1. the incoming query is passed through unchanged as "question" and is also sent to
#      the retriever, whose documents format_docs joins into the "context" string;
#   2. both values fill the prompt template;
#   3. the LLM answers and StrOutputParser parses the reply into the final output.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Uncomment to print an ASCII diagram of the chain:
# rag_chain.get_graph().print_ascii()

# Run the chain for a single CPE string and display the model's answer
query = "cpe:2.3:h:*:5400rmp_oem_harddrive:-:*:*:*:*:*:*:*"
res = rag_chain.invoke(query)
res


'Based on the provided CPEs, I can provide you with a list of vulnerabilities associated with them:\n\n1. cpe:2.3:h:*:5400rmp_oem_harddrive:-:*:*:*:*:*:*:*\n\t* CVE-2024-33218: A privilege escalation vulnerability in the AsUpIO64.sys component of ASUS USB 3.0 Boost Storage Driver 5.30.20.0, allowing attackers to execute arbitrary code via crafted IOCTL requests.\n\nStatistics:\n\n* Number of vulnerabilities: 1\n* Severity: High (privilege escalation and arbitrary code execution)\n\n2. cpe:2.3:h:*:5400rmp_oem_harddrive:-:*:*:*:*:*:*:*\n\t* CVE-2020-28419: A vulnerability in certain driver software or application packages, allowing arbitrary code execution during installation.\n\nStatistics:\n\n* Number of vulnerabilities: 1\n* Severity: High (arbitrary code execution)\n\n3. cpe:2.3:h:*:5400rmp_oem_harddrive:-:*:*:*:*:*:*:*\n\t* CVE-2020-27339: A vulnerability in InsydeH2O 5.x, allowing callers to corrupt firmware or OS memory by not validating CommBuffer and CommBufferSize parameters.\n