In [5]:
import torch
import transformers
import sentence_transformers
import langchain

# Report the installed version of every core dependency so that results can be
# reproduced against the same library versions.
version_report = (
    ("PyTorch version:", torch.__version__),
    ("Transformers version:", transformers.__version__),
    ("Sentence Transformers version:", sentence_transformers.__version__),
    ("LangChain version:", langchain.__version__),
)
for label, version in version_report:
    print(label, version)

PyTorch version: 2.4.0
Transformers version: 4.44.0
Sentence Transformers version: 3.0.1
LangChain version: 0.2.14


In [13]:
import os
import json
from langchain.schema import Document

# Directory where the CVE JSON files are located. Override it without editing the
# notebook by setting the CVE_DATA_PATH environment variable; the default below is
# the original hard-coded location.
cve_data_path = os.environ.get('CVE_DATA_PATH', 'F:\\cvelistV5-main\\cvelistV5-main\\cves\\1999')


def extract_cve_fields(cve_content):
    """Return only the fields of a parsed CVE record that are kept for indexing.

    Args:
        cve_content: Dict parsed from one CVE JSON file.

    Returns:
        Dict with the keys 'cveId', 'datePublished', 'dateUpdated' (read from
        'cveMetadata', defaulting to 'N/A') and 'descriptions', 'affected'
        (read from 'containers' -> 'cna', defaulting to an empty list).
    """
    cve_metadata = cve_content.get('cveMetadata', {})
    containers = cve_content.get('containers', {}).get('cna', {})
    return {
        'cveId': cve_metadata.get('cveId', 'N/A'),
        'datePublished': cve_metadata.get('datePublished', 'N/A'),
        'dateUpdated': cve_metadata.get('dateUpdated', 'N/A'),
        'descriptions': containers.get('descriptions', []),
        'affected': containers.get('affected', []),
    }


def load_cve_documents(directory):
    """Build one Document per '.json' file found under ``directory``.

    Directories and files are visited in sorted order so the resulting list (and
    any positional IDs derived from it) is the same on every run and platform.

    Args:
        directory: Root directory that is walked recursively.

    Returns:
        List of Document objects whose page_content is the JSON-serialised output
        of ``extract_cve_fields`` and whose metadata holds the file name under
        the 'source' key.

    Raises:
        json.JSONDecodeError: If a '.json' file does not contain valid JSON.
    """
    documents = []
    for root, dirs, files in os.walk(directory):
        dirs.sort()  # sorted in place so os.walk descends in a stable order
        for file in sorted(files):
            if not file.endswith('.json'):
                continue
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                cve_content = json.load(f)
            relevant_data = extract_cve_fields(cve_content)
            # Create Document with page_content and metadata
            documents.append(Document(page_content=json.dumps(relevant_data), metadata={"source": file}))
    return documents


cve_documents = load_cve_documents(cve_data_path)

data = cve_documents

# Print the number of documents loaded
print(len(data))

1579


In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded CVE documents into chunks of at most 1000 characters so each
# piece stays within the embedding model's sequence-length limit.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
)
docs = text_splitter.split_documents(data)

chunk_count = len(docs)
print("Total number of chunks:", chunk_count)

Total number of chunks: 1579


In [15]:
from pinecone import Pinecone
from langchain_huggingface import HuggingFaceEmbeddings

import os

# Initialize Pinecone with the API key taken from the environment, so the secret
# is never stored in (or committed with) the notebook.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError("Set the PINECONE_API_KEY environment variable before running this cell.")
pc = Pinecone(api_key=pinecone_api_key)

# Connect to the existing Pinecone index
index = pc.Index("cve-index")

# All commented out portions are one time operations - like creating a Pinecone index, upserting the vector embeddings into the index
# (pc.create_index below additionally needs: from pinecone import ServerlessSpec)

# pc.create_index(
#     name='cve-index',
#     dimension=768,
#     metric='euclidean',
#     deletion_protection='enabled',
#     spec=ServerlessSpec(
#         cloud='aws',
#         region='us-east-1'
#     )
# )

# Testing connection to index by listing indexes - output: cve-index
for idx in pc.list_indexes():
    print(idx['name'])

# Initialize the embeddings model - using sentence-transformers from Transformers library of HuggingFace.
# The model name is the library default, pinned explicitly so the embedding size keeps
# matching the 768-dimension index even if the default changes in a later release.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Use only a smaller subset of documents for testing
# small_docs = docs[:20]

# Convert the small subset of documents into embeddings and upsert them into the Pinecone index
# vectors = [embeddings.embed_query(doc.page_content) for doc in small_docs]
# upsert_data = [(str(i), vectors[i], {"text": doc.page_content, **doc.metadata}) for i, doc in enumerate(small_docs)]

# Upsert the vectors into Pinecone
# upsert_response = index.upsert(
#     vectors=upsert_data,
#     # namespace="cve-namespace"
# )

# Function to answer any question using Pinecone index
def answer_question(question):
    """Print the metadata of the indexed document most similar to ``question``.

    The question is embedded with the module-level ``embeddings`` model and the
    module-level Pinecone ``index`` is queried for its five nearest vectors; only
    the metadata of the first match is printed. Nothing is returned.

    Note: a later cell defines another function with this same name, which
    replaces this one once that cell has been executed.

    Args:
        question: Free-text query string.
    """
    # Convert the question into an embedding
    question_embedding = embeddings.embed_query(question)

    # Perform a similarity search in the Pinecone index
    query_response = index.query(
        vector=question_embedding,
        top_k=5,  # Adjust top_k to the number of results you want
        include_values=False,  # the stored vectors are never used here, so do not transfer them
        include_metadata=True
    )

    matches = query_response['matches']
    if not matches:
        # An empty index or namespace returns no matches; report it instead of
        # failing with an IndexError on matches[0].
        print("No relevant document found.")
        return

    # Print the most relevant document (or chunk) found
    print("Top relevant document:", matches[0]['metadata'])

# Example interaction: run a sample query through answer_question, which prints
# the metadata of the closest match in the Pinecone index.
question = "via dtappgather program in CD"
answer_question(question)

cve-index
Top relevant document: {'source': 'CVE-1999-0014.json'}


In [18]:
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceHubEmbeddings
from langchain_huggingface import HuggingFaceEndpointEmbeddings

import os

# Initialize the LLM with Groq API. The key is read from the environment so the
# secret is never stored in (or committed with) the notebook.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before running this cell.")

llm = ChatGroq(
    model="llama3-70b-8192",
    temperature=0,  # deterministic answers
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=groq_api_key
)

# Define a prompt template; {context} receives the retrieved documents and
# {question} the user's original question.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you do not know the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Create a retriever using the Pinecone vector store - it fetches the relevant documents based on similarity search.
db = PineconeVectorStore(
    index=index,
    embedding=embeddings
)
retriever = db.as_retriever()

# Create a RetrievalQA chain; the "stuff" chain type places all retrieved
# documents into a single prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

# Function to generate a response using the LLM
def answer_question(prompt):
    """Answer ``prompt`` with the retrieval-augmented QA chain.

    Calling ``qa_chain.invoke(prompt)`` embeds the prompt inside the RetrievalQA
    chain, lets the retriever run a similarity search in the Pinecone index to
    collect matching documents (the 'context'), merges that context with the
    original prompt through the QA_CHAIN_PROMPT template, and sends the combined
    prompt to the LLM.

    Args:
        prompt: The question to answer.

    Returns:
        Whatever ``qa_chain.invoke`` returns on success; if any exception is
        raised, the string "Error: <exception message>" instead.
    """
    try:
        return qa_chain.invoke(prompt)
    except Exception as exc:
        # Report the failure as text instead of propagating it to the caller.
        return f"Error: {exc}"

# Example interaction: pass a sample question to answer_question and print
# whatever it returns (the chain's result, or an "Error: ..." string on failure).
print(answer_question("What is the vulnerability related to dtappgather program in CD?"))

{'query': 'What is the vulnerability related to dtappgather program in CD?', 'result': 'According to the provided context, the vulnerability related to the dtappgather program in CDE is unauthorized privileged access or denial of service.'}
