# In [None]:
import zipfile
import os
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import JSONLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveJsonSplitter
import json

from langchain.embeddings import HuggingFaceHubEmbeddings

# Step 1: Unzip the archive into a local working folder.
with zipfile.ZipFile('ezyzip-new.zip', 'r') as zip_ref:
    zip_ref.extractall('extracted_folder')

# Step 2: Set up the embedding model (loaded from a local directory, run on CPU).
modelPath = "./all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Define a JSON splitter.
# NOTE(review): not referenced anywhere in the visible part of this script;
# kept because later cells/code may rely on it.
json_splitter = RecursiveJsonSplitter(
    max_chunk_size=500,
)

# Step 3: Initialize Pinecone
index_name = "csye7125-project"
namespace = "default"

# SECURITY: the API key must be supplied through the PINECONE_API_KEY
# environment variable and never written into source. A key that was
# previously committed in this file must be treated as compromised and
# rotated in the Pinecone console.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY environment variable is not set; "
        "export it before running this script."
    )
pinecone = Pinecone(api_key=pinecone_api_key)

# Create the index if it does not exist yet.
existing_indexes = [index.name for index in pinecone.list_indexes()]

if index_name not in existing_indexes:
    # dimension=384 is intended to match the embedding size of the
    # all-MiniLM-L6-v2 model configured above.
    pinecone.create_index(name=index_name, dimension=384, metric="cosine",
                          spec=ServerlessSpec(cloud="aws", region="us-east-1"))
    print(f"Index {index_name} created.")

index = pinecone.Index(index_name)

# Vector store wrapper used below to embed documents and upsert them into
# the chosen index/namespace.
db = PineconeVectorStore(
    index=index,
    namespace=namespace,
    embedding=embeddings
)

class CustomJSONLoader(JSONLoader):
    """JSONLoader that skips unloadable files and strips the ``source`` metadata key."""

    def load(self):
        """Load documents from ``self.file_path``.

        Returns:
            The documents produced by the parent loader, each with any
            ``source`` entry removed from its metadata. If the parent loader
            raises, the error is printed and an empty list is returned so the
            caller can move on to the next file.
        """
        try:
            loaded = super().load()
        except Exception as e:
            # Best effort: one bad file must not abort the whole ingest run.
            print(f"Error loading file {self.file_path}: {e}. Skipping this file.")
            return []

        for document in loaded:
            # Drop the 'source' key (if any) from the document metadata.
            document.metadata.pop('source', None)
        return loaded

# Step 4: Visit every .json file under the extraction folder and ingest it.
for dirpath, _subdirs, filenames in os.walk('extracted_folder'):
    # Lazily build the full paths of the JSON files in this directory.
    json_paths = (
        os.path.join(dirpath, name)
        for name in filenames
        if name.endswith('.json')
    )
    for file_path in json_paths:
        # Project each record down to the three fields named in the jq filter.
        loader = CustomJSONLoader(
            file_path=file_path,
            jq_schema='''
                {
                  "cveMetadata": .cveMetadata,
                  "Descriptions": .containers.cna.descriptions,
                  "Metrics": .containers.cna.metrics
                }
                ''',
            text_content=False
        )
        json_data = loader.load()
        if not json_data:
            # Loader failed (and already reported it) or produced nothing.
            continue

        print(f"Processing file: {file_path}")
        print(json_data)

        # Step 5: Embed the loaded documents and store them in Pinecone.
        db.add_documents(documents=json_data)

# At this point every file that loaded without errors has been sent to the
# Pinecone vector store.
