In [None]:
# Smoke-test cell: prints a fixed string (presumably to confirm the kernel runs).
print("Yes!")

In [None]:
pip install pinecone-client openai langchain PyPDF2

In [None]:
import os
from typing import List
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
import json
import hashlib

In [23]:
class PdfToPineconeProcessor:
    """
    Pipeline that turns PDF files into embedded text chunks stored in Pinecone.

    Each PDF is read with PyPDF2, split into overlapping chunks, embedded via
    the configured OpenAI embeddings client, and upserted into a Pinecone
    index together with per-chunk metadata (source file, chunk index, text).
    """

    def __init__(self, openai_api_key: str, pinecone_api_key: str, index_name: str):
        """
        Initialize the processor with necessary API keys and configurations.
        
        Args:
            openai_api_key (str): OpenAI API key for generating embeddings
            pinecone_api_key (str): Pinecone API key for vector database
            index_name (str): Name of the Pinecone index to use
        """
        self.openai_api_key = openai_api_key
        self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        self.pc = Pinecone(api_key=pinecone_api_key)
        self.index_name = index_name
        
        # Initialize text splitter with specific parameters: 1000-character
        # chunks that overlap by 200 characters, trying paragraph, line and
        # word boundaries (in that order) before splitting mid-word.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def create_index_if_not_exists(
        self,
        dimension: int = 1536,
        cloud: str = 'aws',
        region: str = 'us-west-2',
    ):
        """
        Create a Pinecone index if it doesn't already exist.
        
        Args:
            dimension (int): Dimension of the vectors (1536 for OpenAI embeddings)
            cloud (str): Cloud provider for the serverless index
            region (str): Cloud region for the serverless index
        """
        # Nothing to do when an index with this name is already present.
        if self.index_name in self.pc.list_indexes().names():
            return

        self.pc.create_index(
            name=self.index_name,
            dimension=dimension,
            metric='cosine',
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text content from a PDF file.
        
        Args:
            pdf_path (str): Path to the PDF file
            
        Returns:
            str: Text of all pages concatenated in page order
        """
        reader = PdfReader(pdf_path)
        # A falsy extraction result (e.g. None or "") is treated as an empty
        # page so the concatenation cannot raise TypeError; join() also avoids
        # repeated string reallocation on long documents.
        return "".join(page.extract_text() or "" for page in reader.pages)

    def split_text(self, text: str) -> List[str]:
        """
        Split text into smaller chunks for processing.
        
        Args:
            text (str): Input text to split
            
        Returns:
            List[str]: List of text chunks
        """
        return self.text_splitter.split_text(text)

    def generate_document_metadata(self, pdf_path: str) -> dict:
        """
        Generate metadata for the document.
        
        Args:
            pdf_path (str): Path to the PDF file
            
        Returns:
            dict: Document metadata with the keys "source" (file name only),
                "file_path" (the path as given) and "type" (always "pdf")
        """
        return {
            "source": os.path.basename(pdf_path),
            "file_path": pdf_path,
            "type": "pdf"
        }

    def process_and_upload(self, pdf_paths: List[str], batch_size: int = 100):
        """
        Process PDFs and upload their embeddings to Pinecone.

        The index is created first if it is missing. For every PDF the text is
        extracted and chunked; chunks are then embedded and upserted in
        batches so that each batch costs one embedding request and one upsert
        request instead of one embedding request per chunk.
        
        Args:
            pdf_paths (List[str]): List of paths to PDF files
            batch_size (int): Maximum number of chunks embedded and upserted
                per request (defaults to 100)

        Raises:
            ValueError: If batch_size is smaller than 1
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Ensure index exists
        self.create_index_if_not_exists()
        index = self.pc.Index(self.index_name)
        
        for pdf_path in pdf_paths:
            # Extract text from PDF and cut it into chunks
            text = self.extract_text_from_pdf(pdf_path)
            chunks = self.split_text(text)
            
            # Metadata shared by every chunk of this document
            doc_metadata = self.generate_document_metadata(pdf_path)
            
            # A PDF without extractable text yields no chunks, so the loop
            # body is skipped and nothing is uploaded for it.
            for start in range(0, len(chunks), batch_size):
                batch = chunks[start:start + batch_size]

                # One embedding request for the whole batch
                batch_embeddings = self.embeddings.embed_documents(batch)

                vectors_to_upsert = [
                    {
                        # Deterministic ID from path + chunk position, so
                        # re-processing the same file overwrites its vectors.
                        "id": hashlib.md5(f"{pdf_path}_{i}".encode()).hexdigest(),
                        "values": embedding,
                        # Combine document metadata with chunk-specific metadata
                        "metadata": {
                            **doc_metadata,
                            "chunk_index": i,
                            "text": chunk
                        }
                    }
                    for i, (chunk, embedding) in enumerate(
                        zip(batch, batch_embeddings), start=start
                    )
                ]

                index.upsert(vectors=vectors_to_upsert)

In [24]:
from dotenv import load_dotenv

# Let python-dotenv populate the process environment (from a .env file, if
# one is found) before the keys are looked up below.
load_dotenv()

# `os` is imported in the notebook's earlier import cell. Either value is
# None when the corresponding variable is not set.
open_api_key, pinecone_api_key = (
    os.environ.get(var_name)
    for var_name in ("OPENAI_API_KEY", "PINECONE_API_KEY")
)

In [None]:
# Confirm that both keys were loaded WITHOUT echoing any key material:
# notebook output is saved with the file, so printing a key prefix leaks a
# large part of the secret. Reporting only presence also avoids the TypeError
# that slicing would raise when a key is missing (None).
for key_label, key_value in (
    ("OPENAI_API_KEY", open_api_key),
    ("PINECONE_API_KEY", pinecone_api_key),
):
    print(f"{key_label}: {'set' if key_value else 'MISSING'}")

In [None]:
# PDFs to ingest; the paths are relative to the current working directory.
pdf_paths = [
    "./paper/2023_IDETC_paper_Final_v2.pdf",
    "./paper/s00158-023-03553-5.pdf",
]

# Alternative index name kept for reference: "research-papers"
processor = PdfToPineconeProcessor(
    index_name="research-paper-on-vehicle-rag-index",
    pinecone_api_key=pinecone_api_key,
    openai_api_key=open_api_key,
)

# Extract, chunk, embed and upsert every listed PDF.
processor.process_and_upload(pdf_paths)