In [13]:
# Read settings from the .env file located by find_dotenv(); override=True lets
# values from that file replace variables already set in the process environment.
import os
import re
from dotenv import find_dotenv, load_dotenv

load_dotenv(dotenv_path=find_dotenv(), override=True)

True

In [None]:
# Check api keys (leave commented out: printing them would save the secret values in the notebook output)
# print(os.getenv('OPENAI_API_KEY'))
# print(os.getenv('PINECONE_ENVIRONMENT'))
# print(os.getenv('PINECONE_API_KEY'))

In [14]:
# Embedding model handed to the vector store when documents are uploaded
from langchain.embeddings import OpenAIEmbeddings

EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
embeddings_model = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

In [15]:
# Connect the Pinecone client using the API key and environment name taken
# from the process environment
import pinecone
import os
from langchain.vectorstores import Pinecone

pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENVIRONMENT'),
)

# Last expression of the cell: shows the WhoAmI response as the cell output
pinecone.whoami()


WhoAmIResponse(username=None, user_label=None, projectname='962e1db')

In [16]:
# Handle to the existing Pinecone index, looked up by name
index_name = "feadocs"
index = pinecone.Index(index_name)

# Uncomment to clear the index first, then upload:
# index.delete(delete_all=True)

In [17]:
# Import parsers

from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [19]:
# Pinecone console for this project. NOTE: the URL below names the
# "langchain-quickstart" index, while the uploads in this cell go to `index_name`.
# https://app.pinecone.io/organizations/-Nam3zmbSmzuXKeH8EWl/projects/us-west1-gcp-free:32467cc/indexes/langchain-quickstart
import glob

CHUNK_SIZE = 1500      # chunk_size passed to the text splitter
CHUNK_OVERLAP = 0      # chunk_overlap passed to the text splitter


def clean_page_text(raw_text):
    """Tidy up the text of one chunk extracted from a PDF.

    Steps, applied in this order:
      1. Re-join words that were hyphenated across a line break ("ele-\\nment").
      2. Strip leading/trailing whitespace, then replace with a space every
         newline that is neither preceded by newline+whitespace nor followed
         by whitespace+newline.
      3. Collapse each newline / optional whitespace / newline run into
         exactly two newlines.

    Args:
        raw_text: Chunk text as produced by the loader and splitter.

    Returns:
        The cleaned text as a new string.
    """
    # Merge hyphenated words
    dehyphenated = re.sub(r"(\w+)-\n(\w+)", r"\1\2", raw_text)
    # Fix newlines in the middle of sentences
    unwrapped = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", dehyphenated.strip())
    # Remove multiple newlines
    return re.sub(r"\n\s*\n", "\n\n", unwrapped)


data_folder='../data_fea/'
# Only get the PDFs in the directory; sorted so the processing order is stable between runs
docs = sorted(glob.glob(os.path.join(data_folder, '*.pdf')))

if not docs:
    # Make a wrong path / empty folder visible instead of silently doing nothing
    print('No PDF files found in '+data_folder)

for doc in docs:
    print('Parsing: '+doc)
    loader = PyPDFLoader(doc)
    # load() returns the pages un-split, so the splitter below is the only
    # chunking step. load_and_split() would chunk first with the loader's
    # default splitter settings and the pages would then be split a second time.
    data = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    texts = text_splitter.split_documents(data)

    for text in texts:
        text.metadata['source']=os.path.basename(text.metadata['source'])   # Strip path
        text.metadata['page']=text.metadata['page']+1   # Pages are 0 based, update
        text.page_content = clean_page_text(text.page_content)

    if not texts:
        # Nothing to embed for this file, so skip the upload call
        print('No text chunks produced, skipping upload: '+doc)
        continue

    # NOTE(review): re-running this cell presumably uploads the same chunks again
    # under new IDs unless the index is cleared first - confirm before re-running.
    print('Uploading to pinecone index '+index_name)
    vectorstore = Pinecone.from_documents(texts, embeddings_model, index_name=index_name)