In [2]:
# Import necessary libraries
import os

import openai
import pinecone
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone

In [3]:
# Configure the baseline configuration of the OpenAI library for Azure OpenAI Service & Pinecone.
#
# Secrets and resource-specific values are read from environment variables so that real
# keys never have to be saved inside the notebook. The placeholder strings are only
# fallbacks: either export the variables before starting the kernel or replace the
# placeholders locally (and do not commit the result).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "PLEASE_ENTER_YOUR_OWNED_AOAI_SERVICE_KEY")
OPENAI_API_BASE = os.environ.get(
    "OPENAI_API_BASE",
    "https://PLESAE_ENTER_YOUR_OWNED_AOAI_RESOURCE_NAME.openai.azure.com/",
)
OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.environ.get(
    "OPENAI_EMBEDDING_DEPLOYMENT_NAME",
    "PLEASE_ENTER_YOUR_OWNED_AOAI_EMBEDDING_MODEL_NAME",
)

# Non-secret settings stay as plain constants.
OPENAI_EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
OPENAI_API_VERSION = "2023-05-15"
OPENAI_API_TYPE = "azure"

# Apply the settings to the module-level configuration of the openai package.
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_API_BASE
openai.api_version = OPENAI_API_VERSION
openai.api_type = OPENAI_API_TYPE

# Pinecone connection settings (same environment-variable-with-fallback scheme).
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "PLEASE_ENTER_YOUR_OWNED_PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV", "PLEASE_ENTER_YOUR_OWNED_PINECONE_ENV_NAME")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME", "PLEASE_ENTER_YOUR_OWNED_PINECONE_INDEX_NAME")

In [4]:
# Build the directory loader for the local ./data/ folder.
# The documents themselves are read later, when .load() is called on it.
pdfLoader = DirectoryLoader(path='./data/')

In [6]:
# Run the directory loader and collect everything it returns into a fresh list.
documents = list(pdfLoader.load())

In [7]:
# Report how many documents were loaded from the data folder.
print(f'You have {len(documents)} document(s) in your data folder.')

You have 1 document(s) in your data folder.


In [8]:
# Break the loaded documents into chunks: target size 1000 characters, 200 characters of overlap.
# The splitter may emit a chunk longer than chunk_size and logs a warning when it does.
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)

# NOTE: `documents` is rebound here from the loaded documents to the resulting chunks.
documents = text_splitter.split_documents(documents)

# Show how many chunks were produced.
print(len(documents))

Created a chunk of size 1164, which is longer than the specified 1000


203


In [9]:
# Create the LangChain embeddings object that targets the Azure OpenAI embedding deployment.
# chunk_size=1 limits each embedding request to a single text -- presumably to satisfy
# an Azure batch-size limit; confirm against the service/API version in use.
embeddings = OpenAIEmbeddings(
    model=OPENAI_EMBEDDING_MODEL_NAME,
    deployment=OPENAI_EMBEDDING_DEPLOYMENT_NAME,
    openai_api_type=OPENAI_API_TYPE,
    openai_api_key=OPENAI_API_KEY,
    chunk_size=1,
)

In [10]:
# Connect the Pinecone client using the configured API key and environment.
pinecone.init(environment=PINECONE_ENV, api_key=PINECONE_API_KEY)

# Embed every chunk and upload the vectors to the configured Pinecone index.
vectorstore = Pinecone.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=PINECONE_INDEX_NAME,
)