In [29]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

load_dotenv()
import openai
# Set up the Pinecone vector database
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
PINECONE_ENV = "us-east-1-aws"
pinecone.init(api_key=PINECONE_API_KEY, environment="us-east-1-aws")
index_name = "doc-chat2"
index = pinecone.Index(index_name)

In [30]:

# Set up the LangChain OpenAI embedding model
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
model_name = "text-embedding-ada-002"
embed = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)

In [31]:
loader = PyPDFLoader("District_12/2005/2005-Ohio-143.pdf")
data = loader.load()
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your sample document')
# print (f'Here is a sample: {data[0].page_content[:200]}')

You have 14 document(s) in your data
There are 1137 characters in your sample document


In [32]:
# We'll split our data into chunks around 500 characters each with a 50 character overlap. These are relatively small.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
text_chunks = text_splitter.split_documents(data)

In [33]:
def create_embedding(chunk):
    response = openai.Embedding.create(
            input=chunk,
            model="text-embedding-ada-002"
        )
    return response["data"][0]["embedding"]

In [34]:
import uuid
pinecone_index = pinecone.Index(index_name=index_name)

embeddings = []

for chunk in text_chunks:
    id = uuid.uuid4().hex
    embedding = create_embedding(chunk)
    embeddings.append([(id, embedding, {"text": chunk})])

APIRemovedInV1: 

You tried to access openai.Embedding, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [26]:
print (f'Now you have {len(text_chunks)} chunks of text')

Now you have 53 chunks of text


In [10]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [11]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [20]:

# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV  # next to api key in console
)
pinecone_index = pinecone.Index(index_name=index_name)

import uuid

for embedding in embeddings:
    id = uuid.uuid4().hex
    embedding = create_embedding(chunk)
    pinecone_index.upsert(embedding)


SyntaxError: unterminated string literal (detected at line 15) (3237950807.py, line 15)

QUERY THE DOCUMENT THAT WAS JUST VECTORIZED IN PINECONE


In [36]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")
query = "What is the name of the defendant in this case?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

'The name of the defendant in this case is Nationwide Agribusiness Insurance Company.'