In [None]:
from langchain.document_loaders import PyPDFLoader

In [None]:
import tiktoken

# Look up the tiktoken encoding registered for the gpt-3.5-turbo model.
# As the last expression in the cell, the result is shown as cell output.
tiktoken.encoding_for_model('gpt-3.5-turbo')

In [None]:
# Load the "cl100k_base" encoding by name; tiktoken_len below reads this
# module-level tokenizer to count tokens.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Token-based length function handed to the text splitter.
def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the module-level `tokenizer`.

    An empty `disallowed_special` tuple is forwarded to `encode` so that no
    special-token string is treated as disallowed during encoding.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Sanity check: token count of a sample sentence (the two adjacent string
# literals are concatenated into one argument); shown as cell output.
tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter whose chunk_size / chunk_overlap are measured with tiktoken_len
# (i.e. in tokens rather than characters). Separators are listed from the
# coarsest (blank line) down to the finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [None]:
# Read the PDF at this relative path and split it with text_splitter.
loader = PyPDFLoader("test-data/pdf/ebay.pdf")
pages = loader.load_and_split(text_splitter)

# Display how many items the loader/splitter produced.
len(pages)

In [None]:
from getpass import getpass

# Prompt for the key interactively via getpass instead of hardcoding it in
# the notebook; reused below for the embedding model and the chat model.
OPENAI_API_KEY = getpass("OpenAI API Key: ")

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model name; `embed` is used later both to embed the document
# chunks and as the embedding passed to the vector store.
model_name = "text-embedding-ada-002"

embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

In [None]:
# Two throwaway sentences used only to probe the embedding model.
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

# Embed the samples and display (number of embeddings, embedding length).
# `res` is kept: len(res[0]) is reused as the index dimension further down.
res = embed.embed_documents(texts)
len(res), len(res[0])

In [None]:
import pinecone

# Pinecone credentials are collected interactively rather than hardcoded.
# find API key in console at app.pinecone.io
YOUR_API_KEY = getpass("Pinecone API Key: ")
# find ENV (cloud region) next to API key in console
YOUR_ENV = input("Pinecone environment: ")

# Name of the index this notebook writes to and queries.
index_name = 'basic'
pinecone.init(
    api_key=YOUR_API_KEY,
    environment=YOUR_ENV
)

# Only create the index when no index with this name is listed yet, so the
# cell can be re-run without attempting to create it a second time.
if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        # dimension comes from the sample embedding `res` computed earlier
        dimension=len(res[0])  # 1536 dim of text-embedding-ada-002
    )

In [None]:
# Handle to the index named by index_name; used for upserts and by the
# vector store below.
index = pinecone.Index(index_name)

# Display the index statistics before any vectors are written by this notebook.
index.describe_index_stats()

In [None]:
# Inspect the first split item (its page_content and metadata are read below).
pages[0]

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

# Maximum number of vectors sent to Pinecone in a single upsert request.
# Pinecone limits the size of an upsert request, and every vector here also
# carries its full chunk text as metadata, so a long document has to be
# written in several smaller requests instead of one large one.
UPSERT_BATCH_SIZE = 100

texts = []
metadatas = []

# Collect the text and the metadata of every chunk produced by the splitter.
for record in tqdm(pages):
    metadata = {
        'page': str(record.metadata['page']),
        'source': record.metadata['source'],
        # NOTE(review): title currently mirrors the source path — confirm intended.
        'title': record.metadata['source'],
        # Stored under 'text' so the vector store can read it back via its text key.
        'text': record.page_content,
    }

    texts.append(record.page_content)
    metadatas.append(metadata)

if len(texts) > 0:
    # One random UUID per chunk; re-running this cell therefore inserts new
    # vectors rather than overwriting the ones from a previous run.
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)

    # Materialise the (id, embedding, metadata) tuples so they can be sliced,
    # then upsert them batch by batch.
    vectors = list(zip(ids, embeds, metadatas))
    for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
        index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])

In [None]:
# Display the index statistics again, now that the upsert cell has run.
index.describe_index_stats()

In [None]:
from langchain.vectorstores import Pinecone

# Metadata key that holds the chunk text; matches the 'text' key written
# into each vector's metadata by the upsert cell.
text_field = "text"

# NOTE(review): some older LangChain releases expect a callable here
# (e.g. embed.embed_query) rather than the embeddings object — confirm
# against the installed LangChain version.
vectorstore = Pinecone(index, embed, text_field)

In [None]:
# Example question; reused by the QA chain at the end of the notebook.
query = "where was Carl Icahn born?"

# Display the 3 best matches for the query together with their scores.
vectorstore.similarity_search_with_score(query, k=3)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# Chat model used to answer questions, configured with temperature 0.0.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",
    temperature=0.0,
)

# "stuff"-type QA-with-sources chain on top of the Pinecone retriever
# (k=3 results per query); source documents are requested in the output.
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
)

In [None]:
# Run the QA chain on the example query and display the returned result.
qa(query)