In [1]:
import tiktoken
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

  from tqdm.autonotebook import tqdm


In [2]:
# Shared tiktoken encoder used to measure text length in tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the module-level `tokenizer`.

    Args:
        text: The string to measure.

    Returns:
        The length of the token sequence produced by `tokenizer.encode`.
    """
    # An empty `disallowed_special` is passed so that no special-token
    # string is rejected during encoding (see tiktoken's `Encoding.encode`).
    return len(tokenizer.encode(text, disallowed_special=()))


In [3]:
# Load the 10-K PDF from the working directory; `data` is what the text
# splitter consumes via `split_documents(data)`.
loader = PyPDFLoader("./aapl-10-k.pdf")
data = loader.load()

In [4]:
# Open question: how should the splitter arguments below be tuned?
# Lengths are measured with `tiktoken_len`, so `chunk_size` and
# `chunk_overlap` are expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

# Split the loaded PDF documents into chunks.
texts = text_splitter.split_documents(data)


In [5]:
# Inspect the type of the first chunk produced by the splitter.
type(texts[0])

langchain.schema.document.Document

In [6]:
# Credentials come from the environment only. Secrets must never be embedded
# in the notebook, not even as a fallback default: any key that was ever
# committed here should be treated as compromised and rotated.
# Each key is None when its variable is unset, so set OPENAI_API_KEY and
# PINECONE_API_KEY before starting the kernel.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
# The Pinecone environment name is not a secret, so a default is kept.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

In [7]:
# Embedding client; its `embed_query` method is handed to the vector store.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [8]:
# Connect to Pinecone and make sure the target index exists.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_API_ENV,
)
index_name = 'langchaintest2'

# Vector size of text-embedding-ada-002. It is spelled out explicitly because
# no embedding result is available at this point in the notebook to measure.
# NOTE(review): assumes `embeddings` uses that model - confirm if the
# embedding model is ever changed, since the index dimension must match.
EMBEDDING_DIMENSION = 1536

if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=EMBEDDING_DIMENSION,
    )

In [9]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [11]:
# Name of the field passed to the vector store as its text key.
text_field = "text"

# Open a handle on the existing Pinecone index and wrap it for LangChain.
# NOTE(review): no visible cell upserts `texts` into the index (execution
# count 10 is missing) - confirm the index is populated on a fresh run.
index = pinecone.Index(index_name)
vectorstore = Pinecone(index, embeddings.embed_query, text_field)



In [19]:
# Question used for the similarity search below and for `qa.run(query)`.
query = "What is the company's revenue?"

vectorstore.similarity_search(
    query,  # our search query
    k=1  # return only the single most relevant doc
)

[Document(page_content='For the sale of third-party products where the Company obtains control of the product before transferring it to the customer, theCompany recognizes revenue based on the gross amount billed to customers. The Company considers multiple factors whendetermining whether it obtains control of third-party products, including evaluating if it can establish the price of the product, retainsinventory risk for tangible products or has the responsibility for ensuring acceptability of the product. For third-party applications soldthrough the App Store and certain digital content sold through the Company’s other digital content stores, the Company does not obtaincontrol of the product before transferring it to the customer. Therefore, the Company accounts for such sales on a net basis byrecognizing in Services net sales only the commission it retains.The Company records revenue net of taxes collected from customers that are remitted to governmental authorities, with the colle

In [14]:
# Chat model that generates the answers; temperature is set to 0.0.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
)

# Retrieval QA chain built with the "stuff" chain type over the vector
# store's retriever.
qa = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

In [15]:
# Run the RetrievalQA chain on `query` and display its answer.
qa.run(query)

"The company's revenue for 2022 was $394,328 million."

In [16]:
# Build a RetrievalQAWithSourcesChain with the "stuff" chain type, using
# `llm` and the vector store's retriever.
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

## Compare model answers with human answers

In [20]:
def compare_strings(text1, text2):
    """Print and return the cosine similarity between two texts.

    Both texts are vectorized together with `TfidfVectorizer`, and the
    resulting vectors are passed to `cosine_similarity`.

    NOTE(review): `TfidfVectorizer` and `cosine_similarity` are not imported
    in the visible cells - presumably scikit-learn's; confirm they are
    imported before this cell runs.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        Whatever `cosine_similarity` returns for the two vectors
        (presumably a 2x2 pairwise matrix - verify).
    """
    tfidf_vectors = TfidfVectorizer().fit_transform([text1, text2])
    # Calculate the cosine similarity between the vectors
    similarity = cosine_similarity(tfidf_vectors)
    print(similarity)
    return similarity
    