In [1]:
import tiktoken
import PyPDF2
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import faiss
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

  from tqdm.autonotebook import tqdm


This notebook follows this tutorial: https://dipankarmedh1.medium.com/exploring-the-power-of-rag-and-openais-function-calling-for-question-answering-d512c45c56b5

In [2]:
!pip install faiss-cpu



In [3]:
def extract_text_from_pdf(file_path: str) -> list:
    """Load a PDF from disk using LangChain's ``PyPDFLoader``.

    Despite the function name, the result is not a single string: it is
    whatever ``PyPDFLoader.load()`` returns, which in this notebook prints as
    a list of ``Document`` objects carrying ``page_content``.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The list of loaded documents.
    """
    return PyPDFLoader(file_path).load()


pdf_path = "./aapl-10-k.pdf"


In [4]:
def get_data_chunks(data: list, chunk_size: int, chunk_overlap: int = 5) -> list:
    """Split loaded documents into smaller chunks for embedding.

    Args:
        data: Documents to split, as returned by ``extract_text_from_pdf``.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 5, the value previously hard-coded here.

    Returns:
        The chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Tried in order: paragraph breaks, line breaks, spaces, then characters.
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
    )
    return text_splitter.split_documents(data)

In [5]:
# Sanity check: load the PDF and print the raw loader output.
print(extract_text_from_pdf(pdf_path))

[Document(page_content='5/7/23, 6:29 PMaapl-20220924\nhttps://www.sec.gov/Archives/edgar/data/320193/000032019322000108/aapl-20220924.htm1/78UNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549FORM 10-K(Mark One)☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the ﬁscal year ended September\xa024, 2022or☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the transition period from \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 to \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0.Commission File Number: 001-36743\nApple Inc.(Exact name of Registrant as speciﬁed in its charter)California94-2404110(State or other jurisdictionof incorporation or organization)(I.R.S. Employer Identiﬁcation No.)One Apple Park WayCupertino, California95014(Address of principal executive ofﬁces)(Zip Code)(408) 996-1010(Registrant’s telephone number, including area code)Securities registered pursuant to Section 

In [6]:
# Credentials are read from the environment only. Never commit a real key as
# a fallback default: anything written here ends up in version control and in
# every shared copy of the notebook. Any key that was previously hard-coded
# should be treated as leaked and rotated.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
# The environment name is not a secret, so a default is acceptable here.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

In [7]:
# Connect the Pinecone client using the credentials configured above.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_API_ENV,
)
index_name = 'langchaintest2'

# Vector size of OpenAI's text-embedding-ada-002 model. Stated explicitly
# instead of being derived from a sample embedding (`res`) that is never
# defined in this notebook and raised NameError on a fresh kernel.
EMBEDDING_DIMENSION = 1536

if index_name not in pinecone.list_indexes():
    # First run against this project: create the index.
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=EMBEDDING_DIMENSION,
    )
    


In [8]:
# Module-level OpenAI embeddings client. Note that create_knowledge_hub builds
# its own OpenAIEmbeddings instance rather than reusing this one.
embeddings = OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)

In [9]:
def create_knowledge_hub(chunks: list):
    """Return a LangChain ``Pinecone`` vector store backed by ``index_name``.

    Previously ``chunks`` was accepted but never used, so a newly created
    index stayed empty and retrieval had nothing to return. The chunks are now
    written to the index, but only when the index reports zero vectors, so
    repeated calls against an already-populated index do not re-embed or
    duplicate the documents.

    Args:
        chunks: Document chunks to index (output of ``get_data_chunks``).

    Returns:
        The ``Pinecone`` vector store wrapping the index.
    """
    index = pinecone.Index(index_name)
    text_field = "text"  # metadata key under which the chunk text is stored
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    knowledge_hub = Pinecone(index, embeddings.embed_query, text_field)

    # NOTE(review): index statistics may lag briefly behind recent upserts;
    # confirm this check is sufficient if the function is called in a tight loop.
    if chunks and index.describe_index_stats().total_vector_count == 0:
        knowledge_hub.add_documents(chunks)

    return knowledge_hub

In [10]:
def get_answer_LLM(
        question: str,
        data: list,
        chunk_size: int = 1000,
        chain_type: str = 'stuff',
    ) -> tuple:
    """Answer ``question`` from ``data`` with a retrieval-augmented QA chain.

    Args:
        question: Natural-language question to answer.
        data: Loaded documents (output of ``extract_text_from_pdf``).
        chunk_size: Chunk length passed to ``get_data_chunks``.
        chain_type: LangChain combine-documents strategy for ``RetrievalQA``.

    Returns:
        A ``(answer, source)`` tuple: the answer text produced by the chain
        and the single most similar chunk found for the question. When
        ``data`` is empty the result is ``("", [])`` so that callers can
        always unpack or index the return value the same way.
    """
    # Covers both "" and [] — the old `data == ""` test never matched a list
    # and returned a bare string that broke callers indexing the result.
    if not data:
        return "", []

    chunks = get_data_chunks(data, chunk_size=chunk_size)  # create text chunks
    knowledge_hub = create_knowledge_hub(chunks)  # create knowledge hub

    retriever = knowledge_hub.as_retriever(
        search_type="similarity", search_kwargs={"k": 2}
    )
    # Use the `question` parameter; this previously read a notebook-global
    # `query`, which only worked because calling cells happened to define it.
    source = knowledge_hub.similarity_search(question, k=1)
    chain = RetrievalQA.from_chain_type(
        # gpt-3.5-turbo is a chat model, so use the chat wrapper. The API key
        # comes from the environment-backed constant, never a literal.
        llm=ChatOpenAI(
            temperature=0.3,
            model_name="gpt-3.5-turbo",
            openai_api_key=OPENAI_API_KEY,
        ),
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
    )
    result = chain({"query": question})

    return result['result'], source

In [11]:
# Load the 10-K once; `data` is reused by every question cell below.
# Same path as `pdf_path` defined earlier in the notebook.
file_path = "./aapl-10-k.pdf"
data = extract_text_from_pdf(file_path)

In [12]:
# Simple revenue question; the bare call displays the (answer, source) tuple.
query = "What was the company's revenue"
get_answer_LLM(query, data, chunk_size = 400)




("The company's revenue for 2022 was $394,328 million.",
 [Document(page_content='certain share-based compensation expenses, income taxes, various nonrecurringcharges and other separately managed general and administrative costs. The Company does not include intercompany transfersbetween segments for management reporting purposes.Note 2 – RevenueNet sales disaggregated by signiﬁcant products and services for 2022, 2021 and 2020 were as follows (in millions):202220212020iPhone$205,489\xa0$191,973\xa0$137,781\xa0Mac 40,177\xa035,190\xa028,622\xa0iPad29,292\xa031,862\xa023,724\xa0Wearables, Home and Accessories 41,241\xa038,367\xa030,620\xa0Services 78,129\xa068,425\xa053,768\xa0Total net sales $394,328\xa0$365,817\xa0$274,515\xa0(1)Products net sales include amortization of the deferred value of unspeciﬁed software upgrade rights, which are bundled in the salesprice of the respective product.(2)Wearables, Home and Accessories net sales include sales of AirPods, Apple TV, Apple Watch, Bea

In [13]:
# Multi-year revenue / net income question. The closing quote was missing,
# which made this cell a SyntaxError.
query = "What were the company's revenues and net income for the past few years, and how have they changed over time?"
get_answer_LLM(query, data, chunk_size = 400)


("The company's revenue for 2022 was $394,328 million.",
 [Document(page_content='certain share-based compensation expenses, income taxes, various nonrecurringcharges and other separately managed general and administrative costs. The Company does not include intercompany transfersbetween segments for management reporting purposes.Note 2 – RevenueNet sales disaggregated by signiﬁcant products and services for 2022, 2021 and 2020 were as follows (in millions):202220212020iPhone$205,489\xa0$191,973\xa0$137,781\xa0Mac 40,177\xa035,190\xa028,622\xa0iPad29,292\xa031,862\xa023,724\xa0Wearables, Home and Accessories 41,241\xa038,367\xa030,620\xa0Services 78,129\xa068,425\xa053,768\xa0Total net sales $394,328\xa0$365,817\xa0$274,515\xa0(1)Products net sales include amortization of the deferred value of unspeciﬁed software upgrade rights, which are bundled in the salesprice of the respective product.(2)Wearables, Home and Accessories net sales include sales of AirPods, Apple TV, Apple Watch, Bea

In [14]:
# [0] keeps only the answer text and drops the source documents.
query = "What are the major drivers of revenue and profit growth or decline?"
get_answer_LLM(query, data, chunk_size = 400)[0]

"The major drivers of revenue and profit growth or decline for the company are the gross margins on its products and services, industry-wide global product pricing pressures, increased competition, the ability to stimulate demand for certain products and services, compressed product life cycles, supply shortages, potential increases in costs, shifts in product and service mix, fluctuations in foreign exchange rates, inflation and other macroeconomic pressures, and the introduction of new products or services. Additionally, seasonal holiday demand and new product and service introductions can significantly impact net sales, cost of sales, and operating expenses. A decline in demand for a significant portion of the company's net sales from a single product could also significantly impact quarterly net sales."

## Compare Model Answers with Human Answers

In [15]:
def compare_strings(text1, text2):
    """Compare two texts by TF-IDF cosine similarity.

    Both texts are vectorized together with a ``TfidfVectorizer`` fitted on
    just these two strings, then passed to ``cosine_similarity``.

    Args:
        text1: First text (e.g. the model's answer).
        text2: Second text (e.g. the human reference answer).

    Returns:
        The pairwise similarity matrix returned by ``cosine_similarity`` for
        the two vectors. It is also printed as a side effect.
    """
    # Imported here because the notebook's import cell does not bring these
    # names into scope; without this the function raises NameError when called.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])
    # Calculate the cosine similarity between the vectors
    similarity = cosine_similarity(vectors)
    print(similarity)
    return similarity

In [16]:
# Keep only the answer text ([0]); the source documents are discarded here.
query = "What is the company's overall financial health, including its liquidity, solvency, and capital structure"
model_answer = get_answer_LLM(query, data, chunk_size = 400)[0]


In [17]:
# Display the answer to the financial-health question.
model_answer

"The given context does not provide any specific information about the company's overall financial health, liquidity, solvency, or capital structure."

In [18]:
query = "How much debt does the company have, and what are the terms and conditions?"
# One call returns both the answer and its source. Calling the function twice
# repeated the embedding and LLM requests, and with a non-zero temperature
# the second run could have produced a different answer than the one stored.
model_answer, model_answer_source = get_answer_LLM(query, data, chunk_size = 400)

In [19]:
# Display the answer to the debt question.
model_answer

'Based on the provided information, the company has $10.0 billion of Commercial Paper outstanding as of September 24, 2022. The terms and conditions of the debt are not specified in the given context.'

In [20]:
# Display the chunk returned as the source for the debt answer.
model_answer_source

[Document(page_content='Note 7 – DebtCommercial Paper and Repurchase AgreementsThe Company issues unsecured short-term promissory notes (“Commercial Paper”) pursuant to a commercial paper program. TheCompany uses net proceeds from the commercial paper program for general corporate purposes, including dividends and sharerepurchases. As of September\xa024, 2022 and September\xa025, 2021, the Company had $10.0 billion and $6.0 billion of Commercial Paperoutstanding, respectively, with maturities generally less than nine months. The weighted-average interest rate of the Company’sCommercial Paper was 2.31% and 0.06% as of September\xa024, 2022 and September\xa025, 2021, respectively. The following table providesa summary of cash ﬂows associated with the issuance and maturities of Commercial Paper for 2022, 2021 and 2020 (in millions):202220212020Maturities 90 days or less:Proceeds from/(Repayments of) commercial paper, net$5,264\xa0$(357)$100\xa0Maturities greater than 90 days:Proceeds from