In [4]:
import hashlib
import json
import os
import re

import pinecone
import PyPDF2
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, Pinecone
from langchain.vectorstores import FAISS

  from tqdm.autonotebook import tqdm


This notebook follows the tutorial at https://dipankarmedh1.medium.com/exploring-the-power-of-rag-and-openais-function-calling-for-question-answering-d512c45c56b5

In [1]:
def extract_text_from_pdf(file_path: str) -> list:
    """Load a PDF through ``PyPDFLoader`` and return the loader's output.

    Despite the name, this does not return a plain string: the value is
    whatever ``PyPDFLoader(file_path).load()`` produces, and the rest of the
    notebook hands it to ``split_documents`` (i.e. it is treated as a list of
    document objects).

    Args:
        file_path: Path to the PDF file to load.

    Returns:
        The result of ``PyPDFLoader.load()``, unchanged.
    """
    return PyPDFLoader(file_path).load()

In [5]:
def get_data_chunks(data: list, chunk_size: int, chunk_overlap: int = 5):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        data: Documents to split (the output of ``extract_text_from_pdf``);
            they are passed to ``split_documents``, so a plain string is not
            a valid input here.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 5, the value previously hard-coded.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Tried in order: paragraph break, line break, space, then single characters.
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
    )
    return text_splitter.split_documents(data)

In [6]:
# Credentials come from the environment only; a missing key fails fast with KeyError.
# SECURITY: a Pinecone API key used to be hard-coded here as a fallback default.
# That key must be treated as leaked and rotated in the Pinecone console.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
# The environment name is not a secret, so a default is acceptable.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

In [7]:
# Connect to Pinecone and make sure the index used by this notebook exists.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

index_name = 'langchaintest2'

existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    # First run: create the index (1536 = dim of text-embedding-ada-002).
    pinecone.create_index(name=index_name, dimension=1536, metric='cosine')
    


In [8]:
# Module-level OpenAI embedding client.
# NOTE(review): create_knowledge_hub builds its own local `embeddings` instance,
# so no function shown in this notebook reads this global.
embeddings = OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)

In [9]:
# IDs of chunks already upserted during this kernel session, so that calling
# create_knowledge_hub once per question does not re-embed the same PDF each time.
_UPSERTED_CHUNK_IDS = set()


def create_knowledge_hub(chunks: list):
    """Upsert ``chunks`` into the Pinecone index and return a vector store over it.

    Previously ``chunks`` was ignored: the returned store only wrapped the
    existing index, so searches could only ever return whatever had been
    uploaded at some earlier time (e.g. passages from a different PDF).

    Each chunk gets a deterministic ID derived from its source, page and text,
    so uploading the same chunk again overwrites it instead of duplicating it.

    Args:
        chunks: Document chunks (objects with ``page_content`` and
            ``metadata``), as returned by ``get_data_chunks``. May be empty.

    Returns:
        A LangChain ``Pinecone`` vector store bound to ``index_name``.
        NOTE(review): the index is shared, so it may also hold chunks from
        other PDFs; restrict queries with a metadata filter on ``source`` if
        only one document should be searched.
    """
    index = pinecone.Index(index_name)
    text_field = "text"
    embeddings = OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)
    knowledge_hub = Pinecone(index, embeddings.embed_query, text_field)

    texts, metadatas, ids = [], [], []
    batch_ids = set()
    for chunk in chunks or []:
        metadata = dict(chunk.metadata or {})
        # Stable ID: the same text from the same place always maps to the same vector.
        id_key = f"{metadata.get('source', '')}|{metadata.get('page', '')}|{chunk.page_content}"
        chunk_id = hashlib.sha1(id_key.encode("utf-8")).hexdigest()
        if chunk_id in _UPSERTED_CHUNK_IDS or chunk_id in batch_ids:
            continue
        batch_ids.add(chunk_id)
        texts.append(chunk.page_content)
        metadatas.append(metadata)
        ids.append(chunk_id)

    if texts:
        # NOTE(review): Pinecone upserts may take a moment to become queryable;
        # confirm whether a short wait is needed before the first search.
        knowledge_hub.add_texts(texts, metadatas=metadatas, ids=ids)
        _UPSERTED_CHUNK_IDS.update(ids)

    return knowledge_hub

In [10]:
def get_answer_LLM(
        question: str,
        data: list,
        chunk_size: int = 1000,
        chain_type: str = 'stuff',
    ):
    """Answer ``question`` with a RetrievalQA chain over the Pinecone index.

    Args:
        question: The question to ask.
        data: Loaded documents (output of ``extract_text_from_pdf``).
        chunk_size: Chunk size forwarded to ``get_data_chunks``.
        chain_type: Chain type forwarded to ``RetrievalQA.from_chain_type``.

    Returns:
        ``""`` when ``data`` is empty (kept from the original behaviour),
        otherwise a tuple ``(answer, source)`` where ``answer`` is the chain's
        ``'result'`` and ``source`` is the top-1 similarity-search hit list.

    NOTE(review): neither the retriever nor the similarity search is
    restricted to ``data``; they query the whole shared index.
    """
    # `data == ""` never matched a list of documents; treat any empty input alike.
    if not data:
        return ""

    chunks = get_data_chunks(data, chunk_size=chunk_size)  # create text chunks
    knowledge_hub = create_knowledge_hub(chunks)  # create knowledge hub

    retriever = knowledge_hub.as_retriever(
        search_type="similarity", search_kwargs={"k": 2}
    )
    source = knowledge_hub.similarity_search(question, k = 1)
    print("test", source)

    chain = RetrievalQA.from_chain_type(
        # gpt-3.5-turbo is a chat model, so it needs the chat wrapper rather
        # than the completion-style `OpenAI` class.
        llm=ChatOpenAI(temperature=0.3, model_name="gpt-3.5-turbo", openai_api_key = OPENAI_API_KEY),
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
    )
    result = chain({"query": question})

    return result['result'], source

In [11]:
def get_relevant_chunk(
        question: str,
        data: list,
        chunk_size: int = 1000,
    ):
    """Return the chunks most similar to ``question``, preferring the given document.

    The Pinecone index is shared between PDFs, so an unrestricted search can
    return passages from a different filing. When the first loaded document
    carries a ``source`` entry in its metadata, the search is first limited to
    vectors tagged with that source. If that yields nothing (for example the
    index was populated without ``source`` metadata), a warning is printed and
    the original unfiltered search is used.

    Args:
        question: The query text.
        data: Loaded documents (output of ``extract_text_from_pdf``).
        chunk_size: Chunk size forwarded to ``get_data_chunks``.

    Returns:
        ``""`` when ``data`` is empty (kept from the original behaviour),
        otherwise the list of up to two matching documents.
    """
    # `data == ""` never matched a list of documents; treat any empty input alike.
    if not data:
        return ""

    chunks = get_data_chunks(data, chunk_size=chunk_size)  # create text chunks
    knowledge_hub = create_knowledge_hub(chunks)  # create knowledge hub

    # Assumes the loader stores the originating file path under metadata["source"]
    # (PyPDFLoader does) -- TODO confirm for other loaders.
    metadata = getattr(data[0], "metadata", None) or {}
    source_path = metadata.get("source")

    source = []
    if source_path:
        source = knowledge_hub.similarity_search(
            question, k=2, filter={"source": {"$eq": source_path}}
        )
    if not source:
        if source_path:
            print(
                f"WARNING: no indexed chunks tagged with source {source_path!r}; "
                "falling back to an unfiltered search, results may come from another document"
            )
        source = knowledge_hub.similarity_search(question, k = 2)

    print("most relevant chunk is,", source)
    return source

In [15]:
def fill_train_data(
        question: str, #our own question
        pdf_path: str,
        chunk_size: int = 400,
):
    """Return the text of the chunk in ``pdf_path`` most relevant to ``question``.

    Args:
        question: The question used as the retrieval query.
        pdf_path: Path to the PDF to load and search.
        chunk_size: Chunk size used when splitting the PDF. Defaults to 400,
            the value previously hard-coded.

    Returns:
        ``page_content`` of the top-ranked chunk, with every literal
        backslash-``x`` sequence removed.

    Raises:
        IndexError: If retrieval returned nothing for this question.
    """
    data = extract_text_from_pdf(pdf_path)
    context = get_relevant_chunk(question, data, chunk_size)

    if not context:
        # Same exception type the bare `context[0]` raised, but with a usable message.
        raise IndexError(
            f"no chunk retrieved from {pdf_path!r} for question {question!r}"
        )

    page_content = context[0].page_content
    # Removes the two-character text backslash + "x" (not decoded escape bytes).
    return page_content.replace(r'\x', '')

In [16]:
ROLE_SYSTEM = "You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information."

def add_to_training_file(path_to_json, question, answer, context):
    """Append one chat-format training example to a JSONL file.

    The record holds a system message (``ROLE_SYSTEM``), a user message that
    combines ``context`` and ``question``, and an assistant message with
    ``answer``. It is serialised with ``json.dumps`` so that quotes, newlines
    and backslashes in the inputs are escaped and every line is valid JSON.

    Args:
        path_to_json: Path of the JSONL file to append to.
        question: The user's question.
        answer: The reference answer.
        context: Retrieved 10-K text the answer should be grounded in.

    Returns:
        ``True`` if the line was written, ``False`` if the file could not be
        opened because the path does not exist (a warning is printed instead
        of failing silently).
    """
    record = {
        "messages": [
            {"role": "system", "content": ROLE_SYSTEM},
            {
                "role": "user",
                "content": f"This is our information from the 10-K: {context}. Now, this is our question: {question}",
            },
            {"role": "assistant", "content": f"{answer}"},
        ]
    }
    # ensure_ascii=False keeps characters such as curly quotes readable in the file.
    line = json.dumps(record, ensure_ascii=False) + "\n"

    try:
        with open(path_to_json, "a", encoding="utf-8") as file:
            file.write(line)
    except FileNotFoundError as exc:
        # Usually a missing or misspelled parent directory; report it so the
        # dataset does not end up silently empty.
        print(f"WARNING: training example not written to {path_to_json!r}: {exc}")
        return False
    return True



In [20]:
def fill_json(path_to_json, pdf_path, question, answer):
    """Look up context for ``question`` in ``pdf_path`` and append one training example.

    The retrieved context is printed so it can be checked by eye, then the
    (question, answer, context) triple is handed to ``add_to_training_file``.
    """
    retrieved_context = fill_train_data(question, pdf_path)
    print(retrieved_context)
    add_to_training_file(
        path_to_json,
        question,
        answer,
        retrieved_context,
    )

In [21]:
# Single-example smoke test against the Apple 10-K before building the full dataset.
fill_json(
    path_to_json="./Datasets/my_test_oct24",
    pdf_path="../Documents/aapl-10-k.pdf",
    question="How does the company generate its revenue? What are its main products or services?",
    answer="The Company designs, manufactures and markets smartphones (iPhones), personal computers (Macbooks and Macs), tablets (iPads), wearables and accessories (Airpods, Apple TV, Apple Watch), and sells a variety of related services.",
)



Business Seasonality and Product IntroductionsThe Company has historically experienced higher net sales in its ﬁrst quarter compared to other quarters in its ﬁscal year due in part toseasonal holiday demand. Additionally, new product and service introductions can signiﬁcantly impact net sales, cost of sales andoperating expenses. The timing of product introductions can also impact the Company’s net sales to its indirect distribution channels asthese channels are ﬁlled with new inventory following a product launch, and channel inventory of an older product often declines as thelaunch of a newer product approaches. Net sales can also be affected when consumers and distributors anticipate a productintroduction.Human CapitalThe Company believes it has a talented, motivated and dedicated team, and works to create an inclusive, safe and supportiveenvironment for all of its team members. As of September 24, 2022, the Company had approximately 164,000 full-time equivalentemployees.Workplace Pr

In [22]:
# Build the dataset: hand-written question -> reference-answer pairs, one dict per company.
# The keys are used as the retrieval query and as the question in each training example;
# the values become the assistant answer.

# Apple 10-K
question_answer_dic_apple = {
    "How does the company generate its revenue? What are its main products or services?": "The Company designs, manufactures and markets smartphones (iPhones), personal computers (Macbooks and Macs), tablets (iPads), wearables and accessories (Airpods, Apple TV, Apple Watch), and sells a variety of related services.",
    "Does the company operate in multiple geographic regions or industries?": "Yes, the Company has international operations with sales outside the U.S.  In addition, the Company’s global supply chain is large and complex and a majority of the Company’s supplier facilities, including manufacturing and assembly sites, are located outside the U.S.",
    "Are there any significant changes or developments in the company's business model, strategy, or competitive landscape?": "During the COVID-19 pandemic, the Company’s component suppliers and manufacturing and logistical service providers have experienced disruptions, resulting in supply shortages that affected sales worldwide.",
    # NOTE(review): the net-income figures in the next answer carry no unit (presumably millions of dollars) -- confirm against the filing.
    "What were the company's revenues and net income for the past few years, and how have they changed over time?": "Apple’s revenue was $394.3 billion in 2022, $365.8 billion in 2021 and $275.5 billion in 2020. Revenue has been increasing over time, increasing 8% from 2021 to 2022, and 33% from 2020 to 2021. Net income has also increased steadily, from $57,411 in 2020, to $94,680 in 2021 to $99,803 in 2022.",
    "What are the major drivers of revenue and profit growth or decline?":"The major drivers of revenue growth are net sales from new products, software and services. Some major technology which drove revenue growth were upgrades to the MacBook Pros, new iPhone, AirPods, and Apple Watch releases, as well as updates to the operating systems. Some factors which negatively impacted profit growth was the weakness in currencies relative to the US dollar in areas such as Europe, Japan, and the Rest of Asia Pacific.",
    "Are there any significant trends or patterns in the company's financial statements?": "The primary trends in Apple’s financial statements is that it has consistently increased its revenue and expenses, however the release of new products and services has caused sales to increase at a higher rate than costs, meaning that net profit has been increasing for the past few years as well. The balance sheet also shows that Apple has increased its Assets, Liabilities, and Shareholders equity in the past year.",
    "Who are the company's key executives and board members? What is their experience and track record?": "Timothy D. Cook is the Chief Executive Officer and Director, Luca Maestri is the Senior Vice President, Chief Financial Officer, Chris Kondo is the Senior Director of Corporate Accounting, James A. Bell is the Director, Al Gore is the Director, Alex Gorsky is the Director, Andrea Jung is the Director, Arthur D. Levinson is the Director and Chair of the Board, Monica Lozano is the Director, Ronald D. Sugar is the Director, and Susan L. Wagner is the Director.",
    "What is the company's overall financial health, including its liquidity, solvency, and capital structure?": "The company is in pretty good financial health, considering its ability to increase revenue and profit amidst unfavorable macroeconomic conditions. Apple also has over $156.4 billion in capital resources, and has enough to sustain itself for the next 12 months. ",
    "How much debt does the company have, and what are the terms and conditions?": "Apple has a total debt of $111.8 billion in fixed rate notes as of September 23, 2022. $11.1 billion are payable within 12 months",
    "Are there any contingent liabilities or off-balance-sheet arrangements that could impact the company's financial position?": "According to its Balance Sheet, Apple has no Commitments and contingencies.",
    "What are the primary risks and uncertainties the company faces? How might they impact its future performance?": "Apple’s primary risk factors are dependent on the global and economic conditions of its international/national operations regarding sourcing and manufacturing of products. Also having a highly competitive market they need to ensure the successful development, transitioning and introduction of new innovative products, services and technology.",
    "Has the company identified any new risks or changed its risk profile since the previous filing?": "No the context of the document does not state this information",
    "Are there any legal or regulatory issues that could affect the company's operations?": "The Company’s effective tax rates are affected by changes in the mix of earnings in countries with differing statutory tax rates, changes in the valuation of deferred tax assets and liabilities, the introduction of new taxes, or changes in tax laws or their interpretation, including in the U.S. and Ireland. The application of tax laws may be uncertain, require significant judgment and be subject to differing interpretations.",
}

# Bank of America 10-K: question -> reference-answer pairs.
# Covers a subset of the questions asked of the Apple filing, with identical wording.
question_answer_dic_bofa = {
    "How does the company generate its revenue? What are its main products or services?": "The Corporation is a bank holding company and a financial holding company. They serve individual consumers, small- and middle- market businesses, institutional investors, large corporations and governments with a full range of banking, investing, asset management and other financial and risk management products and services.",
    "Does the company operate in multiple geographic regions or industries?": "Yes, the Company operates in international markets in more than 35 countries through the Global Wealth & Investment Management (GWIM), Global Banking and Global Markets",
    # NOTE(review): "was $27.528 billion in 2022" repeats the year inside the sentence; the figures themselves have not been checked here.
    "What were the company's revenues and net income for the past few years, and how have they changed over time?": "Bank of America’s revenue was $94.95 billion dollars in 2022 and $80.11 billion dollars in 2021, and $85.528 billion dollars in 2020. The net income in 2022 was $27.528 billion in 2022 and $31.978 billion in 2021 and $43.36 billion in 2020. ",
    "What are the major drivers of revenue and profit growth or decline?": "There are several drivers of growth for each of Bank of America’s businesses. Revenue for Business lending increased due to higher interest rates and loan balances. Sales and Trading Revenue increased because of improved trading performance and improved client financing activities",
    "Are there any significant trends or patterns in the company's financial statements?": "The main trends are the revenue has increased, but income has fallen slightly due to higher costs. Total assets decreased by 4%, including a 34% drop in cash equivalents, and total liability decreased slightly as well with no significant change to shareholders equity.",
    "How much debt does the company have, and what are the terms and conditions?": "Bank of America has $862 billion in total debt securities, and $4.5 billion in long term debt. ",

}
# NVIDIA 10-K: question -> reference-answer pairs.
# Every question carries the suffix " Please give me information for the company NVIDIA".
# The original note here was "ISSUE HERE TOO", so the suffix was presumably added to steer
# retrieval toward the NVIDIA filing. Because the key is also written into the training
# example as the user question, the suffix ends up in the dataset as well.
question_answer_dic_nvidia = {
    "How does the company generate its revenue? What are its main products or services? Please give me information for the company NVIDIA": "The Company, originally focused on PC graphics, now creates GPU architecture, highly used in AI, data science, autonomous vehicles among other industries. The business is split into two segments: Compute & Networking, including the Data Center, NVIDIA AI Enterprise, cryptocurrency mining processors, robotics, among more, and the Graphics segment, relating to the GPUs.",
    "Does the company operate in multiple geographic regions or industries? Please give me information for the company NVIDIA": "Yes, the Company’s international operations are a significant part of the business.",
    "Are there any significant changes or developments in the company's business model, strategy, or competitive landscape? Please give me information for the company NVIDIA": "In fiscal year 2023, we introduced the Hopper architecture of data center GPUs, and started shipping the first Hopper- based GPU – the flagship H100. This improves the training of AI transformer models over the prior generation. It has also expanded its data center to include DPUs. They have also introduced the GeForce RTX 40 Series of gaming GPUs.",
    # Fixed: the net-income comparison previously ended "in 2021", contradicting the 2022 -> 2023 comparison in the same answer.
    "What were the company's revenues and net income for the past few years, and how have they changed over time? Please give me information for the company NVIDIA": "Revenue increases from $26.914 billion in 2022 to $26.974 billion in 2023, which is a $60 million increase. However costs increased significantly due to spending on research and development which caused net income to decrease from $9.752 billion in 2022 to $4.368 billion in 2023.",
    "What are the major drivers of revenue and profit growth or decline? Please give me information for the company NVIDIA": "The main drivers of revenue in compute and networking was customer growth and multi-year cloud service agreement for NVIDIA’s new AI cloud service. The increase in revenue from graphics was due to an increase in gaming demand due to macroeconomic conditions and the COVID-19 pandemic",
    "Are there any significant trends or patterns in the company's financial statements? Please give me information for the company NVIDIA": "Revenue has been increasing, but costs last year rose significantly causing a dip in net income",
    "What is the company's overall financial health, including its liquidity, solvency, and capital structure? Please give me information for the company NVIDIA": "Nvidia’s main source of liquidity are cash and cash equivalents which is enough to meet operating expenses for the next 12 months, signaling that the company is in good financial health. ",
    # Fixed typo: "Nvidia as" -> "Nvidia has".
    "How much debt does the company have, and what are the terms and conditions? Please give me information for the company NVIDIA": "Nvidia has $1.250 billion in debt due in 1 year, $2.250 billion due in 5 years, $4 billion due in five to ten years and $3.5 billion due in more than 10 years. ",
    "Are there any contingent liabilities or off-balance-sheet arrangements that could impact the company's financial position? Please give me information for the company NVIDIA": "Nvidia has unrecognized tax benefits of $1.02 billion for interests and penalties",

}

In [26]:
aapl_file_path = "../Documents/aapl-10-k.pdf"
bofa_file_path = "../Documents/bankofamerica-10K.pdf"
nvidia_file_path = "../Documents/nvda-10-k.pdf"

# Was ".Datasets/..." (missing slash), which points at a non-existent directory
# rather than the ./Datasets folder used by the other cells.
json_path = "./Datasets/openaitest_new.jsonl"

# One (PDF, question/answer dict) pair per company, processed in the original order.
filings_with_qa = [
    (aapl_file_path, question_answer_dic_apple),
    (bofa_file_path, question_answer_dic_bofa),
    (nvidia_file_path, question_answer_dic_nvidia),
]

# Known issue reported when this cell was first run: for the BofA and NVIDIA
# filings the retrieved passage came from the Apple 10-K. fill_json prints the
# retrieved context, so check that it belongs to the intended filing.
for filing_path, qa_pairs in filings_with_qa:
    for qa_question, qa_answer in qa_pairs.items():
        fill_json(json_path, filing_path, qa_question, qa_answer)



In [14]:
# Re-run for the NVIDIA filing only, appending to the same JSONL file.
nvidia_file_path = "../Documents/nvda-10-k.pdf"
json_path = "./Datasets/openaitest_new.jsonl"

for nvidia_question, nvidia_answer in question_answer_dic_nvidia.items():
    fill_json(json_path, nvidia_file_path, nvidia_question, nvidia_answer)



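The resulting dataset is at ./Datasets/all_10k_dataset.jsonl. Note that the cells above append to ./Datasets/openaitest_new.jsonl, so that output was presumably renamed or merged into this file afterwards.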