In [18]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
import json
import datetime
import shutil
from openai import OpenAI

In [19]:
# Credentials come from the environment only; key material must never be
# committed to the notebook. The Pinecone key that used to be hard-coded here
# as a fallback should be treated as leaked and rotated.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]  # required: raises KeyError when unset
# The Pinecone settings are not referenced by the cells visible in this notebook;
# PINECONE_API_KEY is None when the variable is not set.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

In [20]:
# Shared embedding client; create_knowledge_hub passes it to Chroma.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [21]:
def create_knowledge_hub(path_to_10k):
    """From a 10-K document, create a Chroma DB knowledge hub.

    The PDF is loaded with PyPDFLoader, split into chunks of at most 1000
    characters (5 characters of overlap), embedded with the module-level
    `embeddings` object and persisted to a freshly named directory relative
    to the current working directory.

    Args:
        path_to_10k: Relative path to the 10-K hosted locally on the user's computer

    Returns:
        vectordb: The vector database with the information from the 10-K
        db_directory: The path to the vector database
    """
    import uuid  # local import: keeps this cell independent of the imports cell

    # A timestamp with one-second resolution is not unique: two hubs created
    # within the same second would share a directory, and deleting one would
    # remove the other's data. The random suffix makes every directory distinct
    # while keeping the readable "db_<timestamp>" prefix.
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    db_directory = f"db_{timestamp}_{uuid.uuid4().hex[:8]}"

    loader = PyPDFLoader(path_to_10k)
    documents = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=5,
        separators=["\n\n", "\n", " ", ""],
        length_function=len)
    texts = splitter.split_documents(documents)

    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        persist_directory=db_directory
    )
    vectordb.persist()

    return vectordb, db_directory

In [22]:
def delete_chroma_db(db_directory):
    """Remove a locally stored Chroma DB directory (best effort).

    Failures are reported on stdout instead of being raised, so a cleanup
    problem never interrupts the caller.

    Args:
        db_directory: The path to the vector database
    """
    try:
        shutil.rmtree(db_directory)
    except FileNotFoundError:
        report = f"Chroma database '{db_directory}' not found."
    except Exception as err:
        report = f"Error deleting Chroma database: {err!s}"
    else:
        # Directory removed: nothing to report.
        return
    print(report)

In [16]:
#FOR FINETUNING PURPOSES ONLY
def fill_json(path_to_json, path_to_10k, question, answer):
    """Append one chat-format fine-tuning example to a JSONL file.

    Builds a temporary knowledge hub from the 10-K, retrieves the (up to) two
    chunks most similar to `question`, and appends a single JSON line holding
    a system message, a user message (retrieved context + question) and the
    reference `answer` as the assistant message. The temporary Chroma
    directory is removed afterwards, even when retrieval fails.

    Args:
        path_to_json: Path of the JSONL file to append to
        path_to_10k: Relative path to the 10-K hosted locally on the user's computer
        question: Question used for retrieval and placed in the user message
        answer: Reference answer stored as the assistant message

    Returns:
        None. When the target file cannot be opened (FileNotFoundError) a
        message is printed and nothing is written.
    """
    db, db_dir = create_knowledge_hub(path_to_10k)
    try:
        # A single query yields both chunks; running it twice only doubles the
        # embedding calls.
        hits = db.similarity_search(question, k=2)
    finally:
        # The chunk texts are already in memory, so the on-disk DB can go now.
        delete_chroma_db(db_dir)

    # Same clean-up as before, applied in the same order: drop literal "\x"
    # sequences and flatten newlines and quote characters to spaces.
    cleaned_sources = []
    for hit in hits:
        text = hit.page_content
        for old, new in ((r'\x', ''), ('\n', ' '), ('"', ' '), ("'", ' ')):
            text = text.replace(old, new)
        cleaned_sources.append(text)
    context = " ".join(cleaned_sources)

    ROLE_SYSTEM = "You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information."

    record = {
        "messages": [
            {"role": "system", "content": ROLE_SYSTEM},
            {"role": "user", "content": f"This is our information from the 10-K: {context}. Now, this is our question: {question}"},
            {"role": "assistant", "content": answer},
        ]
    }
    # json.dumps escapes quotes, backslashes and control characters, so any
    # question/answer text yields a valid JSON line (hand-built f-strings did
    # not). ensure_ascii=False keeps characters such as ’ readable in the file.
    line = json.dumps(record, ensure_ascii=False) + "\n"

    try:
        with open(path_to_json, "a", encoding="utf-8") as file:
            file.write(line)
    except FileNotFoundError as exc:
        # Still non-fatal, as before, but no longer silent.
        print(f"Could not write to '{path_to_json}': {exc}")




## Creation of the testset

In [12]:
#Make dataset

# Reference question -> answer pairs for the Apple 10-K. Each pair is turned
# into one fine-tuning record by the fill_json loop further down, so the answer
# text is used verbatim as the assistant message.
question_answer_dic_apple = {
    "How does the company generate its revenue? What are its main products or services?": "The Company designs, manufactures and markets smartphones (iPhones), personal computers (Macbooks and Macs), tablets (iPads), wearables and accessories (Airpods, Apple TV, Apple Watch), and sells a variety of related services.",
    "Does the company operate in multiple geographic regions or industries?": "Yes, the Company has international operations with sales outside the U.S.  In addition, the Company’s global supply chain is large and complex and a majority of the Company’s supplier facilities, including manufacturing and assembly sites, are located outside the U.S.",
    "Are there any significant changes or developments in the company's business model, strategy, or competitive landscape?": "During the COVID-19 pandemic, the Company’s component suppliers and manufacturing and logistical service providers have experienced disruptions, resulting in supply shortages that affected sales worldwide.",
    # Net income figures carry an explicit unit: without "million" they read as
    # thousands of dollars next to revenue stated in billions.
    "What were the company's revenues and net income for the past few years, and how have they changed over time?": "Apple’s revenue was $394.3 billion in 2022, $365.8 billion in 2021 and $275.5 billion in 2020. Revenue has been increasing over time, increasing 8% from 2021 to 2022, and 33% from 2020 to 2021. Net income has also increased steadily, from $57,411 million in 2020, to $94,680 million in 2021 to $99,803 million in 2022.",
    "What are the major drivers of revenue and profit growth or decline?":"The major drivers of revenue growth are net sales from new products, software and services. Some major technology which drove revenue growth were upgrades to the MacBook Pros, new iPhone, AirPods, and Apple Watch releases, as well as updates to the operating systems. Some factors which negatively impacted profit growth was the weakness in currencies relative to the US dollar in areas such as Europe, Japan, and the Rest of Asia Pacific.",
    "Are there any significant trends or patterns in the company's financial statements?": "The primary trends in Apple’s financial statements is that it has consistently increased its revenue and expenses, however the release of new products and services has caused sales to increase at a higher rate than costs, meaning that net profit has been increasing for the past few years as well. The balance sheet also shows that Apple has increased its Assets, Liabilities, and Shareholders equity in the past year.",
    "Who are the company's key executives and board members? What is their experience and track record?": "Timothy D. Cook is the Chief Executive Officer and Director, Luca Maestri is the Senior Vice President, Chief Financial Officer, Chris Kondo is the Senior Director of Corporate Accounting, James A. Bell is the Director, Al Gore is the Director, Alex Gorsky is the Director, Andrea Jung is the Director, Arthur D. Levinson is the Director and Chair of the Board, Monica Lozano is the Director, Ronald D. Sugar is the Director, and Susan L. Wagner is the Director.",
    "What is the company's overall financial health, including its liquidity, solvency, and capital structure?": "The company is in pretty good financial health, considering its ability to increase revenue and profit amidst unfavorable macroeconomic conditions. Apple also has over $156.4 billion in capital resources, and has enough to sustain itself for the next 12 months. ",
    "How much debt does the company have, and what are the terms and conditions?": "Apple has a total debt of $111.8 billion in fixed rate notes as of September 23, 2022. $11.1 billion are payable within 12 months",
    "Are there any contingent liabilities or off-balance-sheet arrangements that could impact the company's financial position?": "According to its Balance Sheet, Apple has no Commitments and contingencies.",
    "What are the primary risks and uncertainties the company faces? How might they impact its future performance?": "Apple’s primary risk factors are dependent on the global and economic conditions of its international/national operations regarding sourcing and manufacturing of products. Also having a highly competitive market they need to ensure the successful development, transitioning and introduction of new innovative products, services and technology.",
    "Has the company identified any new risks or changed its risk profile since the previous filing?": "No the context of the document does not state this information",
    "Are there any legal or regulatory issues that could affect the company's operations?": "The Company’s effective tax rates are affected by changes in the mix of earnings in countries with differing statutory tax rates, changes in the valuation of deferred tax assets and liabilities, the introduction of new taxes, or changes in tax laws or their interpretation, including in the U.S. and Ireland. The application of tax laws may be uncertain, require significant judgment and be subject to differing interpretations.",
}

# Reference question -> answer pairs for the Bank of America 10-K. Each pair is
# written out as one fine-tuning record by the fill_json loop further down.
# NOTE(review): the net-income answer repeats "in 2022", and the net-income and
# long-term-debt figures cannot be checked from this notebook - confirm them
# against the filing before regenerating the dataset.
question_answer_dic_bofa = {
    "How does the company generate its revenue? What are its main products or services?": "The Corporation is a bank holding company and a financial holding company. They serve individual consumers, small- and middle- market businesses, institutional investors, large corporations and governments with a full range of banking, investing, asset management and other financial and risk management products and services.",
    "Does the company operate in multiple geographic regions or industries?": "Yes, the Company operates in international markets in more than 35 countries through the Global Wealth & Investment Management (GWIM), Global Banking and Global Markets",
    "What were the company's revenues and net income for the past few years, and how have they changed over time?": "Bank of America’s revenue was $94.95 billion dollars in 2022 and $80.11 billion dollars in 2021, and $85.528 billion dollars in 2020. The net income in 2022 was $27.528 billion in 2022 and $31.978 billion in 2021 and $43.36 billion in 2020. ",
    "What are the major drivers of revenue and profit growth or decline?": "There are several drivers of growth for each of Bank of America’s businesses. Revenue for Business lending increased due to higher interest rates and loan balances. Sales and Trading Revenue increased because of improved trading performance and improved client financing activities",
    "Are there any significant trends or patterns in the company's financial statements?": "The main trends are the revenue has increased, but income has fallen slightly due to higher costs. Total assets decreased by 4%, including a 34% drop in cash equivalents, and total liability decreased slightly as well with no significant change to shareholders equity.",
    "How much debt does the company have, and what are the terms and conditions?": "Bank of America has $862 billion in total debt securities, and $4.5 billion in long term debt. ",

}
# Reference question -> answer pairs for the NVIDIA 10-K. Every question carries
# the suffix "Please give me information for the company NVIDIA", which is also
# part of the text used for retrieval in fill_json.
question_answer_dic_nvidia = {
    "How does the company generate its revenue? What are its main products or services? Please give me information for the company NVIDIA": "The Company, originally focused on PC graphics, now creates GPU architecture, highly used in AI, data science, autonomous vehicles among other industries. The business is split into two segments: Compute & Networking, including the Data Center, NVIDIA AI Enterprise, cryptocurrency mining processors, robotics, among more, and the Graphics segment, relating to the GPUs.",
    "Does the company operate in multiple geographic regions or industries? Please give me information for the company NVIDIA": "Yes, the Company’s international operations are a significant part of the business.",
    "Are there any significant changes or developments in the company's business model, strategy, or competitive landscape? Please give me information for the company NVIDIA": "In fiscal year 2023, we introduced the Hopper architecture of data center GPUs, and started shipping the first Hopper- based GPU – the flagship H100. This improves the training of AI transformer models over the prior generation. It has also expanded its data center to include DPUs. They have also introduced the GeForce RTX 40 Series of gaming GPUs.",
    # The net-income comparison runs 2022 -> 2023, like the revenue sentence
    # before it (the label previously ended with "in 2021").
    "What were the company's revenues and net income for the past few years, and how have they changed over time? Please give me information for the company NVIDIA": "Revenue increases from $26.914 billion in 2022 to $26.974 billion in 2023, which is a $60 million increase. However costs increased significantly due to spending on research and development which caused net income to decrease from $9.752 billion in 2022 to $4.368 billion in 2023.",
    "What are the major drivers of revenue and profit growth or decline? Please give me information for the company NVIDIA": "The main drivers of revenue in compute and networking was customer growth and multi-year cloud service agreement for NVIDIA’s new AI cloud service. The increase in revenue from graphics was due to an increase in gaming demand due to macroeconomic conditions and the COVID-19 pandemic",
    "Are there any significant trends or patterns in the company's financial statements? Please give me information for the company NVIDIA": "Revenue has been increasing, but costs last year rose significantly causing a dip in net income",
    "What is the company's overall financial health, including its liquidity, solvency, and capital structure? Please give me information for the company NVIDIA": "Nvidia’s main source of liquidity are cash and cash equivalents which is enough to meet operating expenses for the next 12 months, signaling that the company is in good financial health. ",
    "How much debt does the company have, and what are the terms and conditions? Please give me information for the company NVIDIA": "Nvidia has $1.250 billion in debt due in 1 year, $2.250 billion due in 5 years, $4 billion due in five to ten years and $3.5 billion due in more than 10 years. ",
    "Are there any contingent liabilities or off-balance-sheet arrangements that could impact the company's financial position? Please give me information for the company NVIDIA": "Nvidia has unrecognized tax benefits of $1.02 billion for interests and penalties",

}

In [98]:
# Apple: append one fine-tuning record per reference question/answer pair
aapl_file_path = "../Documents/aapl-10-k.pdf"
json_path = "./Datasets/test10k.jsonl"

for item, reference_answer in question_answer_dic_apple.items():
    fill_json(json_path, aapl_file_path, item, reference_answer)


Chroma database 'db_20231024221245' deleted successfully.
Chroma database 'db_20231024221254' deleted successfully.
Chroma database 'db_20231024221302' deleted successfully.
Chroma database 'db_20231024221310' deleted successfully.
Chroma database 'db_20231024221318' deleted successfully.
Chroma database 'db_20231024221327' deleted successfully.
Chroma database 'db_20231024221335' deleted successfully.
Chroma database 'db_20231024221343' deleted successfully.
Chroma database 'db_20231024221352' deleted successfully.
Chroma database 'db_20231024221403' deleted successfully.
Chroma database 'db_20231024221411' deleted successfully.
Chroma database 'db_20231024221420' deleted successfully.
Chroma database 'db_20231024221428' deleted successfully.


In [96]:
# Bank of America: append one fine-tuning record per reference question/answer pair
bofa_file_path = "../Documents/bankofamerica-10K.pdf"
json_path = "./Datasets/test10k.jsonl"

for item, reference_answer in question_answer_dic_bofa.items():
    fill_json(json_path, bofa_file_path, item, reference_answer)

Chroma database 'db_20231024221110' deleted successfully.
Chroma database 'db_20231024221134' deleted successfully.


In [99]:
# NVIDIA: append one fine-tuning record per reference question/answer pair
nvidia_file_path = "../Documents/nvda-10-k.pdf"
json_path = "./Datasets/test10k.jsonl"

for item, reference_answer in question_answer_dic_nvidia.items():
    fill_json(json_path, nvidia_file_path, item, reference_answer)

Chroma database 'db_20231024221450' deleted successfully.
Chroma database 'db_20231024221518' deleted successfully.


## Using the fine-tuned GPT Model

In [23]:
def ask_gpt_finetuned_model(path_to_10k, question):
    """Ask the fine-tuned GPT model a question based off a local 10-K document.

    A temporary knowledge hub is built from the PDF, the (up to) two chunks
    most similar to the question are retrieved and sent, together with the
    question, to the fine-tuned chat model. The temporary Chroma directory is
    removed afterwards, even when retrieval or the API call fails.

    Args:
        path_to_10k: Relative path to the 10-K hosted locally on the user's computer
        question: Question to ask the model

    Returns:
        answer: The answer given by the fine-tuned GPT model
    """

    db, db_dir = create_knowledge_hub(path_to_10k)
    try:
        # A single query yields both chunks; issuing it twice only doubles the
        # embedding calls. Joining whatever came back also avoids an
        # IndexError when fewer than two chunks are returned.
        hits = db.similarity_search(question, k=2)
        context = "".join(hit.page_content for hit in hits)

        client = OpenAI()

        # NOTE(review): fill_json builds its training prompts as "This is our
        # information from the 10-K: <chunks>. Now, this is our question: ..."
        # with newlines and quotes flattened, while this prompt sends the raw
        # chunks without that prefix - confirm the difference is intended.
        completion = client.chat.completions.create(
            model="ft:gpt-3.5-turbo-0613:personal:anote:8DO8V2LB",
            messages=[
                {"role": "system", "content": "You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information."},
                {"role": "user", "content": f"{context} Now, this is our question: {question}"}
            ]
        )
    finally:
        # Always clean up so that a failed request does not leave a db_* directory behind.
        delete_chroma_db(db_dir)

    answer = completion.choices[0].message.content

    return answer



In [24]:
# Spot check: one question against the Apple 10-K with the fine-tuned model
ask_gpt_finetuned_model(
    "../Documents/aapl-10-k.pdf",
    "What are the company's strategic priorities and growth prospects?",
)

'The company’s research and development are focused on developing new and innovative products and technologies. Given the rapid pace of technological advances, the company believes that continual investment in research and development is critical to retain its competitive advantage.'

## Using the original GPT Model

In [8]:
def ask_gpt_nonfinetuned_model(path_to_10k, question):
    """Ask the original GPT model a question based off a local 10-K document.

    A temporary knowledge hub is built from the PDF, the (up to) two chunks
    most similar to the question are retrieved and sent, together with the
    question, to the stock "gpt-3.5-turbo" chat model. The temporary Chroma
    directory is removed afterwards, even when retrieval or the API call fails.

    Args:
        path_to_10k: Relative path to the 10-K hosted locally on the user's computer
        question: Question to ask the model

    Returns:
        answer: The answer given by the GPT model
    """

    db, db_dir = create_knowledge_hub(path_to_10k)
    try:
        # A single query yields both chunks; joining whatever came back also
        # avoids an IndexError when fewer than two chunks are returned.
        hits = db.similarity_search(question, k=2)
        context = "".join(hit.page_content for hit in hits)

        client = OpenAI()

        # The OpenAI() client object exposes chat completions as
        # client.chat.completions.create (as ask_gpt_finetuned_model already
        # uses); it has no ChatCompletion attribute.
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information."},
                {"role": "user", "content": f"{context} Now, this is our question: {question}"}
            ]
        )
    finally:
        # Always clean up so that a failed request does not leave a db_* directory behind.
        delete_chroma_db(db_dir)

    answer = completion.choices[0].message.content

    return answer

In [16]:
# Spot check: one question against the Dropbox 10-K with the stock model
ask_gpt_nonfinetuned_model(
    "../Documents/dbx-10-k.pdf",
    "What were the company's revenues and net income for the past few years, and how have they changed over time?",
)

"According to the information provided in the 10-K document, the company's revenues and net income for the past two years are as follows:\n\nYear ended December 31, 2021:\n- Revenue: $2,157.9 million\n- Net income: $335.8 million\n\nYear ended December 31, 2022:\n- Revenue: $2,324.9 million\n- Net income: $553.2 million\n\nThe revenues increased by $167.0 million or 8% from 2021 to 2022. Net income increased from $335.8 million in 2021 to $553.2 million in 2022."

## Evaluation of the model

In [9]:
# Questions put to both models during the evaluation, in the order they are asked.
questions_to_ask_trained_model = [
    "How does the company generate its revenue? What are its main products or services?",
    "Does the company operate in multiple geographic regions or industries?",
    "Are there any significant changes or developments in the company's business model, strategy, or competitive landscape?",
    "What were the company's revenues and net income for the past few years, and how have they changed over time?",
    "What are the major drivers of revenue and profit growth or decline?",
    "Are there any significant trends or patterns in the company's financial statements?",
    "Who are the company's key executives and board members? What is their experience and track record?",
    "Are there any related-party transactions or potential conflicts of interest?",
    "Does the company have effective internal controls and risk management procedures in place?",
    "What is the company's overall financial health, including its liquidity, solvency, and capital structure?",
    "How much debt does the company have, and what are the terms and conditions?",
    "Are there any contingent liabilities or off-balance-sheet arrangements that could impact the company's financial position?",
    "What are the primary risks and uncertainties the company faces? How might they impact its future performance?",
    "Has the company identified any new risks or changed its risk profile since the previous filing?",
    "Are there any legal or regulatory issues that could affect the company's operations?",
]

# 10-K filings used for the evaluation run.
path_to_test_10ks = [
    "../Documents/dbx-10-k.pdf",
    "../Documents/google-10-k.pdf",
    "../Documents/netflix-10-k.pdf",
]


In [10]:
filename = 'answers_newest.txt' #replace this with the name of the file you want to write to

# Ask both models every question for every test 10-K and append the paired
# answers to the text file. The file is opened as UTF-8 explicitly because the
# answers can contain non-ASCII characters (e.g. ’), which a platform default
# encoding may not be able to write.
with open(filename, 'a', encoding='utf-8') as file:
    for question in questions_to_ask_trained_model:
        for company_path in path_to_test_10ks:
            # Obtain the answers
            gpt_answer = ask_gpt_nonfinetuned_model(company_path, question)
            your_model_answer = ask_gpt_finetuned_model(company_path, question)

            # Write the answers to the text file as one record
            file.write(
                f'Company Path: {company_path}\n'
                f'Question: {question}\n'
                f"GPT's Answer: {gpt_answer}\n"
                f'Your Model\'s Answer: {your_model_answer}\n'
                '\n'
            )
            # Flush per record so progress is visible in the file during the run.
            file.flush()