In [1]:
import getpass
import json
import os

from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma



In [5]:
# Module-level list of company records; a later cell rebinds it to the parsed JSON
companies = []

# Define a function to load company data
def load(company_records=None):
    """Build one ``Document`` per company record.

    Args:
        company_records: Optional iterable of company dicts. When omitted,
            the module-level ``companies`` list is read at call time, so
            existing ``load()`` calls keep working.

    Returns:
        list: One ``Document`` per record, in input order. ``page_content``
        is a newline-terminated "label: value" summary of the record and
        ``metadata`` is ``{'name': <record '_id'>}``.

    Raises:
        KeyError: If a record lacks one of the required fields (every field
            read below except the optional 'sharesOutstandingTrend').
    """
    records = companies if company_records is None else company_records

    docs = []
    for company in records:
        # Collect the lines first and join once instead of growing a string
        lines = [
            f"Name: {company['_id']}",
            f"Current Assets: {company['currentAssets']}",
            f"Current Liabilities: {company['currentLiabilities']}",
            f"Current Assets/Liabilities Ratio: {company['currentRatio_x']}",
            f"totalOwnerTransactions: {company['totalOwnerTransactions']}",
            f"totalIssuerTransactions: {company['totalIssuerTransactions']}",
            f"totalTransactions: {company['totalTransactions']}",
            f"totalLiabilities: {company['totalLiabilities']}",
            f"shareholdersEquity: {company['shareholdersEquity']}",
            f"Total Liabilities/Equity Ratio: {company['currentRatio_y']}",
        ]

        # 'sharesOutstandingTrend' is optional; a missing key or a null value
        # is reported as N/A rather than failing while iterating over None
        trend_points = company.get('sharesOutstandingTrend')
        if trend_points is None:
            lines.append("Shares Outstanding Trend: N/A")
        else:
            lines.append("Shares Outstanding Trend:")
            for point in trend_points:
                lines.append(f"- Date: {point['end']}, Value: {point['val']}")

        # Every line, including the last, is newline-terminated
        text = "\n".join(lines) + "\n"

        metadata = dict(name=company['_id'])
        docs.append(Document(page_content=text, metadata=metadata))

    # Hand the documents back to the caller; without this the function
    # returns None and the downstream splitter has nothing to split
    return docs


In [6]:
# Read the JSON query results into the module-level 'companies' list.
# The location can be overridden with the QUERY_RESULTS_PATH environment
# variable so the notebook is not tied to one machine; the original
# absolute path remains the default.
query_results_path = os.environ.get(
    'QUERY_RESULTS_PATH',
    '/Users/qlin/Desktop/Fall2023/NoSQL/Project/Final_Project/Query_Results.json',
)
# Explicit encoding so the parse does not depend on the platform default
with open(query_results_path, 'r', encoding='utf-8') as file:
    companies = json.load(file)

# Build the documents; load() reads the 'companies' list rebound just above
data = load()

In [7]:
# Show the value returned by load() as this cell's output
data

[Document(page_content='Name: Apple Inc.\nCurrent Assets: 143566000000.0\nCurrent Liabilities: 145308000000.0\nCurrent Assets/Liabilities Ratio: 0.9880116718\ntotalOwnerTransactions: None\ntotalIssuerTransactions: None\ntotalTransactions: None\ntotalLiabilities: 290437000000.0\nshareholdersEquity: 62146000000.0\nTotal Liabilities/Equity Ratio: 4.6734624916\nShares Outstanding Trend:\n- Date: 2009-10-16, Value: 900678473\n- Date: 2010-10-15, Value: 917307099\n- Date: 2011-10-14, Value: 929409000\n- Date: 2012-10-19, Value: 940692000\n- Date: 2013-10-18, Value: 899738000\n- Date: 2014-10-10, Value: 5864840000\n- Date: 2015-10-09, Value: 5575331000\n- Date: 2016-10-14, Value: 5332313000\n- Date: 2017-10-20, Value: 5134312000\n- Date: 2018-10-26, Value: 4745398000\n- Date: 2019-10-18, Value: 4443265000\n- Date: 2020-10-16, Value: 17001802000\n- Date: 2021-10-15, Value: 16406397000\n- Date: 2022-10-14, Value: 15908118000\n- Date: 2023-10-20, Value: 15552752000\n', metadata={'name': 'Apple I

In [8]:
# Break the documents into chunks of up to 1000 characters, with 200
# characters of overlap between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
texts = text_splitter.split_documents(data)

In [9]:
# Select which embeddings we want to use.
# The API key is never written into the notebook: an OPENAI_API_KEY already
# present in the environment is reused (not overwritten), otherwise the key
# is requested interactively without echoing it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
embeddings = OpenAIEmbeddings()

In [10]:
# Build the Chroma vector store that serves as the index over the chunks
db = Chroma.from_documents(documents=texts, embedding=embeddings)

In [11]:
# Wrap the index in a retriever: similarity search with k=5
retriever = db.as_retriever(search_kwargs=dict(k=5), search_type="similarity")

In [12]:
# Assemble the question-answering chain on top of the retriever
qa = RetrievalQA.from_chain_type(
    retriever=retriever,  # the retriever defined in the previous cell
    llm=OpenAI(),  # OpenAI LLM wrapper constructed with no arguments
    chain_type="map_reduce",  # map-reduce chain type
    verbose=True,  # emit detailed logs while the chain runs
    return_source_documents=True,  # include the source documents in the response
)

In [None]:
# Ask the chain for Apple's ticker symbol and exchange.
# NOTE(review): the text assembled in load() contains no ticker/exchange
# fields, so the saved output below appears to come from a different index
# build — confirm by re-running on a fresh kernel.
qa("What's Apple's ticker and exchange?")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': "What's Apple's ticker and exchange?",
 'result': " Apple's ticker is AAPL and it is traded on the Nasdaq exchange.",
 'source_documents': [Document(page_content='cik: 320193\nentityType: operating\nsicDescription: Electronic Computers\ntickers: AAPL\nexchanges: Nasdaq\nein: 942404110\ncategory: Large accelerated filer\nstateOfIncorporation: CA\nfiscalYearEnd: 0930', metadata={'name': 'Apple Inc.'}),
  Document(page_content='cik: 51143\nentityType: operating\nsicDescription: Computer & office Equipment\ntickers: IBM\nexchanges: NYSE\nein: 130871985\ncategory: Large accelerated filer\nstateOfIncorporation: NY\nfiscalYearEnd: 1231', metadata={'name': 'INTERNATIONAL BUSINESS MACHINES CORP'}),
  Document(page_content='cik: 4962\nentityType: operating\nsicDescription: Finance Services\ntickers: AXP\nexchanges: NYSE\nein: 134922250\ncategory: Large accelerated filer\nstateOfIncorporation: NY\nfiscalYearEnd: 1231', metadata={'name': 'AMERICAN EXPRESS CO'}),
  Document(page_content='

In [13]:
# Ask the chain for Apple's total liabilities (load() writes a
# 'totalLiabilities' line into each company document)
qa("What's Apple's total liability?")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': "What's Apple's total liability?",
 'result': " Apple's total liabilities is $290,437,000,000.00.",
 'source_documents': [Document(page_content='Name: Apple Inc.\nCurrent Assets: 143566000000.0\nCurrent Liabilities: 145308000000.0\nCurrent Assets/Liabilities Ratio: 0.9880116718\ntotalOwnerTransactions: None\ntotalIssuerTransactions: None\ntotalTransactions: None\ntotalLiabilities: 290437000000.0\nshareholdersEquity: 62146000000.0\nTotal Liabilities/Equity Ratio: 4.6734624916\nShares Outstanding Trend:\n- Date: 2009-10-16, Value: 900678473\n- Date: 2010-10-15, Value: 917307099\n- Date: 2011-10-14, Value: 929409000\n- Date: 2012-10-19, Value: 940692000\n- Date: 2013-10-18, Value: 899738000\n- Date: 2014-10-10, Value: 5864840000\n- Date: 2015-10-09, Value: 5575331000\n- Date: 2016-10-14, Value: 5332313000\n- Date: 2017-10-20, Value: 5134312000\n- Date: 2018-10-26, Value: 4745398000\n- Date: 2019-10-18, Value: 4443265000\n- Date: 2020-10-16, Value: 17001802000\n- Date: 2021-10-15

In [17]:
# Ask the chain for Apple's latest shares-outstanding figure (load() writes
# the dated 'Shares Outstanding Trend' entries into each company document)
qa("What's Apple's most recent Entity Common Stock Shares Outstanding?")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': "What's Apple's most recent Entity Common Stock Shares Outstanding?",
 'result': " Apple's most recent Entity Common Stock Shares Outstanding is 15552752000.",
 'source_documents': [Document(page_content='Name: Apple Inc.\nCurrent Assets: 143566000000.0\nCurrent Liabilities: 145308000000.0\nCurrent Assets/Liabilities Ratio: 0.9880116718\ntotalOwnerTransactions: None\ntotalIssuerTransactions: None\ntotalTransactions: None\ntotalLiabilities: 290437000000.0\nshareholdersEquity: 62146000000.0\nTotal Liabilities/Equity Ratio: 4.6734624916\nShares Outstanding Trend:\n- Date: 2009-10-16, Value: 900678473\n- Date: 2010-10-15, Value: 917307099\n- Date: 2011-10-14, Value: 929409000\n- Date: 2012-10-19, Value: 940692000\n- Date: 2013-10-18, Value: 899738000\n- Date: 2014-10-10, Value: 5864840000\n- Date: 2015-10-09, Value: 5575331000\n- Date: 2016-10-14, Value: 5332313000\n- Date: 2017-10-20, Value: 5134312000\n- Date: 2018-10-26, Value: 4745398000\n- Date: 2019-10-18, Value: 4443265000