In [4]:
import os
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
import json

In [5]:
# Module-level list of company records; it is filled in once the JSON file is read.
companies = []


def _join_values(values):
    """Join a list-valued field into a comma-separated string, skipping null entries."""
    return ', '.join(str(value) for value in (values or []) if value is not None)


def load(records=None):
    """Convert company metadata records into LangChain ``Document`` objects.

    Args:
        records: Iterable of company dicts. When omitted, the module-level
            ``companies`` list is used, so the original zero-argument call
            ``load()`` keeps working.

    Returns:
        A list with one ``Document`` per record. ``page_content`` holds one
        ``key: value`` line per field, and ``metadata`` carries the company name.

    Raises:
        KeyError: If a record lacks one of the fields read below.
    """
    if records is None:
        records = companies

    docs = []
    for company in records:
        lines = [
            # f-string (not "+") so a numeric CIK does not raise TypeError
            f"cik: {company['cik']}",
            f"entityType: {company['entityType']}",
            f"sicDescription: {company['sicDescription']}",
            # 'tickers' and 'exchanges' are arrays; null entries are skipped
            f"tickers: {_join_values(company['tickers'])}",
            f"exchanges: {_join_values(company['exchanges'])}",
            f"ein: {company['ein']}",
            f"category: {company['category']}",
            f"stateOfIncorporation: {company['stateOfIncorporation']}",
            f"fiscalYearEnd: {company['fiscalYearEnd']}",
        ]
        # Every field line, including the last one, ends with a newline.
        text = "\n".join(lines) + "\n"
        docs.append(Document(page_content=text, metadata={'name': company['name']}))
    return docs

In [6]:
# Location of the company metadata JSON. Set the FILING_METADATA_PATH environment
# variable to point at another copy; the fallback is the original local path.
metadata_path = os.environ.get(
    "FILING_METADATA_PATH",
    "/Users/qlin/Desktop/Fall2023/NoSQL/Project/Final_Project/filingMetadata.json",
)

# Read the JSON file into the module-level `companies` list that load() consumes.
# The explicit encoding keeps the read independent of the platform default.
with open(metadata_path, "r", encoding="utf-8") as file:
    companies = json.load(file)

# Turn the loaded records into Documents
data = load()

In [7]:
# Display the Documents built by load() to check the formatted text and metadata
data

[Document(page_content='cik: 320193\nentityType: operating\nsicDescription: Electronic Computers\ntickers: AAPL\nexchanges: Nasdaq\nein: 942404110\ncategory: Large accelerated filer\nstateOfIncorporation: CA\nfiscalYearEnd: 0930\n', metadata={'name': 'Apple Inc.'}),
 Document(page_content='cik: 0000789019\nentityType: operating\nsicDescription: Services-Prepackaged Software\ntickers: MSFT\nexchanges: Nasdaq\nein: 911144442\ncategory: Large accelerated filer\nstateOfIncorporation: WA\nfiscalYearEnd: 0630\n', metadata={'name': 'MICROSOFT CORP'}),
 Document(page_content='cik: 0001652044\nentityType: operating\nsicDescription: Services-Computer Programming, Data Processing, Etc.\ntickers: GOOGL, GOOG\nexchanges: Nasdaq, Nasdaq\nein: 611767919\ncategory: Large accelerated filer\nstateOfIncorporation: DE\nfiscalYearEnd: 1231\n', metadata={'name': 'Alphabet Inc.'}),
 Document(page_content='cik: 0001652044\nentityType: operating\nsicDescription: Services-Computer Programming, Data Processing, 

In [8]:
# Break the documents into chunks of at most 1000 units (characters with the
# splitter's default length function), sharing 200 between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents=data)

In [9]:
# Select which embeddings we want to use.
# The API key is taken from the OPENAI_API_KEY environment variable. It is not
# assigned here, so a real key is never overwritten by a placeholder and no
# credential ends up stored in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment that starts "
        "Jupyter (or set it with %env in a cell that is not saved) and re-run."
    )
embeddings = OpenAIEmbeddings()

In [10]:
# Build the Chroma vector store that serves as the index: every chunk in
# `texts` is embedded with `embeddings` and stored
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
)

In [11]:
# Wrap the index in a retriever: similarity search with k=5 results per query
retriever = db.as_retriever(search_kwargs=dict(k=5), search_type="similarity")

In [12]:
# Assemble the question-answering chain on top of the retriever
qa = RetrievalQA.from_chain_type(
    OpenAI(),
    retriever=retriever,
    chain_type="map_reduce",
    verbose=True,  # print the "Entering/Finished chain" trace while running
    return_source_documents=True,  # include the retrieved documents in the result
)

In [26]:
# Ask a question; the chain takes its input under the "query" key
qa({"query": "What's Apple's ticker and exchange?"})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': "What's Apple's ticker and exchange?",
 'result': " Apple's ticker is AAPL and it is traded on the Nasdaq exchange.",
 'source_documents': [Document(page_content='cik: 320193\nentityType: operating\nsicDescription: Electronic Computers\ntickers: AAPL\nexchanges: Nasdaq\nein: 942404110\ncategory: Large accelerated filer\nstateOfIncorporation: CA\nfiscalYearEnd: 0930', metadata={'name': 'Apple Inc.'}),
  Document(page_content='cik: 51143\nentityType: operating\nsicDescription: Computer & office Equipment\ntickers: IBM\nexchanges: NYSE\nein: 130871985\ncategory: Large accelerated filer\nstateOfIncorporation: NY\nfiscalYearEnd: 1231', metadata={'name': 'INTERNATIONAL BUSINESS MACHINES CORP'}),
  Document(page_content='cik: 4962\nentityType: operating\nsicDescription: Finance Services\ntickers: AXP\nexchanges: NYSE\nein: 134922250\ncategory: Large accelerated filer\nstateOfIncorporation: NY\nfiscalYearEnd: 1231', metadata={'name': 'AMERICAN EXPRESS CO'}),
  Document(page_content='