In [1]:
# Import packages
import ollama
import langchain
import sys
import os
import json
import pandas as pd
import numpy as np

# Other stuff
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.document_loaders import OnlinePDFLoader

In [2]:
# Check your venv
print(sys.prefix)

/mnt/gigadrive/llm-smarthome/venv


In [3]:
!ollama list

NAME                           	ID          	SIZE  	MODIFIED          
codellama:13b-instruct-fp16    	220d80e5d12b	26 GB 	28 hours ago     	
codellama:13b-instruct-q8_0    	196822804b09	13 GB 	27 hours ago     	
codellama:13b-python-fp16      	46a84a4b71b0	26 GB 	28 hours ago     	
dolphin-mixtral:latest         	cfada4ba31c7	26 GB 	29 hours ago     	
mistral:7b-instruct-fp16       	7334da3db4d2	14 GB 	33 hours ago     	
mistral:7b-instruct-v0.2-fp16  	094d67ff087c	14 GB 	33 hours ago     	
mixtral:8x7b-instruct-v0.1-q8_0	a6689be5de7d	49 GB 	33 hours ago     	
mixtral:latest                 	7708c059a8bb	26 GB 	2 days ago       	
mxbai-embed-large:latest       	468836162de7	669 MB	About an hour ago	


In [4]:
file = "./data/GE.pdf"

if file:
    loader = UnstructuredPDFLoader(file_path=file)
    data = loader.load()
else:
    print("Stupid idiot")

In [5]:
print(data[0].page_content)

(617) 443-3000_______________________________________________(Former name or former address, if changed since last report.)Check the appropriate box below if the Form 8-K filing is intended to simultaneously satisfy the filing obligation of the registrant under any of the followingprovisions (see General Instructions A.2. below):☐Written communications pursuant to Rule 425 under the Securities Act (17 CFR 230.425)☐Soliciting material pursuant to Rule 14a-12 under the Exchange Act (17 CFR 240.14a-12) ☐Pre-commencement communications pursuant to Rule 14d-2(b) under the Exchange Act (17 CFR 240.14d-2(b))☐Pre-commencement communications pursuant to Rule 13e-4(c) under the Exchange Act (17 CFR 240.13e-4(c))Securities registered pursuant to Section 12(b) of the Act:

Title of each classTrading Symbol(s)Name of each exchange on which registeredCommon stock, par value $0.01 per shareGENew York Stock Exchange0.875% Notes due 2025GE 25New York Stock Exchange1.875% Notes due 2027GE 27ENew York St

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512,chunk_overlap=51)
chunks = text_splitter.split_documents(data)


In [7]:
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="mxbai-embed-large", show_progress=True),
    collection_name="GE-2024-8K"
)


OllamaEmbeddings: 100%|███████████████████████| 135/135 [00:03<00:00, 35.73it/s]


In [8]:
print(len(chunks))

135


In [9]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [10]:
# LLM from Ollama
local_model = 'mixtral'
llm = ChatOllama(model=local_model)

In [11]:
QUERY_PROMPT = PromptTemplate(input_variables=["question"], 
                              template="""
                              You are an AI language model assistant named Plex. You retrieve documents from 
                              vector databases and provide additional questions for the user once you read and
                              analyzed the code. 
                              """)

In [12]:
retriever = MultiQueryRetriever.from_llm(vector_db.as_retriever(), llm, prompt=QUERY_PROMPT)

template = """ANSWER THE QUESTION BASED ONLY ON THE FOLLOWING CONTEXT: {context}"""

prompt = ChatPromptTemplate.from_template(template)

In [13]:
chain = (
    {"context":retriever, "question": RunnablePassthrough()}
    | prompt
    | llm 
    | StrOutputParser()
)

In [14]:
chain.invoke(input(""))

 What was GE's ending cash position?


OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00,  1.02it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 68.49it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 48.38it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 41.72it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 43.18it/s]


" The documents provided seem to be related to financial statements and forward-looking statements of General Electric (GE) and its subsidiaries. The first document contains preliminary unaudited supplemental consolidated financial position information, cash flows information, earnings information, and segment information for the year ended December 31, 2023, and each of the interim periods within that year. The second document appears to be a table showing the estimated impact of the separation of GE Vernova on GE's financial statements.\n\nThe third document is a table showing the reconciliation of non-GAAP financial measures to their most directly comparable GAAP financial measures for the year ended December 31, 2023. Non-GAAP financial measures are used by management and investors as a supplement to, and not as a substitute for, GAAP financial measures. They provide additional information about GE's operating performance and help investors understand underlying trends in GE's busi