In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Load the CSV and keep only the text columns used in the rest of the notebook.
financial_data = pd.read_csv('../data/cleaned_data.csv')
# .copy() makes this an independent frame, so the columns added in later cells
# are not written into a slice of `financial_data` (SettingWithCopyWarning).
financial_data_filter = financial_data[['pre_text','post_text','question','answer']].copy()
financial_data_filter.head()

Unnamed: 0,pre_text,post_text,question,answer
0,"26 | 2009 annual report in fiscal 2008 , reven...","year ended june 30 , cash provided by operatio...",what was the percentage change in the net cash...,14.1%
1,"26 | 2009 annual report in fiscal 2008 , reven...","year ended june 30 , cash provided by operatio...",what was the percentage change in the net cash...,14.1%
2,"26 | 2009 annual report in fiscal 2008 , reven...","year ended june 30 , cash provided by operatio...",what was the percentage change in the net cash...,14.1%
3,"26 | 2009 annual report in fiscal 2008 , reven...","year ended june 30 , cash provided by operatio...",what was the percentage change in the net cash...,14.1%
4,substantially all of the goodwill and other in...,the above unaudited pro forma financial inform...,what was the percent of the growth in the reve...,1.3%


In [2]:
# Combine pre_text and post_text to create context for each row.
# assign() returns a new frame rather than writing a column into what may be a
# slice of `financial_data`; re-running this cell gives the same result.
financial_data_filter = financial_data_filter.assign(
    context=financial_data_filter['pre_text'] + " " + financial_data_filter['post_text']
)
financial_data_filter.head()

Unnamed: 0,pre_text,post_text,question,answer,context
0,"26 | 2009 annual report in fiscal 2008 , reven...","year ended june 30 , cash provided by operatio...",what was the percentage change in the net cash...,14.1%,"26 | 2009 annual report in fiscal 2008 , reven..."
1,"26 | 2009 annual report in fiscal 2008 , reven...","year ended june 30 , cash provided by operatio...",what was the percentage change in the net cash...,14.1%,"26 | 2009 annual report in fiscal 2008 , reven..."
2,"26 | 2009 annual report in fiscal 2008 , reven...","year ended june 30 , cash provided by operatio...",what was the percentage change in the net cash...,14.1%,"26 | 2009 annual report in fiscal 2008 , reven..."
3,"26 | 2009 annual report in fiscal 2008 , reven...","year ended june 30 , cash provided by operatio...",what was the percentage change in the net cash...,14.1%,"26 | 2009 annual report in fiscal 2008 , reven..."
4,substantially all of the goodwill and other in...,the above unaudited pro forma financial inform...,what was the percent of the growth in the reve...,1.3%,substantially all of the goodwill and other in...


In [3]:
# Concatenate 'question', 'answer', and 'context' into a single 'combined' column.
# assign() returns a new frame rather than writing a column into what may be a
# slice of `financial_data`; re-running this cell gives the same result.
financial_data_filter = financial_data_filter.assign(
    combined=(
        financial_data_filter['question']
        + " " + financial_data_filter['answer']
        + " " + financial_data_filter['context']
    )
)

In [4]:
# Work with the first 1000 rows only.
financial_data_filter_subset = financial_data_filter.iloc[:1000]

In [8]:
# Discard every row that has a missing value in any column.
financial_data_filter_subset = financial_data_filter_subset.dropna(axis=0, how="any")

## **Generate the Document from the DataFrame**

In [5]:
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import Chroma

In [9]:

# Using the concatenated 'combined' column as the input to DataFrameLoader.
# The column named here becomes each Document's page_content (the text that is
# split and embedded below); the remaining columns are carried as metadata.
df_loader = DataFrameLoader(financial_data_filter_subset, page_content_column="combined")
df_document = df_loader.load()

# Show a small sample only; rendering every Document floods the notebook output.
display(df_document[:3])

[Document(metadata={'pre_text': '26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and sh

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [11]:
# Split the loaded documents into chunks of at most 250 characters,
# with 10 characters of overlap between neighbouring chunks.
splitter_options = {"chunk_size": 250, "chunk_overlap": 10}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
texts = text_splitter.split_documents(df_document)

In [12]:
from langchain_community.embeddings import OllamaEmbeddings

# Embedding model (served through Ollama) used to vectorise the text chunks.
embedding_function = OllamaEmbeddings(model="mxbai-embed-large")

In [None]:
# Embed the chunks and store them in a Chroma collection persisted under ./input.
# NOTE(review): re-running this cell against an existing directory presumably
# adds the same chunks again — confirm, and clear the directory if so.
chromadb_index = Chroma.from_documents(
    documents=texts,
    embedding=embedding_function,
    persist_directory='./input',
)

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import Ollama
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Ollama-served LLM that writes the final answer. The model tag is "llama3.1";
# the previous value "lama3.1" was a typo and does not name an Ollama model.
llm = Ollama(model="llama3.1")
# Retriever over the Chroma index built above.
cdb_retriever = chromadb_index.as_retriever()

# "stuff" puts all retrieved chunks into a single prompt for the LLM.
qa_chain = RetrievalQA.from_chain_type(llm, retriever=cdb_retriever, chain_type="stuff")
# User question
question = "What was the percentage change in the net cash?"

# invoke() replaces the deprecated Chain.run(); RetrievalQA reads the question
# from the "query" key and returns the answer text under "result".
result = qa_chain.invoke({"query": question})["result"]
print(result)

## **Using the new LCEL architecture**

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Prompt with two placeholders: {context} (retrieved text) and {question}.
template = (
    "Answer the question based on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

In [None]:
def format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# LCEL pipeline: retrieve context for the question, fill the prompt, call the
# LLM, and return the reply as a plain string.
# Uses `llm` from the RetrievalQA cell; the previous `hf_llm` is never defined
# in this notebook and raised NameError on a fresh kernel.
chain = (
    {"context": cdb_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the LCEL chain on the question defined earlier and display its answer.
lcel_answer = chain.invoke(question)
lcel_answer