In [82]:
import os
from openai import OpenAI


from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
api_key = os.environ['OPENAI_API_KEY']

from langchain_openai import ChatOpenAI
#memory and conversation
from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryMemory
#chain
from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from langchain_core.runnables import RunnableMap
#prompts
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
#retrieval
from langchain.chains import RetrievalQA
from langchain_openai import OpenAIEmbeddings

from langchain.vectorstores import DocArrayInMemorySearch

from langchain.indexes import VectorstoreIndexCreator
from langchain.document_loaders import CSVLoader
from IPython.display import display, Markdown

In [24]:
file = 'OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file)

In [28]:
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.9,
    max_tokens=None,
    timeout=None,
)

In [29]:
# Initialize the embedding model
embedding = OpenAIEmbeddings()

In [30]:
# Create the index with the embedding model
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embedding
).from_loaders([loader])

In [33]:
# Query the index using the LLM
retriever = index.vectorstore.as_retriever()
retrieval_qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Perform a query using `invoke`
query = "What is the content of the file?"
response = retrieval_qa.invoke({"query": query})
print(response)


{'query': 'What is the content of the file?', 'result': 'The content of the file includes product features, such as a zip-fly front, an internal key fob, a zippered security pocket, and two back pockets with Velcro-closures. It also mentions that the item is imported and that assembly is required, with instructions included.'}


In [38]:
query ="Please list all your shirts with sun protection \
in a table in markdown and summarize each one."
response = retrieval_qa.invoke({"query": query})
markdown=response["result"]

display(Markdown(markdown))

Here is a table of shirts with sun protection, along with their summaries:

| Name                                       | Summary                                                                                           |
|--------------------------------------------|---------------------------------------------------------------------------------------------------|
| Men's Tropical Plaid Short-Sleeve Shirt    | Lightest hot-weather shirt with UPF 50+, wrinkle-resistant, and features cape venting and bellows pockets. Made of 100% polyester. |
| Men's Plaid Tropic Shirt, Short-Sleeve     | Ultracomfortable shirt with UPF 50+, quick-dry, and wrinkle-free, originally designed for fishing. Made of 52% polyester and 48% nylon. |
| Men's TropicVibe Shirt, Short-Sleeve       | Sun-protection shirt with UPF 50+, wrinkle-resistant, and cape venting. Made of 71% nylon and 29% polyester. |
| Sun Shield Shirt                           | High-performance sun shirt with UPF 50+, moisture-wicking, and abrasion-resistant. Made of 78% nylon and 22% Lycra. |

All shirts provide the highest sun protection rating of UPF 50+, blocking 98% of UV rays.

In [39]:
###BREAKDOWN FURTHER

from langchain.document_loaders import CSVLoader
loader = CSVLoader(file_path=file)

In [40]:
docs = loader.load()
docs[0]
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
embed = embeddings.embed_query("Hi my name is Harrison")

print(len(embed))
print(embed[:5])

1536
[-0.02196465528695117, 0.006758838256223806, -0.018249490165056663, -0.03923515029463157, -0.014007174091135742]


In [44]:
db = DocArrayInMemorySearch.from_documents(
    docs, 
    embeddings
)

In [45]:
query = "Please suggest a shirt with sunblocking"
docs = db.similarity_search(query)
len(docs)
docs[0]
retriever = db.as_retriever()
qdocs = "".join([docs[i].page_content for i in range(len(docs))])


In [46]:
response = llm.call_as_llm(f"{qdocs} Question: Please list all your \
shirts with sun protection in a table in markdown and summarize each one.") 
display(Markdown(response))

  response = llm.call_as_llm(f"{qdocs} Question: Please list all your \


Certainly! Below is a markdown table summarizing the shirts with sun protection features:

| Name                                  | Description                                                                                                                             | Main Features                                                                                   |
|---------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------|
| Sun Shield Shirt                      | High-performance sun shirt with UPF 50+ protection. Slightly fitted and soft, made with 78% nylon and 22% Lycra Xtra Life fiber.        | Moisture-wicking, abrasion-resistant, fits over swimsuits, hand wash, line dry.                 |
| Men's Plaid Tropic Shirt, Short-Sleeve| UPF 50+ sun protection shirt originally designed for fishing. Made of 52% polyester and 48% nylon.                                      | Wrinkle-free, moisture-evaporating, front and back cape venting, machine washable and dryable. |
| Men's TropicVibe Shirt, Short-Sleeve  | Lightweight sun-protection shirt with UPF 50+. Made with 71% nylon and 29% polyester shell, and 100% polyester knit mesh lining.        | Wrinkle-resistant, front and back cape venting, machine wash and dry.                          |
| Men's Tropical Plaid Short-Sleeve Shirt| UPF 50+ hot-weather protection. Made of 100% polyester for a relaxed fit.                                                                | Wrinkle-resistant, front and back cape venting, highest rated sun protection.                  |

Each shirt offers UPF 50+ sun protection, which blocks 98% of harmful UV rays. They are designed to be comfortable, functional, and provide reliable sun safety, whether for fishing, travel, or leisure.

In [63]:
#lets analyze a 3 statement model
import pandas as pd
from langchain.docstore.document import Document


#Step 1: Load the file
finfile = '3Statement.xlsx'
finmodel = pd.read_excel(finfile)


In [64]:
# Step 2: Define a Custom Loader Class
class PandasDataFrameLoader:
    def __init__(self, df):
        self.df = df

    def load(self):
        documents = []
        for _, row in self.df.iterrows():
            # Combine all columns into a single string for each row
            content = " ".join([f"{col}: {value}" for col, value in row.items()])
            documents.append(Document(page_content=content))
        return documents

In [74]:
# Create the loader
loader = PandasDataFrameLoader(finmodel)

# Step 3: Initialize the Embedding Model
embedding = OpenAIEmbeddings()

# Step 4: Create the Index with the Custom Loader
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embedding
).from_loaders([loader])

retriever = index.vectorstore.as_retriever()
# Create a RetrievalQA chain
retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever
)


In [81]:

# Step 5: Query the Index
query = "You are looking at a 3 statement model. In each row is a title for the line item, and the values for 2019, 2020, 2021, 2022, 2023, 2024, 2025 and 2026. What is the Products revenue from 2019. Take the first column in the row once you find it."
response = retrieval_qa.invoke({"query": query})
print(response)

{'query': 'You are looking at a 3 statement model. In each row is a title for the line item, and the values for 2019, 2020, 2021, 2022, 2023, 2024, 2025 and 2026. What is the Products revenue from 2019. Take the first column in the row once you find it.', 'result': 'According to the data, the total revenue for 2019 is 13118. However, the context provided does not specify a separate line item for "Products revenue," so I cannot determine the product revenue amount specifically from the information given.'}


In [86]:
from langchain.chains.retrieval import create_retrieval_chain
from langchain.memory import ConversationBufferMemory

In [88]:
# Step 3: Define Memory for Conversation
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)


In [90]:
# Step 4: Create a Question Generator Chain
from langchain.prompts import PromptTemplate
question_generator_prompt = PromptTemplate(
    input_variables=["chat_history", "question"],
    template=(
        "Given the following conversation history:\n{chat_history}\n"
        "and the new question:\n{question}\n"
        "Generate a follow-up question if necessary."
    )
)
question_generator = LLMChain(llm=llm, prompt=question_generator_prompt)

  question_generator = LLMChain(llm=llm, prompt=question_generator_prompt)


In [100]:

# Step 5: Create Combine Docs Chain
combine_docs_chain = load_qa_chain(llm=llm, chain_type="stuff")

# Step 6: Create Conversational Retrieval Chain
conversational_chain = ConversationalRetrievalChain(
    retriever=retriever,  # Directly attach the retriever here
    memory=memory,
    question_generator=question_generator,
    combine_docs_chain=combine_docs_chain,
    verbose=True
)

# Step 7: Run Queries
query_1 = "Please read the file, which has a balance sheet, an income statement, and a statement of cash flows. The columns are defined at the top -- for years 2019, 2020, 2021, 2022, 2023, 2024, 2025, and 2026. Row values represent those years in order. Confirm that you see the file and describe it."
response_1 = conversational_chain.invoke({"question": query_1})
print(f"Q1: {query_1}")
print(f"A1: {response_1['answer']}")



[1m> Entering new ConversationalRetrievalChain chain...[0m

[1m> Finished chain.[0m
Q1: Please read the file, which has a balance sheet, an income statement, and a statement of cash flows. The columns are defined at the top -- for years 2019, 2020, 2021, 2022, 2023, 2024, 2025, and 2026. Row values represent those years in order. Confirm that you see the file and describe it.
A1: Based on the context provided, it appears that the financial statements include income statements and cash flow statements spanning from December 31, 2019, to December 31, 2026. However, the specific figures for revenue, net income, cash flow from operations, and cash flow from financing are not included in the data I have access to.

If you can provide specific figures or metrics from these statements, I would be happy to help analyze them or discuss any trends or changes you might be concerned about.


In [101]:
query_1 = "They are provided, they are just not labeled well. Every row has a name of a line item. Please list all of the line item names that you see."
response_1 = conversational_chain.invoke({"question": query_1})
print(f"Q1: {query_1}")
print(f"A1: {response_1['answer']}")



[1m> Entering new ConversationalRetrievalChain chain...[0m

[1m> Finished chain.[0m
Q1: They are provided, they are just not labeled well. Every row has a name of a line item. Please list all of the line item names that you see.
A1: Based on the context you provided, it seems you're looking at a cash flow statement that includes categories like "Cash Flow from Operations," "Cash Flow from Investing," and "Cash Flow from Financing." I can provide insights into these:

1. **Cash Flow from Operations**: This represents the cash generated or used by a company's core business operations. It's an important measure because it indicates if a company can generate sufficient positive cash flow to maintain and grow its operations, or if it will need external financing.

2. **Cash Flow from Investing**: This section includes cash spent on or generated from investments in the business, such as purchasing equipment or investments in securities. Negative cash flow from investing is not necessar

In [102]:
data[10]

NameError: name 'data' is not defined