In [32]:
from dotenv import load_dotenv
import os
from pydantic import BaseModel, Field
from langchain_openai   import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain.agents import create_tool_calling_agent, AgentExecutor
from tools import search_tool
from langchain.chains import LLMMathChain
from langchain.tools import Tool

In [33]:
from langchain_openai import ChatOpenAI

load_dotenv()

llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    max_retries=2,
    # api_key="...",
    # base_url="...",
    # organization="...",
    # other params...
)

class PydanticModelResponse(BaseModel):
    topic: str
    summary: str
    sources : list[str]
    tools_used: list[str]



parser = PydanticOutputParser(pydantic_object=PydanticModelResponse)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", " You are a helpful assistant that can answer questions about a given topic. \
         you will filter the "),
        ("human", "{input}"),
        ("placeholder", "{chat_history}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
).partial(format_instructions=parser.get_format_instructions())

# problem_chain = LLMMathChain.from_llm(
#     llm=llm)
# math_tool = Tool.from_function(name="Calculator",
#                 func=problem_chain.run,
#                  description="Useful for when you need to answer questions about math. \
#                     This tool is only for math questions and nothing else. Only input math expressions.")

# tools  = [search_tool, math_tool]
tools = [Tool.from_function(name="Filter DataFrame",
                     func=filter_df,
                     description="Useful for filtering a DataFrame based on a column and value.")]
# Create the agent with the LLM, prompt, and tools
agent = create_tool_calling_agent(
    llm=llm,
    prompt=prompt,
    tools = tools
)

agent_executor = AgentExecutor(agent = agent, tools = tools, verbose=True)

In [36]:
import pandas as pd
df_data = pd.read_csv("./data/train.csv")
df_data = df_data[~(df_data.notna().sum(axis=1) == 0)]

## Using langchian agent - create_pandas_dataframe_agent

This is a great tool to interact with an datframe using prompts  
CAUTION - you have to precalculate evrything in using python code and then you can use this tool just to create a query around the datframe  


 **Security Notice**:
        This agent relies on access to a python repl tool which can execute
        arbitrary code. This can be dangerous and requires a specially sandboxed
        environment to be safely used. Failure to run this code in a properly
        sandboxed environment can lead to arbitrary code execution vulnerabilities,
        which can lead to data breaches, data loss, or other security incidents.
        
        
        Do not use this code with untrusted inputs, with elevated permissions,
        or without consulting your security team about proper sandboxing!
        You must opt-in to use this functionality by setting allow_dangerous_code=True.
        
        
        This agent relies on access to a python repl tool which can execute 
        arbitrary code. This can be dangerous and requires a specially sandboxed 
        environment to be safely used.

**A Python REPL** (Read-Evaluate-Print Loop) is an interactive environment that allows you to execute Python code and see the results immediately. It's a fundamental part of the Python interpreter, providing a way to experiment with code, test ideas, and explore Python's features. 
Here's a breakdown of what happens in a Python REPL: 
Read: The REPL reads the Python code you enter.
Evaluate: It then executes that code.
Print: The result of the execution is displayed.
Loop: The process repeats, allowing you to enter more code and see the results.

In [37]:
from langchain_experimental.agents import create_pandas_dataframe_agent

agent_executor = create_pandas_dataframe_agent(
    llm,
    df_data,
    agent_type="zero-shot-react-description", # or AgentType.OPENAI_FUNCTIONS for OpenAI models
    verbose=True, # Set to True to see the agent's thought process
    return_intermediate_steps=True, # Optional, to get intermediate steps, 
    allow_dangerous_code = True
)

In [38]:
agent_executor.invoke(
    {
        "input": "Filter the DataFrame to show only the data of Athens, show top 5 rows \
            get the brand which is most popular in Athens and \
            summarize the data in a table format. calculate popularity based upon quantity column" ,
        "chat_history": [],
        "agent_scratchpad": ""
    }
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to filter the dataframe to show only the data of Athens, then find the most popular brand based on quantity, and summarize the data in a table format.
Action: python_repl_ast
Action Input: df[df['city'] == 'Athens'][0m[36;1m[1;3m          id      date    city       lat      long       pop    shop  \
0        0.0  31/01/12  Athens  37.97945  23.71622  672130.0  shop_1   
1        1.0  31/01/12  Athens  37.97945  23.71622  672130.0  shop_1   
2        2.0  31/01/12  Athens  37.97945  23.71622  672130.0  shop_1   
3        3.0  31/01/12  Athens  37.97945  23.71622  672130.0  shop_1   
4        4.0  31/01/12  Athens  37.97945  23.71622  672130.0  shop_1   
...      ...       ...     ...       ...       ...       ...     ...   
6461  6461.0  31/12/17  Athens  37.97945  23.71622  665871.0  shop_1   
6464  6464.0  31/12/17  Athens  37.96245  23.68708  665871.0  shop_3   
6469  6469.0  31/12/17  Athens  37.96245  2

{'input': 'Filter the DataFrame to show only the data of Athens, show top 5 rows             get the brand which is most popular in Athens and             summarize the data in a table format. calculate popularity based upon quantity column',
 'chat_history': [],
 'agent_scratchpad': '',
 'output': 'The most popular brand in Athens is kinder-cola, and the summarized data based on quantity for each brand is as follows:\n- adult-cola: 8,954,816\n- gazoza: 14,917,959\n- kinder-cola: 5,590,614\n- lemon-boost: 8,127,506\n- orange-power: 12,533,071',
 'intermediate_steps': [(AgentAction(tool='python_repl_ast', tool_input="df[df['city'] == 'Athens']", log="Thought: I need to filter the dataframe to show only the data of Athens, then find the most popular brand based on quantity, and summarize the data in a table format.\nAction: python_repl_ast\nAction Input: df[df['city'] == 'Athens']"),
             id      date    city       lat      long       pop    shop  \
   0        0.0  31/01/12  Ath