In [1]:
# ! pip install python-dotenv
# ! pip install langchain --upgrade --no-cache-dir
! pip install wandb --upgrade --no-cache-dir

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [4]:
%load_ext autoreload

In [5]:
%autoreload

import sys
import os
import json

os.environ[ "LANGCHAIN_WANDB_TRACING" ] = "true"
# wandb documentation to configure wandb using env variables
# https://docs.wandb.ai/guides/track/advanced/environment-variables
# here we are configuring the wandb project name
os.environ[ "WANDB_PROJECT" ] = "langchain-dataframe-agent"

# Register the project's lib directory on the import path, but only if it is not
# there yet, so re-running this cell never appends a duplicate entry.
path = "/var/genie-in-the-box/src/lib"
if path in sys.path:
    print( f"[{path}] already in sys.path" )
else:
    sys.path.append( path )

# Echo the resulting module search path for inspection.
print( sys.path )

import util as du

path = "/var/genie-in-the-box/src"
du.add_to_path( path )

import util_stopwatch as sw
import util_langchain as ulc
import genie_client as gc

from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
import langchain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent

from langchain import PromptTemplate
from langchain import LLMChain

import pandas as pd

from IPython.display import display, Markdown


['/Applications/PyCharm.app/Contents/plugins/python/helpers-pro/jupyter_debug', '/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev', '/var', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/var/genie-in-the-box/src/lib']
Added [/var/genie-in-the-box/src] to sys.path


In [116]:
# Prompt template used to build the `PromptTemplate` for `llm_chain`.
# It shows the model a sample of `df` (CSV head plus dtypes), asks for a six-step answer, and
# requires the reply as a JSON object whose generated code ends in a `solution` variable.
# `{question}` is the only input variable; the doubled braces `{{ }}` around the JSON skeleton
# are escapes so that they reach the model as literal braces.
pandas_prompt_template = """
You are working with a pandas dataframe in Python. The name of the dataframe is `df`.

This is the ouput from `print(df.head().to_csv())`, in CSV format:

"start_date","end_date","start_time","end_time","event_type","recurrent","recurrence_interval","priority_level","name","relationship","description_who_what_where"
"2023-07-01","2023-07-04","00:00","23:59","Concert",False,"","none","Jenny","coworker","Concert of Jenny at the city center"
"2023-07-01","2023-07-01","05:25","17:22","TODO",False,"","highest","Gregorio","friend","Send out invitations for the party for Gregorio"
"2023-07-01","2023-07-01","13:27","01:59","Appointment",False,"","high","Leroy Ruiz","father","Appointment with Leroy Ruiz at the clinic"
"2023-07-03","2023-07-04","00:00","23:59","Subscription",True,"1 year","highest","Leroy Ruiz","father","Renewal of Leroy Ruiz's subscription"
"2023-07-03","2023-07-04","00:00","23:59","Anniversary",True,"4 week","none","Juan","neighbor","Juan's anniversary celebration at the park"
"2023-07-04","2023-07-04","00:00","23:59","Anniversary",True,"4 week","medium","Leroy Ruiz","father","Leroy Ruiz's anniversary celebration at the park"

This is the from `print(df.dtypes)`:

start_date                    datetime64[ns]
end_date                      datetime64[ns]
start_time                            object
end_time                              object
event_type                            object
recurrent                               bool
recurrence_interval                   object
priority_level                        object
name                                  object
relationship                          object
description_who_what_where            object
dtype: object

As you generate the python code needed to answer the question asked of you below, I want you to:

1) Question: Ask yourself If you understand the question that I am asking you?
2) Think: Before you do anything, think out loud about what I'm asking you to do, including what are the steps you need to take to solve this problem? What are the things you need to consider? Be critical of your thought process!
3) Code: Generate a verbatim list of code that you used to arrive at your answer, one line of code per item on the list. The code must be complete, syntactically correct, and run to completion. The last line of your code must be the variable `solution`, which represents the answer.
4) Return: Report on the object type of the variable `solution` in your last line of code.
5) Explain: Briefly and succinctly explain your code in plain English.
6) Answer: Answer the question in a short, simple, terse sentence.

Format: return your response as a JSON object in the following fields:
{{
    "question": "the question that your code answers",
    "thoughts": "your thoughts",
    "code": [],
    "return": "Object type of the variable `solution`",
    "explanation": "your brief explanation of your code",
    "answer": "your very succinct answer, in conversational English, one sentence only."    
}}

Begin!

Question: {question}
"""

In [117]:
# Switch on LangChain's global debug and verbose flags for this exploratory session.
langchain.verbose = True
langchain.debug = True

# GPT-4 chat model (temperature 0.0) shared by the dataframe agent and the chain below.
llm_4 = ChatOpenAI( temperature=0.0, model_name="gpt-4-0613" )

# Conversation buffer stored under the `chat_history` key.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    verbose=True
)

# Events table, and a pandas dataframe agent over it that is given the memory above.
raw_df = pd.read_csv( du.get_project_root() + "/src/conf/long-term-memory/events.csv" )
df_agent = create_pandas_dataframe_agent( llm_4, raw_df, verbose=True )
df_agent.memory = memory

# Plain LLM chain that fills `{question}` in the pandas prompt template.
prompt = PromptTemplate( input_variables=[ "question" ], template=pandas_prompt_template )
llm_chain = LLMChain( llm=llm_4, prompt=prompt )

In [157]:
def get_question_and_coda( event_query ):
    """
    Wrap a natural-language event query with the date-filtering hints sent to the LLM.

    :param event_query: The question to ask, e.g. "What events do I have today?"
    :return: Multi-line prompt fragment: a `Question:` line followed by three `Hint:` lines.
    """
    # One entry per output line. The leading and trailing spaces are part of the
    # prompt text, so they are spelled out here rather than hidden in a block string.
    coda_lines = [
        "",
        f"    Question: {event_query}  ",
        "    ",
        "    Hint: An event that I have today may have started before today and may end tomorrow or next week, so be careful how you filter on dates.",
        "    Hint: When filtering by dates, use `pd.Timestamp( day )` to convert a Python datetime object into a Pandas `datetime64[ns]` value.",
        "    Hint: If your code returns any records, they should include all columns in the dataframe.",
        "    ",
    ]
    return "\n".join( coda_lines )


# Build the demo prompt fragment and show it as it will be sent.
event_query = "What events do I have today?"
question = get_question_and_coda( event_query )

print( question )



    Question: What events do I have today?  
    
    Hint: An event that I have today may have started before today and may end tomorrow or next week, so be careful how you filter on dates.
    Hint: When filtering by dates, use `pd.Timestamp( day )` to convert a Python datetime object into a Pandas `datetime64[ns]` value.
    Hint: If your code returns any records, they should include all columns in the dataframe.
    


In [158]:
# Time one round trip of the question through `llm_chain`; the reply is kept in `my_response`.
timer = sw.Stopwatch( "Running pandas prompt..." )
event_query = "What events do I have today?"
question = get_question_and_coda( event_query )
# `question` fills the `{question}` input variable of the chain's prompt template.
my_response = llm_chain.run( question=question )
timer.print( "Done!" )

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).


Running pandas prompt...
[32;1m[1;3m[chain/start][0m [1m[1:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "\n    Question: What events do I have today?  \n    \n    Hint: An event that I have today may have started before today and may end tomorrow or next week, so be careful how you filter on dates.\n    Hint: When filtering by dates, use `pd.Timestamp( day )` to convert a Python datetime object into a Pandas `datetime64[ns]` value.\n    Hint: If your code returns any records, they should include all columns in the dataframe.\n    "
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:LLMChain > 2:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: \nYou are working with a pandas dataframe in Python. The name of the dataframe is `df`.\n\nThis is the ouput from `print(df.head().to_csv())`, in CSV format:\n\n\"start_date\",\"end_date\",\"start_time\",\"end_time\",\"event_type\",\"recurrent\",\"recurrence_interval\",\"priority_level\",\"name\",\"r

In [166]:
# Parse the model's reply as JSON (json.loads raises if it is not valid JSON) and display the result.
response_dict = json.loads( my_response )
response_dict

{'question': 'What events do I have today?',
 'thoughts': "To answer this question, I need to filter the dataframe to only include events that are happening today. This means that the event's start date is on or before today and the end date is on or after today. I will use the pandas Timestamp function to convert today's date into a format that can be compared with the start_date and end_date columns in the dataframe.",
 'code': ['import pandas as pd',
  "today = pd.Timestamp('today').normalize()",
  "solution = df[(df['start_date'] <= today) & (df['end_date'] >= today)]"],
 'return': 'pandas.core.frame.DataFrame',
 'explanation': "First, I import the pandas library. Then, I create a variable 'today' and assign it the current date, normalized to remove the time component. Finally, I filter the dataframe to only include rows where the start date is on or before today and the end date is on or after today. The result is assigned to the variable 'solution'.",
 'answer': "The events I hav

In [134]:
# response_dict[ "code" ] = response_dict[ "code" ][ 0:4 ]
# # response_dict[ "code" ].append( "solution = df_today" )
# for line in response_dict[ "code" ]:
#     print( line )
#     

import pandas as pd
today = pd.Timestamp('today').normalize()
df_today = df[(df['start_date'] <= today) & (df['end_date'] >= today)]
solution = df_today


In [171]:
%autoreload
# Hand a shallow copy of the generated code lines to the helper until the code munging is right
# (presumably so that `response_dict[ "code" ]` itself is not modified — verify against the helper).
response_dict_copy = response_dict[ "code" ][ : ]

response = ulc.assemble_and_run_solution(
    response_dict_copy,
    path=du.get_project_root() + "/src/conf/long-term-memory/events.csv",
    solution_code_returns=response_dict[ "return" ],
    debug=False
)
# Show the textual result of running the generated code.
print( response[ "response" ] )

"start_date","end_date","start_time","end_time","event_type","recurrent","recurrence_interval","priority_level","name","relationship","description_who_what_where"
"2023-09-03","2023-09-10","19:38","05:22","Interview",False,"","none","Leroy Ruiz","father","Job interview with Leroy Ruiz at Google"
"2023-09-04","2023-09-09","15:26","12:23","Interview",False,"","highest","Inash","girlfriend","Job interview with Inash at Google"
"2023-09-07","2023-09-14","00:00","23:59","Conference",False,"","low","Bob","brother","Conference with Bob on AI advancements"
"2023-09-07","2023-09-14","18:33","18:28","Meeting",False,"","low","Gregorio","friend","Another boring meeting with Gregorio"
"2023-09-09","2023-09-09","00:00","23:59","Birthday",True,"3 day","highest","Alice","aunt","Alice's birthday party at their favorite bar"
"2023-09-09","2023-09-09","00:00","23:59","Conference",False,"","medium","Inash","girlfriend","Conference with Inash on AI advancements"


In [179]:
# Preview the first rows as JSON Lines (one record per line), leaving a blank line after each record.
print( *raw_df.head().to_json( orient="records", lines=True ).split( "\n" ), sep="\n\n", end="\n\n" )


{"start_date":"2023-08-01","end_date":"2023-08-04","start_time":"00:00","end_time":"23:59","event_type":"Concert","recurrent":false,"recurrence_interval":null,"priority_level":"none","name":"Jenny","relationship":"coworker","description_who_what_where":"Concert of Jenny at the city center"}

{"start_date":"2023-08-01","end_date":"2023-08-01","start_time":"05:25","end_time":"17:22","event_type":"TODO","recurrent":false,"recurrence_interval":null,"priority_level":"highest","name":"Gregorio","relationship":"friend","description_who_what_where":"Send out invitations for the party for Gregorio"}

{"start_date":"2023-08-01","end_date":"2023-08-01","start_time":"13:27","end_time":"01:59","event_type":"Appointment","recurrent":false,"recurrence_interval":null,"priority_level":"high","name":"Leroy Ruiz","relationship":"father","description_who_what_where":"Appointment with Leroy Ruiz at the clinic"}

{"start_date":"2023-08-02","end_date":"2023-08-03","start_time":"13:18","end_time":"02:40","eve

In [181]:
# Context text later passed as the `preamble` argument of `ask_chat_gpt_text`; the query result
# text in `response[ "response" ]` is embedded verbatim.
# NOTE(review): the wording says "JSONL", but the `response[ "response" ]` text captured in this
# notebook's outputs is CSV with a header row — confirm which format is intended.
preamble = f"""
I'm going to show you JSONL output from a query on a pandas dataframe. 

JSONL output:

{response[ "response" ]}
"""
#for line in preamble:
#     print( line )
print( preamble )


I'm going to show you JSONL output from a query on a pandas dataframe. 

JSONL output:

"start_date","end_date","start_time","end_time","event_type","recurrent","recurrence_interval","priority_level","name","relationship","description_who_what_where"
"2023-09-03","2023-09-10","19:38","05:22","Interview",False,"","none","Leroy Ruiz","father","Job interview with Leroy Ruiz at Google"
"2023-09-04","2023-09-09","15:26","12:23","Interview",False,"","highest","Inash","girlfriend","Job interview with Inash at Google"
"2023-09-07","2023-09-14","00:00","23:59","Conference",False,"","low","Bob","brother","Conference with Bob on AI advancements"
"2023-09-07","2023-09-14","18:33","18:28","Meeting",False,"","low","Gregorio","friend","Another boring meeting with Gregorio"
"2023-09-09","2023-09-09","00:00","23:59","Birthday",True,"3 day","highest","Alice","aunt","Alice's birthday party at their favorite bar"
"2023-09-09","2023-09-09","00:00","23:59","Conference",False,"","medium","Inash","girlfriend

In [152]:
# preamble = f"""
# I'm going to show you raw CSV output from a query on a pandas events dataframe. 
# 
# Raw CSV output:
# 
# {response[ "response" ]}
# """
# #for line in preamble:
# #     print( line )
# print( preamble )


I'm going to show you raw CSV output from a query on a pandas events dataframe. 

Raw CSV output:

start_date   end_date  ... relationship               description_who_what_where
108 2023-09-03 2023-09-08  ...     coworker       Exciting workshop with John on AGI
109 2023-09-03 2023-09-10  ...       father  Job interview with Leroy Ruiz at Google
112 2023-09-04 2023-09-09  ...   girlfriend       Job interview with Inash at Google
115 2023-09-07 2023-09-14  ...      brother   Conference with Bob on AI advancements
119 2023-09-07 2023-09-14  ...       friend     Another boring meeting with Gregorio

[5 rows x 11 columns]


In [182]:
# Task description for the rephrasing request; `event_query` is interpolated so the model
# knows which question the rephrased data has to answer.
instructions = f"""
Rephrase this JSONL data in conversational English in such a way that answers this question `{event_query}`

Your output should contain one line per event.
"""
print( instructions )


Rephrase this JSONL data in conversational English in such a way that answers this question `What events do I have today?`

Your output should contain one line per event.


In [184]:
%autoreload
# Send the rephrasing request to the model named by `gc.GPT_4`; `instructions` is the task text
# and `preamble` carries the query result. The reply is kept in `answer_conversational`.
genie_client = gc.GenieClient()
answer_conversational = genie_client.ask_chat_gpt_text( instructions, preamble=preamble, model=gc.GPT_4 )

Asking ChatGPT [gpt-4-0613]...Done! in 5,908 ms


In [185]:
# Show the model's conversational answer.
print( answer_conversational )

Assuming today's date is "2023-09-07", here are your events:

1. You have a conference on AI advancements with your brother, Bob, that will last all day.
2. You also have a meeting with your friend, Gregorio, starting at 6:33 PM and ending at 6:28 PM. It seems like it might be a bit boring.


In [186]:
# Same request as the `gc.GPT_4` call, sent to the model named by `gc.GPT_3_5` instead.
# This overwrites `answer_conversational` with the new reply.
genie_client = gc.GenieClient()
answer_conversational = genie_client.ask_chat_gpt_text( instructions, preamble=preamble, model=gc.GPT_3_5 )

Asking ChatGPT [gpt-3.5-turbo-0613]...Done! in 3,532 ms


In [187]:
# `re` is not imported by any earlier cell, so without this import the cell raises
# NameError on a fresh kernel (Restart & Run All).
import re

# https://chat.openai.com/share/eca4c68e-73b1-4c5c-81c2-151e2216442f
# The very last interaction contains these instructions:
#
# Use the regular expression pattern `^\d+\.\s*`, where:
# 
#     ^: asserts position at the start of a line.
#     \d+: matches one or more digits.
#     \.: matches a literal period (dot).
#     \s*: matches zero or more whitespace characters.

for line in answer_conversational.split( "\n" ):
    # Drop a leading list number such as "1. " so only the sentence itself remains.
    line = re.sub( r"^\d+\.\s*", "", line )
    # Brackets make empty lines and stray whitespace visible in the output.
    print( f"[{line}]" )

[You have the following events today:]
[]
[Interview with Leroy Ruiz at Google from 19:38 to 05:22.]
[Interview with Inash at Google from 15:26 to 12:23.]
[Conference with Bob on AI advancements from 00:00 to 23:59.]
[Meeting with Gregorio from 18:33 to 18:28.]
[Alice's birthday party at their favorite bar, which is a recurring event starting today and lasting for 3 days.]
[Conference with Inash on AI advancements from 00:00 to 23:59.]


# Prototyping the calendar agent object

In [190]:
import sys
import os
# import json
# 
# os.environ[ "LANGCHAIN_WANDB_TRACING" ] = "true"
# # wandb documentation to configure wandb using env variables
# # https://docs.wandb.ai/guides/track/advanced/environment-variables
# # here we are configuring the wandb project name
# os.environ[ "WANDB_PROJECT" ] = "langchain-dataframe-agent"
# 
# path = "/var/genie-in-the-box/src/lib"
# if path not in sys.path:
#     sys.path.append( path )
# else:
#     print( f"[{path}] already in sys.path" )
# 
# print( sys.path )
# 
# import lib.util as du
# 
# path = "/var/genie-in-the-box/src"
# du.add_to_path( path )
# 
# import lib.util_stopwatch as sw
# # import util_langchain as ulc
# import genie_client as gc
# 
# from langchain.schema import (
#     AIMessage,
#     HumanMessage,
#     SystemMessage
# )
import langchain
# from langchain.chat_models import ChatOpenAI
# from langchain.memory import ConversationBufferMemory
# from langchain.chains import ConversationalRetrievalChain
# from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
# 
# from langchain import PromptTemplate
# from langchain import LLMChain

import pandas as pd


class CalendaringAgent:
    """
    Prototype agent that asks an LLM for pandas code answering questions about an events CSV.

    The events file is loaded into `self.df`. `run_prompt()` sends a question through an
    `LLMChain` built on the pandas prompt template and returns the model's JSON reply as a dict.
    """
    
    def __init__( self, path_to_df, debug=False, verbose=False ):
        """
        Load the events dataframe and build the LangChain objects used to query it.

        :param path_to_df: Path of the events CSV file to load.
        :param debug: Value assigned to the global `langchain.debug` flag.
        :param verbose: Value assigned to the global `langchain.verbose` flag; also passed to
            the conversation memory and the dataframe agent.
        """
        # NOTE: these are module-level LangChain flags, not per-instance settings.
        langchain.debug = debug
        langchain.verbose = verbose
        
        self.path_to_df = path_to_df
        self.debug = debug
        self.verbose = verbose
        # Read the file the caller asked for, once. Previously `path_to_df` was ignored and a
        # hard-coded project path was read twice.
        self.df = pd.read_csv( path_to_df )
        
        # These can stay private for the time being
        llm_4 = ChatOpenAI( model_name=gc.GPT_4, temperature=0.0 )
        memory = ConversationBufferMemory( memory_key="chat_history", return_messages=True, verbose=verbose )
        
        # The agent gets its own copy, so the agent's dataframe and `self.df` stay independent
        # objects (as they were when the file was read twice).
        self.df_agent = create_pandas_dataframe_agent( llm_4, self.df.copy(), verbose=verbose )
        self.df_agent.memory = memory
        
        prompt = PromptTemplate(
            template=self._get_pandas_prompt_template(),
            input_variables=[ "question" ]
        )
        self.llm_chain = LLMChain( prompt=prompt, llm=llm_4 )
    
    def run_prompt( self, question ):
        """
        Send `question` (plus the date-filtering hints) through the LLM chain and parse the JSON reply.

        The raw reply is kept in `self.response` and the parsed dict in `self.response_dict`.

        :param question: Natural-language question about the events dataframe.
        :return: The model's reply parsed into a dict.
        :raises json.JSONDecodeError: If the reply, after removing an enclosing Markdown code
            fence, is not valid JSON.
        """
        timer = sw.Stopwatch( "Running pandas prompt..." )
        question_plus_coda = self._get_question_plus_coda( question )
        self.response = self.llm_chain.run( question=question_plus_coda )
        timer.print( "Done!" )
        
        try:
            self.response_dict = json.loads( self._strip_code_fence( self.response ) )
        except json.JSONDecodeError as e:
            # Same exception type as before, but the message now says where the bad JSON came from.
            raise json.JSONDecodeError( f"LLM response is not valid JSON: {e.msg}", e.doc, e.pos ) from e
        
        return self.response_dict
    
    @staticmethod
    def _strip_code_fence( text ):
        """Return `text` stripped of surrounding whitespace and of one enclosing Markdown code fence, if any."""
        stripped = text.strip()
        if stripped.startswith( "```" ) and stripped.endswith( "```" ):
            # The opening fence line may carry a language tag (e.g. "json"), so cut at its newline.
            first_newline = stripped.find( "\n" )
            if first_newline != -1:
                stripped = stripped[ first_newline + 1 : -3 ].strip()
        return stripped
    
    def _get_pandas_prompt_template( self ):
        """Return the prompt template text; `{question}` is its only input variable."""
        pandas_prompt_template = """
        You are working with a pandas dataframe in Python. The name of the dataframe is `df`.

        This is the ouput from `print(df.head().to_csv())`, in CSV format:

        "start_date","end_date","start_time","end_time","event_type","recurrent","recurrence_interval","priority_level","name","relationship","description_who_what_where"
        "2023-07-01","2023-07-04","00:00","23:59","Concert",False,"","none","Jenny","coworker","Concert of Jenny at the city center"
        "2023-07-01","2023-07-01","05:25","17:22","TODO",False,"","highest","Gregorio","friend","Send out invitations for the party for Gregorio"
        "2023-07-01","2023-07-01","13:27","01:59","Appointment",False,"","high","Leroy Ruiz","father","Appointment with Leroy Ruiz at the clinic"
        "2023-07-03","2023-07-04","00:00","23:59","Subscription",True,"1 year","highest","Leroy Ruiz","father","Renewal of Leroy Ruiz's subscription"
        "2023-07-03","2023-07-04","00:00","23:59","Anniversary",True,"4 week","none","Juan","neighbor","Juan's anniversary celebration at the park"
        "2023-07-04","2023-07-04","00:00","23:59","Anniversary",True,"4 week","medium","Leroy Ruiz","father","Leroy Ruiz's anniversary celebration at the park"

        This is the from `print(df.dtypes)`:

        start_date                    datetime64[ns]
        end_date                      datetime64[ns]
        start_time                            object
        end_time                              object
        event_type                            object
        recurrent                               bool
        recurrence_interval                   object
        priority_level                        object
        name                                  object
        relationship                          object
        description_who_what_where            object
        dtype: object

        As you generate the python code needed to answer the question asked of you below, I want you to:

        1) Question: Ask yourself If you understand the question that I am asking you?
        2) Think: Before you do anything, think out loud about what I'm asking you to do, including what are the steps you need to take to solve this problem? What are the things you need to consider? Be critical of your thought process!
        3) Code: Generate a verbatim list of code that you used to arrive at your answer, one line of code per item on the list. The code must be complete, syntactically correct, and run to completion. The last line of your code must be the variable `solution`, which represents the answer.
        4) Return: Report on the object type of the variable `solution` in your last line of code.
        5) Explain: Briefly and succinctly explain your code in plain English.
        6) Answer: Answer the question in a short, simple, terse sentence.

        Format: return your response as a JSON object in the following fields:
        {{
            "question": "the question that your code answers",
            "thoughts": "your thoughts",
            "code": [],
            "return": "Object type of the variable `solution`",
            "explanation": "your brief explanation of your code",
            "answer": "your very succinct answer, in conversational English, one sentence only."
        }}

        Begin!

        Question: {question}
        """
        return pandas_prompt_template
    
    def _get_question_plus_coda( self, event_query ):
        """Return `event_query` wrapped in a `Question:` line followed by the three date-filtering hints."""
        question_plus_coda_template = f"""
        Question: {event_query}

        Hint: An event that I have today may have started before today and may end tomorrow or next week, so be careful how you filter on dates.
        Hint: When filtering by dates, use `pd.Timestamp( day )` to convert a Python datetime object into a Pandas `datetime64[ns]` value.
        Hint: If your code returns any records, they should include all columns in the dataframe.
        """
        return question_plus_coda_template


# Add main method
# if __name__ == "__main__":

# Try the class out: build an agent for the project's events file and ask a sample question.
# `response` receives the value returned by run_prompt(), i.e. the parsed reply dict.
agent = CalendaringAgent( path_to_df=du.get_project_root() + "/src/conf/long-term-memory/events.csv", debug=False )
response = agent.run_prompt( "What events do I have today?" )

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).


Running pandas prompt...
Running pandas prompt... Done! in 20 seconds


In [194]:
# run_prompt() returns an already-parsed dict, and json.loads() on a dict raises TypeError.
# Parse only when `response` is still raw JSON text; this also makes the cell safe to re-run.
if isinstance( response, str ):
    response = json.loads( response )
print( json.dumps( response, indent=4 ) )

{
    "question": "What events do I have today?",
    "thoughts": "To answer this question, I need to filter the dataframe to only include events that are happening today. This means that the event's start date is on or before today and the end date is on or after today. I will use the pandas Timestamp function to convert today's date into a format that can be compared with the dates in the dataframe. I will then use boolean indexing to filter the dataframe.",
    "code": [
        "import pandas as pd",
        "from datetime import datetime",
        "today = pd.Timestamp(datetime.now())",
        "solution = df[(df['start_date'] <= today) & (df['end_date'] >= today)]"
    ],
    "return": "pandas.core.frame.DataFrame",
    "explanation": "The code first imports the necessary libraries. It then gets today's date and time and converts it into a pandas Timestamp object. The dataframe is then filtered to only include rows where the start date is on or before today and the end date is on

In [193]:
# `response` may already be a parsed dict (run_prompt() returns one), and json.loads() on a dict
# raises TypeError, so parse only when it is still raw JSON text.
response_dict = json.loads( response ) if isinstance( response, str ) else response
# Display the generated code lines.
response_dict[ "code" ]

['import pandas as pd',
 'from datetime import datetime',
 'today = pd.Timestamp(datetime.now())',
 "solution = df[(df['start_date'] <= today) & (df['end_date'] >= today)]"]