In [1]:
! pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
[0m

In [2]:
%load_ext autoreload

In [3]:
%autoreload

import sys
import os
import json

# Optional Weights & Biases tracing for LangChain, currently switched off:
# os.environ[ "LANGCHAIN_WANDB_TRACING" ] = "true"
# # wandb documentation to configure wandb using env variables
# # https://docs.wandb.ai/guides/track/advanced/environment-variables
# # here we are configuring the wandb project name
# os.environ[ "WANDB_PROJECT" ] = "deepily-dataframe-agent"

# Make the project's lib directory importable so that `import util` below resolves.
# NOTE(review): hard-coded absolute path; this cell only works where the project lives at /var/genie-in-the-box
path = "/var/genie-in-the-box/src/lib"
if path not in sys.path:
    sys.path.append( path )
else:
    print( f"[{path}] already in sys.path" )

print( sys.path )

import util as du

# `path` is reused for the src root. Judging by the cell output, du.add_to_path reports the
# addition and lists the resulting sys.path -- TODO confirm against lib/util.py
path = "/var/genie-in-the-box/src"
du.add_to_path( path )

# Project modules: these imports only succeed after the sys.path changes above
import util_pandas    as up
import util_stopwatch as sw
import util_langchain as ulc
import genie_client   as gc

import pandas as pd

# Notebook-wide verbosity flags
debug = True
verbose = True


['/Applications/PyCharm.app/Contents/plugins/python/helpers-pro/jupyter_debug', '/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev', '/var/genie-in-the-box/src/notebooks', '/usr/local/lib/python310.zip', '/usr/local/lib/python3.10', '/usr/local/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/site-packages', '/var/genie-in-the-box/src/lib']
Added [/var/genie-in-the-box/src] to sys.path
pwd [/var/genie-in-the-box/src/notebooks]

/Applications/PyCharm.app/Contents/plugins/python/helpers-pro/jupyter_debug
/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev
/usr/local/lib/python3.10
/usr/local/lib/python3.10/lib-dynload
/usr/local/lib/python3.10/site-packages
/usr/local/lib/python310.zip
/var/genie-in-the-box/src
/var/genie-in-the-box/src/lib
/var/genie-in-the-box/src/notebooks


In [6]:
# Read the events CSV from the project tree and hand it straight to the datetime caster
df = up.cast_to_datetime(
    pd.read_csv( du.get_project_root() + "/src/conf/long-term-memory/events.csv" ),
    debug=debug
)


------------------------------------------------------------------------------------------------------------------------
- df.dtypes:
------------------------------------------------------------------------------------------------------------------------
start_date                    datetime64[ns]
end_date                      datetime64[ns]
start_time                            object
end_time                              object
event_type                            object
recurrent                               bool
recurrence_interval                   object
priority_level                        object
name                                  object
relationship                          object
description_who_what_where            object
dtype: object


In [None]:
import openai

In [40]:
# OpenAI chat model identifiers used by the helpers and the agent below
GPT_4   = "gpt-4-0613"
GPT_3_5 = "gpt-3.5-turbo-0613"

def ask_chat_gpt_text( preamble, query, model=GPT_4, debug=False ):
    """
    Send one system message plus one user message to the OpenAI chat completion API.

    :param preamble: Text sent as the `system` message.
    :param query:    Text sent as the `user` message.
    :param model:    Name of the chat model to call; defaults to GPT_4.
    :param debug:    When True, the raw response is pretty-printed as JSON.
    :return:         The `content` of the first choice's message, stripped of surrounding whitespace.
    """
    # The key is looked up in the environment on every call, so it never appears in the notebook
    openai.api_key = os.getenv( "FALSE_POSITIVE_API_KEY" )

    # The f-string is already fully interpolated. The `.format( model )` that used to be chained
    # onto it added nothing and would have raised on a model name containing curly braces.
    timer = sw.Stopwatch( msg=f"Asking ChatGPT [{model}]..." )

    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            { "role": "system", "content": preamble },
            { "role": "user",   "content": query }
        ],
        temperature=0,
        max_tokens=2400,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0
    )
    timer.print( use_millis=True )

    if debug: print( json.dumps( response, indent=4 ) )

    # Only the first choice is ever used
    return response[ "choices" ][ 0 ][ "message" ][ "content" ].strip()

# Smoke test: a single round trip to check that the API key and model name are accepted
preamble = "You are chatGPT, a helpful chatbot"
question = "What's your name?"

ask_chat_gpt_text( preamble, question, model=GPT_4 )

Asking ChatGPT [gpt-4-0613]...
Asking ChatGPT [gpt-4-0613]... in 1,338 ms


"My name is OpenAI's GPT-3, but you can call me ChatGPT."

In [35]:
%autoreload

import os
import json

import lib.util           as du
import lib.util_pandas    as dup
import lib.util_stopwatch as sw
import genie_client       as gc

import pandas as pd

class CalendaringAgent:
    """
    Answers natural-language questions about a calendar stored in a CSV file.

    The intended call order is:
      1. run_prompt( question ) -- asks GPT-4 for pandas code, returned as a JSON object
      2. run_code()             -- hands that code to ulc.assemble_and_run_solution
      3. format_output()        -- asks GPT-3.5 to rephrase the raw output conversationally

    Relies on notebook-level names that must exist before use: `du`, `dup`, `ulc`, `pd`, `json`,
    `ask_chat_gpt_text`, `get_token_count`, `GPT_4` and `GPT_3_5`.
    """

    def __init__( self, path_to_df, debug=False, verbose=False ):
        """
        Load the events CSV and build the system prompt used by run_prompt().

        :param path_to_df: Path to the events CSV; it is appended to `du.get_project_root()`.
        :param debug:      When True, prompts and parsed responses are printed.
        :param verbose:    Together with `debug`, also prints the output of the generated code.
        """
        self.debug                = debug
        self.verbose              = verbose
        self.path_to_df           = du.get_project_root() + path_to_df
        self.df                   = pd.read_csv( self.path_to_df )
        self.df                   = dup.cast_to_datetime( self.df )

        self.pandas_system_prompt = self._get_pandas_system_prompt()

        # Per-question state, filled in by run_prompt(), run_code() and format_output().
        # Everything is declared here so the full attribute set is visible in one place.
        self.question             = None
        self.response             = None
        self.response_dict        = None
        self.code_response        = None
        self.formatted_output     = None

    def _get_pandas_system_prompt( self ):
        """
        Build the system prompt that tells the model how to write pandas code against `df`.

        The prompt embeds the first and last three rows of the dataframe as CSV plus the
        value counts of the `event_type` column. The token count of the prompt is printed.

        :return: The prompt text.
        """
        # Header + first three rows, followed by the last three rows without a second header
        sample_rows_csv = self.df.head( 3 ).to_csv( header=True, index=False )
        sample_rows_csv = sample_rows_csv + self.df.tail( 3 ).to_csv( header=False, index=False )

        # NOTE(review): the prompt text is sent to the model verbatim and is deliberately left
        # untouched here, including its typos ("ouput", "runnning") and the trailing comma in
        # the JSON example -- changing it may change the model's answers.
        pandas_system_prompt = f"""
        You are an expert software engineer working with a pandas dataframe in Python containing calendaring and events information. The name of the dataframe is `df`.

        This is the ouput from `print(df.head().to_csv())`, in CSV format:

        {sample_rows_csv}
        
        This is the output from `print(self.df.event_type.value_counts())`:

        {self.df.event_type.value_counts()}

        As you generate the python code needed to answer the events and calendaring question asked of you below, I want you to:

        1) Question: Ask yourself if you understand the question that I am asking you.  Pay attention to the details!
        2) Think: Before you do anything, think out loud about what I'm asking you to do, including what are the steps that you will need to take to solve this problem. Be critical of your thought process!
        3) Code: Generate a verbatim list of code that you used to arrive at your answer, one line of code per item on the list. The code must be complete, syntactically correct, and capable of runnning to completion. The last line of your code must be the variable `solution`, which represents the answer. Make sure that any filtering you perform matches the question asked of you by the user!
        4) Return: Report on the object type of the variable `solution` in your last line of code. Use one word to represent the object type.
        5) Explain: Briefly and succinctly explain your code in plain English.
        
        Format: return your response as a JSON object in the following fields:
        {{
            "question": "The question, verbatim and without modification, that your code attempts to answer",
            "thoughts": "Your thoughts",
            "code": [],
            "return": "Object type of the variable `solution`",
            "explanation": "A brief explanation of your code",
        }}

        Hint: An event that I have today may have started before today and may end tomorrow or next week, so be careful how you filter on dates.
        Hint: When filtering by dates, use `pd.Timestamp( day )` to convert a Python datetime object into a Pandas `datetime64[ns]` value.
        Hint: If your solution variable is a dataframe, it should include all columns in the dataframe.
        
        Wait until you're presented with the question to begin.
        """
        if self.debug: print( pandas_system_prompt )

        # get_token_count is a notebook-level helper: it has to be defined before the agent is instantiated
        count = get_token_count( pandas_system_prompt )
        print( f"Token count for pandas_system_prompt: [{count}]" )

        return pandas_system_prompt

    def run_prompt( self, question ):
        """
        Ask GPT-4 to produce pandas code answering `question`.

        :param question: The user's calendar question.
        :return:         The model's reply parsed from JSON (also stored in `self.response_dict`).
        :raises json.JSONDecodeError: if the model's reply is not valid JSON.
        """
        # timer = sw.Stopwatch( "Running pandas prompt..." )
        self.question      = question
        self.response      = ask_chat_gpt_text( self.pandas_system_prompt, self.question, model=GPT_4 )
        self.response_dict = json.loads( self.response )
        # timer.print( "Done!" )

        if self.debug: print( json.dumps( self.response_dict, indent=4 ) )

        return self.response_dict

    def run_code( self ):
        """
        Run the code returned by the last run_prompt() call.

        :return: Whatever `ulc.assemble_and_run_solution` returns (also stored in
                 `self.code_response`); its "output" entry is read as newline-separated text.
        """
        # Let's copy it until we get the code munging right
        response_dict_copy = self.response_dict[ "code" ][ : ]

        # The return type has to come from this agent's own response: the previous bare
        # `response_dict` silently read whatever notebook global happened to carry that name.
        # NOTE(review): the CSV path is hard-coded here rather than derived from the
        # `path_to_df` given to the constructor -- confirm which one is intended.
        self.code_response = ulc.assemble_and_run_solution(
            response_dict_copy, path="/src/conf/long-term-memory/events.csv",
            solution_code_returns=self.response_dict[ "return" ], debug=False
        )
        if self.debug and self.verbose:
            du.print_banner( "Code output", prepend_nl=True )
            for line in self.code_response[ "output" ].split( "\n" ):
                print( line )

        return self.code_response

    def format_output( self ):
        """
        Ask GPT-3.5 to rephrase the output of run_code() as conversational English.

        :return: The rephrased text (also stored in `self.formatted_output`).
        """
        preamble     = self.get_formatting_preamble()
        instructions = self.get_formatting_instructions()

        # NOTE(review): debug is hard-coded to True here instead of following self.debug
        self.formatted_output = ask_chat_gpt_text( preamble, instructions, model=GPT_3_5, debug=True )

        return self.formatted_output

    def get_formatting_preamble( self ):
        """
        Build the system message for the formatting request.

        Each line of `self.code_response[ "output" ]` is prefixed with a 1-based line number.

        :return: The preamble text.
        """
        rows      = self.code_response[ "output" ].split( "\n" )
        row_count = len( rows )

        # Number the rows starting at 1
        lines = "\n".join( f"{line_number}) {row}" for line_number, row in enumerate( rows, start=1 ) )

        preamble = f"""
        You are an expert in converting raw data into conversational English.
        
        The following {row_count} rows of JSONL formatted data are the output from a query on a pandas dataframe about events on my calendar. 
        
        JSONL output:
        
        {lines}
        """
        return preamble

    def get_formatting_instructions( self ):
        """
        Build the user message for the formatting request, quoting the question being answered.

        :return: The instructions text.
        """
        instructions = f"""
        Reformat and rephrase the JSONL data that I just showed you in conversational English so that it answers this question: `{self.question}`
        
        Use this format: "You have a two hour lunch date with your friend Bob at noon today at Burgerland.
        
        Each line of the output that you create should contain one event.
        There is no need to discuss priority of events.
        If the query returned zero rows of JSONL data, then say "You have no EVENT_TYPE events today/tomorrow/this week, etc."
        """
        return instructions

In [37]:
! pip install tiktoken

[0m

In [36]:
# Build the agent from the events CSV; with debug=True the generated system prompt is printed
agent         = CalendaringAgent( path_to_df="/src/conf/long-term-memory/events.csv", debug=True )

# Ask the model for pandas code answering this question; the reply comes back parsed from JSON
response_dict = agent.run_prompt( "What todo items do I have on my calendar for this week?" )
# print( response_dict )


        You are an expert software engineer working with a pandas dataframe in Python containing calendaring and events information. The name of the dataframe is `df`.

        This is the ouput from `print(df.head().to_csv())`, in CSV format:

        start_date,end_date,start_time,end_time,event_type,recurrent,recurrence_interval,priority_level,name,relationship,description_who_what_where
2023-08-01,2023-08-04,00:00,23:59,Concert,False,,none,Jenny,coworker,Concert of Jenny at the city center
2023-08-01,2023-08-01,05:25,17:22,TODO,False,,highest,Gregorio,friend,Send out invitations for the party for Gregorio
2023-08-01,2023-08-01,13:27,01:59,Appointment,False,,high,Leroy Ruiz,father,Appointment with Leroy Ruiz at the clinic
2023-10-01,2023-10-01,00:00,23:59,Birthday,True,3 day,low,Bob,brother,Bob's birthday party at their favorite bar
2023-10-01,2023-10-01,00:00,23:59,Anniversary,True,3 week,highest,Tom Ruiz,brother,Tom Ruiz's anniversary celebration at the park
2023-10-01,2023-10-04

In [38]:
import tiktoken

def get_token_count( to_be_tokenized, model=GPT_4 ):
    """
    Count the tokens in a piece of text using the tiktoken encoding registered for `model`.

    :param to_be_tokenized: Text to tokenize.
    :param model:           Model name used to look up the encoding; defaults to GPT_4.
    :return:                Number of tokens in the encoded text.
    """
    tokenizer = tiktoken.encoding_for_model( model )
    return len( tokenizer.encode( to_be_tokenized ) )

# Size of the agent's system prompt, in tokens
get_token_count( agent.pandas_system_prompt )

856

In [41]:
%autoreload
# Run the code returned by the model and display its raw output string
code_response = agent.run_code()
code_response[ "output" ]

'{"start_date":1695513600000,"end_date":1696032000000,"start_time":"12:01","end_time":"04:55","event_type":"TODO","recurrent":false,"recurrence_interval":null,"priority_level":"medium","name":"Juan","relationship":"neighbor","description_who_what_where":"Prepare presentation for next week\'s meeting for Juan (neighbor)"}\n{"start_date":1695772800000,"end_date":1695772800000,"start_time":"18:55","end_time":"05:58","event_type":"TODO","recurrent":false,"recurrence_interval":null,"priority_level":"low","name":"Pablo","relationship":"friend","description_who_what_where":"Prepare presentation for next week\'s meeting for Pablo"}\n{"start_date":1695945600000,"end_date":1696464000000,"start_time":"16:58","end_time":"13:49","event_type":"TODO","recurrent":false,"recurrence_interval":null,"priority_level":"medium","name":"Juan","relationship":"neighbor","description_who_what_where":"Renew gym membership for Juan (neighbor)"}'

In [42]:
%autoreload
# Show the formatting preamble exactly as it will be sent to the model
print( agent.get_formatting_preamble() )


        You are an expert in converting raw data into conversational English.
        
        The following 3 rows of JSONL formatted data are the output from a query on a pandas dataframe about events on my calendar. 
        
        JSONL output:
        
        1) {"start_date":1695513600000,"end_date":1696032000000,"start_time":"12:01","end_time":"04:55","event_type":"TODO","recurrent":false,"recurrence_interval":null,"priority_level":"medium","name":"Juan","relationship":"neighbor","description_who_what_where":"Prepare presentation for next week's meeting for Juan (neighbor)"}
2) {"start_date":1695772800000,"end_date":1695772800000,"start_time":"18:55","end_time":"05:58","event_type":"TODO","recurrent":false,"recurrence_interval":null,"priority_level":"low","name":"Pablo","relationship":"friend","description_who_what_where":"Prepare presentation for next week's meeting for Pablo"}
3) {"start_date":1695945600000,"end_date":1696464000000,"start_time":"16:58","end_time":"13:49","

In [43]:
# Show the formatting instructions exactly as they will be sent to the model
print( agent.get_formatting_instructions() )


        Reformat and rephrase the JSONL data that I just showed you in conversational English so that it answers this question: `What todo items do I have on my calendar for this week?`
        
        Use this format: "You have a two hour lunch date with your friend Bob at noon today at Burgerland.
        
        Each line of the output that you create should contain one event.
        There is no need to discuss priority of events.
        If the query returned zero rows of JSONL data, then say "You have no EVENT_TYPE events today/tomorrow/this week, etc."
        


In [44]:
# Ask the model to rephrase the raw code output as conversational English
agent.format_output()

Asking ChatGPT [gpt-3.5-turbo-0613]...
Asking ChatGPT [gpt-3.5-turbo-0613]... in 6,304 ms


"You have a medium priority task to prepare a presentation for next week's meeting with your neighbor Juan. This task is scheduled from 12:01 PM to 4:55 PM today.\n\nYou have a low priority task to prepare a presentation for next week's meeting with your friend Pablo. This task is scheduled from 6:55 PM today to 5:58 AM tomorrow.\n\nYou have a medium priority task to renew your gym membership for your neighbor Juan. This task is scheduled from 4:58 PM today to 1:49 PM on the day after tomorrow."

In [45]:
# Print the rephrased answer with its line breaks rendered
print( agent.formatted_output )

You have a medium priority task to prepare a presentation for next week's meeting with your neighbor Juan. This task is scheduled from 12:01 PM to 4:55 PM today.

You have a low priority task to prepare a presentation for next week's meeting with your friend Pablo. This task is scheduled from 6:55 PM today to 5:58 AM tomorrow.

You have a medium priority task to renew your gym membership for your neighbor Juan. This task is scheduled from 4:58 PM today to 1:49 PM on the day after tomorrow.
