In [1]:
import io
import json
import os
from os import walk

import openai
import pandas as pd

# Display options for this session: allow up to 512 characters per output line and show at most 6 columns per frame
pd.set_option( "display.width", 512)
pd.set_option( "display.max_columns", 6 )
# import sys
# import datetime as dt

In [2]:
import datetime as dt
import os

# from lib import util as du


class Stopwatch:

    """
    Simple wall-clock timer that starts counting when the object is instantiated
    """

    def __init__( self ):

        # Moment the stopwatch was started; every report is measured against it
        self.start_time = dt.datetime.now()

    def print( self, msg=None, prepend_nl=False, end="\n\n", use_millis=False ):

        """
        Prints time elapsed since instantiation

        If a minute or more has passed it uses "mm:ss" format (the minutes field keeps growing past 59, it never rolls over into hours).  Otherwise, it just prints whole seconds

        ¡OJO!/NOTE: This is fairly simpleminded, it's probably more accurate to use timeit

        :param msg: Text to output before the elapsed time is reported, defaults to "Finished"

        :param prepend_nl: Insert a new line before printing to the console, defaults to False

        :param end: Optional text to append to the end of the output, similar to how print works in the standard library.  Defaults to two newlines

        :param use_millis: Report elapsed time in milliseconds, takes precedence over the other formats.  Defaults to False

        :return: None, prints to console only
        """

        # Take a single reading so that every format below reports the same instant
        delta = dt.datetime.now() - self.start_time

        # total_seconds() covers the whole interval; the .seconds attribute only holds the part left over after
        # whole days have been removed, which silently under-reports anything that runs longer than 24 hours
        seconds = int( delta.total_seconds() )

        # check msg argument
        if msg is None: msg = "Finished"

        # preformat output
        if prepend_nl: print()

        if use_millis:

            # From: https://stackoverflow.com/questions/766335/python-speed-testing-time-difference-milliseconds
            millis = int( delta.total_seconds() * 1000 )

            print( "{0} in {1:,} ms".format( msg, millis ), end=end )

        elif seconds > 59:

            # From: https://stackoverflow.com/questions/775049/how-do-i-convert-seconds-to-hours-minutes-and-seconds
            minutes, seconds = divmod( seconds, 60 )
            print( "{0} in {1:02d}:{2:02d}".format( msg, minutes, seconds ), end=end )

        else:
            print( "{0} in {1:,} seconds".format( msg, seconds ), end=end )

    def get_delta( self ):

        """
        Calculate the delta between now and when this object was instantiated

        :return: Time delta in milliseconds, truncated to an int
        """

        delta = dt.datetime.now() - self.start_time
        millis = int( delta.total_seconds() * 1000 )

        return millis


In [3]:
# Smoke test: start a stopwatch and immediately report the elapsed time in milliseconds
timer = Stopwatch()
timer.print( "Finished doing foo", use_millis=True )

Finished doing foo in 0 ms



In [16]:
# MODEL = "gpt-3.5-turbo"

def ask_chat_gpt_using_raw_prompt_and_content( prompt_and_content, debug=False, model="gpt-3.5-turbo-0613" ):

    """
    Sends a combined prompt + content string to the OpenAI chat completions endpoint and returns the reply text

    The input is split on ``` fences: the text before the first fence becomes the system message, the text between
    the first and second fence becomes the user message.  Anything after the second fence is ignored.

    :param prompt_and_content: Instructions followed by the content to operate on, wrapped in ``` fences

    :param debug: When True, prints the extracted prompt, the extracted content and the raw response object

    :param model: Name of the chat model to query, defaults to "gpt-3.5-turbo-0613"

    :return: Text of the first choice in the response, stripped of leading and trailing whitespace

    :raises IndexError: If prompt_and_content contains no ``` fence
    """

    api_key        = os.getenv( "FALSE_POSITIVE_API_KEY" )
    openai.api_key = api_key

    # Never echo the secret itself: cell output is stored inside the notebook and is easily shared or committed
    print( "Using FALSE_POSITIVE_API_KEY [{}]".format( "set" if api_key else "NOT SET" ) )

    # Split once and reuse the pieces
    sections = prompt_and_content.split( "```" )
    prompt   = sections[ 0 ].strip()
    content  = sections[ 1 ].strip()

    if debug:
        print( " prompt [{}]".format( prompt ) )
        print( "content [{}]".format( content ) )

    response = openai.chat.completions.create(
        model=model,
        messages=[
            { "role": "system", "content": prompt },
            { "role": "user", "content": content }
        ],
        # From: https://community.openai.com/t/cheat-sheet-mastering-temperature-and-top-p-in-chatgpt-api-a-few-tips-and-tricks-on-controlling-the-creativity-deterministic-output-of-prompt-responses/172683
        # Using creative writing values
        temperature=0.5,
        top_p=0.5,
        max_tokens=3400,
        # From: https://community.openai.com/t/difference-between-frequency-and-presence-penalties/2777/2
        # Author's observation: non-zero penalty values (e.g. 0.5) broke the JSON formatting of the reply,
        # so both penalties are left at zero
        frequency_penalty=0.0,
        presence_penalty=0.0
    )
    if debug: print( response )

    # The openai>=1.0 client (the one exposing openai.chat.completions) returns a typed object rather than a
    # dict, so the fields must be read as attributes; subscripting it raises a TypeError
    return response.choices[ 0 ].message.content.strip()



In [22]:
# Directory that is searched for prompt files
path = "/var/genie-in-the-box/src/prompts"

def get_prompt_paths( path ):

    """
    Lists the synthetic data prompt files that live directly inside a directory

    Only regular files whose name starts with "synthetic-data-" and ends with ".txt" are returned, subdirectories
    are not searched.

    :param path: Directory to look in

    :return: Sorted list of full paths, built from path itself.  Empty when the directory does not exist or contains no matching file
    """

    # From: https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
    # walk() yields nothing for a directory it cannot read, hence the default with an empty file list
    filenames = next( walk( path ), ( None, None, [ ] ) )[ 2 ]

    # Join against the directory that was actually searched instead of a hard-coded prefix
    return sorted(
        os.path.join( path, filename )
        for filename in filenames
        if filename.startswith( "synthetic-data-" ) and filename.endswith( ".txt" )
    )

# Collect the prompt files found in the directory and show them as the cell's output
paths = get_prompt_paths( path )
paths

['/var/genie-in-the-box/src/prompts/synthetic-data-generation-synonymous-load-url-in-current-tab.txt',
 '/var/genie-in-the-box/src/prompts/synthetic-data-generation-synonymous-load_url_in_new_tab.txt',
 '/var/genie-in-the-box/src/prompts/synthetic-data-generation-synonymous-search_current_tab.txt',
 '/var/genie-in-the-box/src/prompts/synthetic-data-generation-synonymous-search_google_current_tab.txt',
 '/var/genie-in-the-box/src/prompts/synthetic-data-generation-synonymous-search_google_new_tab.txt',
 '/var/genie-in-the-box/src/prompts/synthetic-data-generation-synonymous-search_google_scholar_new_tab.txt',
 '/var/genie-in-the-box/src/prompts/synthetic-data-generation-synonymous-search_new_tab.txt']

In [23]:
def write_to_temp_file( data, temp_file="/tmp/foo.json" ):

    """
    Serialises data as JSON, indented by 4 spaces, into a file, overwriting whatever the file contained before

    :param data: Any object that json.dump can serialise

    :param temp_file: Path of the file to write, defaults to "/tmp/foo.json"

    :return: The path that was written
    """

    with open( temp_file, "w" ) as outfile:
        json.dump( data, outfile, indent=4 )

    return temp_file

In [24]:
def convert_jsons_to_df( json_str ):

    """
    Converts a JSON document held in a string into a data frame

    :param json_str: JSON text, e.g. a list of objects that all share the same keys

    :return: Data frame produced by pandas.read_json

    :raises json.JSONDecodeError: If json_str is not valid JSON
    """

    # Parse first so that malformed input fails early with a JSON error, then re-serialise the parsed document
    normalized = json.dumps( json.loads( json_str ) )

    # Hand pandas an in-memory buffer: going through one fixed file on disk costs I/O for nothing and lets
    # two kernels running at the same time overwrite each other's data
    return pd.read_json( io.StringIO( normalized ) )

In [25]:
def get_system_and_voice_commands( json_str ):

    """
    Loads json_str into a data frame and extracts the system and voice commands it contains

    :param json_str: JSON text whose records carry a "system_command" and a "voice_command" field

    :return: Tuple of ( system command, voice command ), in each case the first distinct value found in its column
    """

    commands_df = convert_jsons_to_df( json_str )

    # Same lookup for both columns: first entry among the distinct values, in order of appearance
    first_distinct = [
        commands_df[ column ].unique()[ 0 ]
        for column in ( "system_command", "voice_command" )
    ]

    return first_distinct[ 0 ], first_distinct[ 1 ]


In [18]:
def get_synthetic_data( paths, max_prompts=1 ):

    """
    Reads prompt files, sends each one to the chat model and collects what came back

    For every processed file the prompt text is read, the system and voice commands are extracted from the JSON
    between the first pair of ``` fences, and the whole prompt is sent to the model.  Progress, timing and the raw
    response are printed along the way.

    :param paths: List of prompt file paths

    :param max_prompts: How many paths to process, counted from the start of the list.  Defaults to 1, which keeps the number of model calls small; pass None to process every path

    :return: Tuple of three lists, ( prompts, commands, responses ), one entry per processed path.  Commands are ( system_command, voice_command ) tuples, responses are the model's replies with all ``` fences removed
    """

    responses = [ ]
    prompts   = [ ]
    commands  = [ ]

    # A slice bound of None means "no limit", so max_prompts=None walks the whole list
    for path in paths[ :max_prompts ]:

        with open( path, "r" ) as f:
            prompt = f.read()

        prompts.append( prompt )

        print( "prompt [{}]".format( prompt ) )

        # The JSON example embedded in the prompt sits between the first pair of fences
        system_command, voice_command = get_system_and_voice_commands( prompt.split( "```" )[ 1 ] )
        commands.append( ( system_command, voice_command ) )

        print( "system_command [{}]".format( system_command ) )
        print( "voice_command  [{}]".format( voice_command  ) )

        timer = Stopwatch()
        # Strip the fences from the reply so that what is left can be parsed as JSON later on
        response = ask_chat_gpt_using_raw_prompt_and_content( prompt, debug=True ).replace( "```", "" )
        timer.print( "Finished asking [{}]".format( "GPT 3.5 turbo" ), use_millis=True )

        print( response )
        responses.append( response )

    return prompts, commands, responses

# Query the model with the discovered prompt files; as called here only the first path in the list is processed
prompts, commands, responses = get_synthetic_data( paths )

prompt [you are an UX expert giving explicit commands to a web browser that understands human speech.

generate 10 random and diverse browser commands in sentence form that are unambiguously synonymous with the voice_command below delimited by three * symbols.
create a randomly generated domain name for each sentence.

return the results in json format, using the following descriptions for each field in the json output:

- "id": indicates the ordinal position of the sentence, the first sentence should have an id of 1, the second sentence should have an id of 2, etc.
- "system_command": this is a constant value, use the system_command below delimited by three _ symbols, DO NOT CHANGE THIS VALUE.
- "voice_command": this is a constant value, use the voice_command below delimited by three * symbols, DO NOT CHANGE THIS VALUE.
- "synonymous_command": contains the random and diversely generated sentences. this is the only text field that should vary in the json output. this field MUST vary si

In [27]:
# responses[ 0 ]
# responses[ 0 ] = responses[ 0 ].replace( "```", "" )
# json.loads( responses[ 0 ].replace( "```", "" ) )

# Parse every raw response into a data frame, keep the frames in a list and print each one as it is produced
response_dfs = [ ]

for response in responses:
    df = convert_jsons_to_df( response )
    response_dfs.append( df )
    print( df )

   id       system_command                     voice_command                                 synonymous_command
0   1  open in current tab  load this url in the current tab            open www.example.com in the current tab
1   2  open in current tab  load this url in the current tab                      go to www.foo.com in this tab
2   3  open in current tab  load this url in the current tab               visit www.bar.com in the current tab
3   4  open in current tab  load this url in the current tab                           view baz.org in this tab
4   5  open in current tab  load this url in the current tab       please take me to baz.org in the current tab
5   6  open in current tab  load this url in the current tab                          load blah.org in this tab
6   7  open in current tab  load this url in the current tab  navigate to www.randomsite.com in the current tab
7   8  open in current tab  load this url in the current tab                access www.testsite.com in t

In [17]:
# Hand-written sample in the same shape as a prompt file: free text followed by a JSON list wrapped in ``` fences
foo = """
blah blah blah blah blah blah and furthermore blah!
```
[
    {
        "id": 2,
        "system_command": "open in current tab",
        "voice_command": "load this url in the current tab",
        "synonymous_command": "open www.foo.com in this tab please"
    },
    {
        "id": 3,
        "system_command": "open in current tab",
        "voice_command": "load this url in the current tab",
        "synonymous_command": "visit www.bar.com"
    },
    {
        "id": 4,
        "system_command": "open in current tab",
        "voice_command": "load this url in the current tab",
        "synonymous_command": "view baz.org in current tab"
    },
    {
        "id": 5,
        "system_command": "open in current tab",
        "voice_command": "load this url in the current tab",
        "synonymous_command": "please take me to baz.org in this tab"
    }
]
```
"""

In [23]:
# Take the text between the first pair of ``` fences in the sample and load it into a data frame
json_str = foo.split( "```" )[ 1 ].strip()
df       = convert_jsons_to_df( json_str )
df

Unnamed: 0,id,system_command,voice_command,synonymous_command
0,2,open in current tab,load this url in the current tab,open www.foo.com in this tab please
1,3,open in current tab,load this url in the current tab,visit www.bar.com
2,4,open in current tab,load this url in the current tab,view baz.org in current tab
3,5,open in current tab,load this url in the current tab,please take me to baz.org in this tab


In [24]:
# Extract the ( system_command, voice_command ) pair from the sample JSON and show it as the cell's output
system_command, voice_command = get_system_and_voice_commands( json_str )
system_command, voice_command

('open in current tab', 'load this url in the current tab')