# Test BLOOM LLM

In [1]:
cd ..

/home/p300488/cognitive_robotics_LLM_planning/cognitive_robotics_LLM_based_manipulation


In [18]:
# helpers
import time
from pygments import highlight
from pygments.lexers import PythonLexer
from pygments.formatters import HtmlFormatter
from IPython.display import display, HTML

def pprint(s):
    """Display the string ``s`` in the notebook as Python-highlighted HTML."""
    # Pygments renders the text as a full HTML document, which IPython then shows inline.
    rendered = highlight(s, PythonLexer(), HtmlFormatter(full=True))
    return display(HTML(rendered))

In [19]:
from getpass import getpass

# Hugging Face Hub access token: create your own on the Hugging Face Hub.
# getpass reads it without echoing, so the token never appears in the notebook.
hugging_hub_token = getpass("Hugging Face Hub token: ")

 ········


In [20]:
from huggingface_hub import InferenceApi

# Inference API client for the "bigscience/bloom" model repo, authenticated with hugging_hub_token.
llm_inference = InferenceApi(repo_id="bigscience/bloom", token=hugging_hub_token)

def BLOOM(
    query,
    prompt,
    stop_tokens=None,
    max_length=128,
    temperature=1.,
    return_full_text=False,
    verbose=False,
    client=None,
):
    """Send ``prompt`` followed by ``query`` to the inference client and return the completion.

    The text sent to the model is the prompt, a newline, the query and a
    trailing newline.

    Args:
        query: Text appended after the prompt.
        prompt: Few-shot prompt placed before the query.
        stop_tokens: Optional iterable of strings. The generated text is cut
            at the earliest occurrence of any of them. ``None`` disables
            truncation.
        max_length: Forwarded to the API as ``max_new_tokens``.
        temperature: Forwarded to the API as ``temperature``.
        return_full_text: Forwarded to the API as ``return_full_text``.
        verbose: If true, print the request duration and a note when stop
            tokens are being applied.
        client: Optional callable invoked as ``client(text, params=params)``.
            Defaults to the notebook-level ``llm_inference`` client.

    Returns:
        The ``generated_text`` field of the first item in the response,
        truncated at the earliest stop token if one is found.

    Raises:
        ValueError: If the client returns a dict instead of a list, which is
            treated as an error payload; the message includes its ``error``
            entry when present.
    """
    if client is None:
        # Resolved at call time so the notebook-level client can be re-created freely.
        client = llm_inference

    new_prompt = f'{prompt}\n{query}\n'

    params = {
        "max_new_tokens": max_length,
        "top_k": None,
        "top_p": None,
        "temperature": temperature,
        "do_sample": False,
        # Marked "useless" by the original author; presumably ignored because
        # sampling is disabled above -- TODO confirm.
        "seed": 42,
        "early_stopping": None,
        "no_repeat_ngram_size": None,
        "num_beams": None,
        "return_full_text": return_full_text,
        'wait_for_model': True
    }

    # perf_counter is monotonic, so the measured duration cannot go negative.
    started = time.perf_counter()
    response = client(new_prompt, params=params)
    proc_time = time.perf_counter() - started
    if verbose:
        print(f"Inference time {proc_time} seconds")

    if isinstance(response, dict):
        # A dict response is treated as an error payload. Surface its message
        # instead of asserting on the exact key set, which hid the real cause.
        detail = response.get('error', response)
        raise ValueError(f'sth went wrong with prompt {new_prompt}: {detail}')

    response = response[0]['generated_text']

    if stop_tokens is not None:
        if verbose:
            print('Stopping')
        # Cut at the earliest stop token, independent of the order in which
        # the tokens were listed.
        cut_points = [response.index(stoken) for stoken in stop_tokens if stoken in response]
        if cut_points:
            response = response[:min(cut_points)]

    return response

### Socratic style (via detection)

The following is the standard prompt as presented in [this work](https://socraticmodels.github.io/). The pipeline first runs an object detector to identify all objects in the scene, puts them in a string like `objects = [obj1, obj2, ...]`, and feeds that string as context to the LLM. Every time an `objects = [...]` line is passed, the LLM should understand that it marks the beginning of a new scene.


Then the LLM calls the robot primitives to perform the action. From the language instruction in the prompt (given as a `#` comment), the LLM can infer which objects it should pick and where to place them, and it returns a program-like plan for manipulation.

In [23]:
# Few-shot prompt for the detection-based style. Each example scene starts with
# an `objects = [...]` line, followed by `#` instructions and the
# `robot.pick_and_place(...)` calls that carry them out.
# .strip() drops the newlines just inside the triple quotes.
prompt_pick_and_place_detection = """
objects = ["scissors", "pear", "hammer", "mustard bottle", "tray"]
# put the bottle to the left side.
robot.pick_and_place("mustard bottle", "left side")

objects = ["banana", "foam brick", "strawberry", "tomato soup can", "pear", "tray"]
# move the fruit to the bottom right corner.
robot.pick_and_place("banana", "bottom right corner")
robot.pick_and_place("pear", "bottom right corner")
robot.pick_and_place("strawberry", "bottom right corner")
# now put the green one in the top side.
robot.pick_and_place("pear", "top side")
# undo the last step.
robot.pick_and_place("pear", "bottom right corner")

objects = ["potted meat can", "power drill", "chips can", "hammer", "tomato soup can", "tray"]
# put all cans in the tray.
robot.pick_and_place("potted meat can", "tray")
robot.pick_and_place("chips can", "tray")
robot.pick_and_place("tomato soup can", "tray")
""".strip()

# Show the prompt with Python syntax highlighting.
pprint(prompt_pick_and_place_detection)

In [24]:
# Examples: one detected scene, several independent instructions.
context = 'objects = ["banana", "smartphone", "strawberry", "avocado", "tomato soup can", "mustard bottle", "laptop", "tray"]'

queries = [
    'move the tomato to the middle.',
    'move the bottle to the tray.',
    'put the electronic devices in the tray.',
    'put the green fruit to the top side',
    'get the canned food products to the bottom left corner',
]

for q in queries:
    # Scene line first, then the instruction as a "#" comment, mirroring the few-shot prompt.
    q = f'\n{context}\n# {q}'
    resp = BLOOM(
        q,
        prompt_pick_and_place_detection,
        max_length=128,
        stop_tokens=['#', 'objects = ['],
    )
    resp = resp.strip()
    pprint(f'{q}\n{resp}')
    print('--' * 48)

------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


Apparently avocado is a canned food product :P However, you see the generalization capabilities of the LLM here to determine what to manipulate.

### Code style (via grounding)

The approach above requires detecting and recognizing all objects in the scene, which is not strictly necessary and can lead to errors if objects are misclassified. You can use another prompt structure to make the LLM call CLIP to do the grounding on any given scene, and then manipulate. The LLM output then looks more like Python code with variable assignments, instead of a simple sequence of `pick_and_place` calls. The `objects = scene_init()` line corresponds to calling your segmenter (either ground truth from PyBullet or another method) to get a list of all object masks, without any category or language information. CLIP then grounds the correct object from the language instruction via the `find` primitive.

In [60]:
# Few-shot prompt for the grounding-based style. Each example scene starts with
# `objects = scene_init()`, and objects are selected with `find(...)` before
# being passed to `pick_and_place(...)`.
# The "undo the last step" example must move the same object as the previous
# step (`red_fruit`); it previously referenced an undefined `green_fruit`.
# .strip() drops the newlines just inside the triple quotes.
prompt_pick_and_place_grounding = """
from robot_utils import pick_and_place
from camera_utils import find, scene_init

objects = scene_init()
# put the bottle to the left side.
bottle = find(objects, "bottle")[0]
pick_and_place(bottle, "left side")

objects = scene_init()
# move the fruit to the bottom right corner.
fruits = find(objects, "fruit")
for fruit_instance in fruits:
	pick_and_place(fruit_instance, "bottom right corner")
# now put the red one in the right side.
red_fruit = find(fruits, "red")[0]
pick_and_place(red_fruit, "right side")
# undo the last step.
pick_and_place(red_fruit, "bottom right corner")

objects = scene_init()
# put all cans in the tray.
cans = find(objects, "can")
for can_instance in cans:
	pick_and_place(can_instance, "tray")
""".strip()

# Show the grounding prompt with Python syntax highlighting.
pprint(prompt_pick_and_place_grounding)

In [61]:
# Examples: the same instructions, but the scene is introduced via scene_init()
# instead of an explicit list of detected object names.
queries = [
    'move the tomato to the middle.',
    'move the bottle to the tray.',
    'put the electronic devices in the tray.',
    'put the green fruit to the top side',
    'get the canned food products to the bottom left corner',
]

for q in queries:
    # New-scene line first, then the instruction as a "#" comment.
    q = f"\nobjects = scene_init()\n# {q}"
    resp = BLOOM(
        q,
        prompt_pick_and_place_grounding,
        max_length=128,
        stop_tokens=['#', 'objects = '],
    )
    resp = resp.strip()
    pprint(f'{q}\n{resp}')
    print('--' * 48)

------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


### Add chat history

By adding query-response history you can interact with the LLM-based robot agent like with a chatbot.

In [63]:
# Examples: a multi-turn session where each instruction sees all earlier
# instructions and the model's answers to them.
context = 'objects = ["banana", "apple", "hammer", "potted meat can", "tomato soup can", "tray"]'

queries = [
    'move the banana to the middle.',
    'now put the other fruit in the top right corner.',
    'put the cans in the same corner as before.',
    'now put the red one in the opposite corner.',
    'wait, undo the last step.',
]

# The history starts with the scene line and grows by one instruction/answer pair per turn.
history = f'\n{context}'
for q in queries:
    qq = f'{history}\n# {q}'
    resp = BLOOM(
        qq,
        prompt_pick_and_place_detection,
        max_length=128,
        stop_tokens=['#', 'objects = ['],
    )
    resp = resp.strip()
    pprint(f'{qq}\n{resp}')
    print('--' * 48)
    # qq already holds the history plus this instruction, so appending the answer completes the turn.
    history = f'{qq}\n{resp}'

------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------
