In [1]:
import json
import sng_parser
from pprint import pprint

In [2]:
annotations = json.load(open('mapping.json', 'r'))
objects = json.load(open('objects.json', 'r'))

In [3]:
annotations['001YG']['000089']

{'bbox': [[222.1, 143.83, 479.88, 244.94], [24.3, 71.44, 259.24, 268.2]],
 'bbox_labels': ['table', 'person'],
 'rel_pairs': [['person', 'table']],
 'rel_labels': ['in_front_of'],
 'actions': ['sit at a table']}

In [4]:
phrase = objects['001YG']['000089']

In [5]:
phrase['labels']

['laptop',
 'table',
 'chair',
 'phone',
 'blanket towel',
 'pillow',
 'bottle',
 'bottle',
 'pillow',
 'box book',
 'person',
 'television',
 'sofa couch',
 'clothes towel',
 'pillow',
 '##knob',
 'bed']

In [6]:
all_obj = set("person,broom,picture,closet,cabinet,blanket,window,table,paper,notebook,refrigerator,pillow,cup,glass,bottle,shelf,shoe,medicine,phone,camera,box,sandwich,book,bed,clothes,mirror,sofa,couch,floor,bag,dish,laptop,door,towel,food,chair,doorknob,doorway,groceries,hands,light,vacuum,television".split(','))

In [7]:
filtered = set()
for item in phrase['labels']:
    arr = item.split(' ')
    if arr[0] in all_obj:
        filtered.add(arr[0])
filtered

{'bed',
 'blanket',
 'bottle',
 'box',
 'chair',
 'clothes',
 'laptop',
 'person',
 'phone',
 'pillow',
 'sofa',
 'table',
 'television'}

In [8]:
llava_prompt_1 = "Find important positional relations among only these objects: " + ','.join(filtered)
llava_prompt_1

'Find important positional relations among only these objects: box,pillow,blanket,table,person,bed,bottle,clothes,phone,sofa,television,chair,laptop'

In [9]:
llava_return_1 = "In the image, the person is sitting on a chair in front of a table, which has a laptop on it. The table is located next to a bed, and there is a bottle on the table as well. A sofa is also present in the room, and a television is positioned nearby. A phone is placed on the table, and a blanket and clothes are located on the bed. The person is wearing a blue shirt, and there is a computer mouse on the table."
llava_return_1

'In the image, the person is sitting on a chair in front of a table, which has a laptop on it. The table is located next to a bed, and there is a bottle on the table as well. A sofa is also present in the room, and a television is positioned nearby. A phone is placed on the table, and a blanket and clothes are located on the bed. The person is wearing a blue shirt, and there is a computer mouse on the table.'

In [None]:
llava_return = [("person", "sitting", "chair"), ("person", "in front of", "table"), ("table", "on", "floor")]

In [None]:
Sure, here are the tuples of relations for the given input text:

[
("person", "sitting", "chair"),
("table", "in_front_of", "person"),
("laptop", "on", "table"),
("table", "next_to", "bed"),
("bed", "near", "sofa"),
("television", "positioned", "nearby"),
("phone", "on", "table"),
("blanket", "on", "bed"),
("clothes", "on", "bed"),
("person", "wearing", "blue_shirt"),
("computer_mouse", "on", "table")
]

Note that I've used the relation "next_to" instead of "beside" since it's not in the allowed relation array. Also, I've ignored the word "also" in the sentence "A sofa is also present in the room" since it doesn't contribute to any relevant relation.

In [None]:
Generate tuples [Entity, Relation, Entity] from quoted text which is describing the image. The tuples show most important relations between two entities in the sentences. 

Requirements:
1. The relation are limited in following array: [on, behind, in_front_of, on_the_side_of, above, beneath, drinking_from, have_it_on_the_back, wearing, holding, lying_on, covered_by, carrying, eating, leaning_on, sitting_on, twisting, writing_on, standing_on, touching, wiping, at, under, near] You should only use relations I give you. If the relation is not in the array, then discard the tuple or try to find relation with similar meaning.
2. Do not infer or assume relations. Only depend on sentences themself.
3. Use a valid Json format to form the output. Start and end with curly brackets.
4. The relation should make sense.
5. The output should contain at most 8 tuples

For example:

Input test: "In the image, the woman sits on the bed near a pillow".
Output: {[["woman", "on", "bed"],
        ["woman", "near", "pillow"]]}

Now, generate tuples of relation for me with this input text:

"In the image, the person is sitting on a chair in front of a table, which has a laptop on it. The table is located next to a bed, and there is a bottle on the table as well. A sofa is also present in the room, and a television is positioned nearby. A phone is placed on the table, and a blanket and clothes are located on the bed. The person is wearing a blue shirt, and there is a computer mouse on the table."

In [17]:
llama2_return = '''Sure! Here are the tuples of relations for the given input text:

{
[["person", "sitting", "chair"],
["table", "in_front_of", "person"],
["laptop", "on", "table"],
["table", "next_to", "bed"],
["bed", "near", "sofa"],
["television", "positioned", "nearby"],
["phone", "on", "table"],
["blanket", "on", "bed"],
["clothes", "on", "bed"],
["person", "wearing", "blue_shirt"],
["computer_mouse", "on", "table"]
]
}

Note that I've used the following relations from your array:

"sitting" instead of "on" to indicate that the person is sitting on the chair
"in_front_of" instead of "on" to indicate that the table is in front of the person
"next_to" instead of "on" to indicate that the table is located next to the bed
"'''

In [20]:
jsonstart = llama2_return.index('{')
jsonend = llama2_return.index('}')
jsonstr = llama2_return[jsonstart: jsonend].strip("{}")
jsonstr

'\n[["person", "sitting", "chair"],\n["table", "in_front_of", "person"],\n["laptop", "on", "table"],\n["table", "next_to", "bed"],\n["bed", "near", "sofa"],\n["television", "positioned", "nearby"],\n["phone", "on", "table"],\n["blanket", "on", "bed"],\n["clothes", "on", "bed"],\n["person", "wearing", "blue_shirt"],\n["computer_mouse", "on", "table"]\n]\n'

In [21]:
res = json.loads(jsonstr)

In [22]:
res

[['person', 'sitting', 'chair'],
 ['table', 'in_front_of', 'person'],
 ['laptop', 'on', 'table'],
 ['table', 'next_to', 'bed'],
 ['bed', 'near', 'sofa'],
 ['television', 'positioned', 'nearby'],
 ['phone', 'on', 'table'],
 ['blanket', 'on', 'bed'],
 ['clothes', 'on', 'bed'],
 ['person', 'wearing', 'blue_shirt'],
 ['computer_mouse', 'on', 'table']]