In [106]:
import dill
import json
from string import punctuation
from re import search
import os
import pandas as pd
import altair as alt

In [107]:
BASELINE_TRAJECTORY_PATH = 'trajectories_gemini'
N_FILES = 60

In [108]:
envs_bagel = {'click-checkboxes-soft':10,
 'click-tab-2-hard':20,
 'social-media':15,
 'email-inbox':30,
 'social-media-some':30,
 'tic-tac-toe':10,
 'use-autocomplete':10,
 'book-flight':30,
 'choose-date':20,
 'search-engine':20}

### Calculate Custom Score and Reward Average
Note: Due to rate-limited slow inference on gemini, reward average is calculated by awarding 1 every time there is a reward, regardless of the value

In [109]:
def find_ref(target, trajectory):
    for state in trajectory['states']:
        for element in state['dom_elements']:
            ref = element['ref']
            text = element['text']
            if text == target:
                return ref
    return 0

def action_found(ref, trajectory):
    for state in trajectory['states']:
        if 'CLICK_ELEMENT' in state['action']:
            action_ref = int(search('CLICK_ELEMENT ([0-9]*)',state['action']).group(1))
            if action_ref ==  ref:
                return True
    return False

def eval_click_tab_2_hard(trajectory):
    target = search('\"(.*)\"', trajectory['utterance']).group(1)#.strip(punctuation).strip()
    ref = find_ref(target, trajectory)
    if action_found(ref,trajectory):
        return 1
    return 0

def eval_click_checkboxes_soft(trajectory):
    #tbi
    return 0

def eval_social_media(trajectory):
    #tbi
    return 0

def eval_email_inbox(trajectory):
    #tbi
    return 0

def eval_social_media_some(trajectory):
    #tbi
    return 0

def eval_tic_tac_toe(trajectory):
    #tbi
    return 0

def eval_use_autocomplete(trajectory):
    #tbi
    return 0

def eval_book_flight(trajectory):
    click_elems_fixed =[7,7,20,9,9,24,13,15]
    click_elems_choose = range(55,118)
    type_elems = [7,9]
    actions, refs = [], []
    for state in trajectory['states'][1:]:
        ref = search(' ([0-9]+)',state['action']).group(1)
        refs.append(int(ref))
        actions.append(search('([A-Z_]) ' + ref, state['action']))
        actions_refs = zip(actions,refs)
        clicks = [x for x in actions_refs if x[0] == 'CLICK_ELEMENT']
        types = [x for x in actions_refs if x[0] == 'TYPE_TEXT']
    missed_clicks = max(0, len(click_elems_fixed)+1 - len(clicks))
    missed_types = max(0, len(type_elems) - len(types))
    total_actions = len(click_elems_fixed)+2 + len(type_elems) #2 adding the final price button
    score = (total_actions - missed_clicks - missed_types)/total_actions
    return score

def eval_choose_date(trajectory):
    #tbi
    return 0

In [110]:
scoring_function = {
    'click-checkboxes-soft': eval_click_checkboxes_soft,
    'click-tab-2-hard': eval_click_tab_2_hard,
    'social-media': eval_social_media,
    'email-inbox': eval_email_inbox,
    'social-media-some': eval_social_media_some,
    'tic-tac-toe': eval_tic_tac_toe,
    'use-autocomplete': eval_use_autocomplete,
    'book-flight': eval_book_flight,
    'choose-date': eval_choose_date,
}


In [118]:
def score_env(env_name,scoring_function):
    path = BASELINE_TRAJECTORY_PATH+'/'+env_name
    files = os.listdir(path)
    files = [x for x in files if os.path.isfile(os.path.join(path,x))]
    score = 0
    reward = 0
    n_rewards = 0
    for i in range(N_FILES):
        filepath = os.path.join(path, files[i])
        trajectory = dill.load(open(filepath, 'rb'))
        score += scoring_function(trajectory)
        traj_reward = trajectory['reward']
        if traj_reward is not None:
            n_rewards +=1
            reward += (traj_reward+1)/2
            
    return score/N_FILES, reward/n_rewards


def get_reward_df(d):
    scores, rewards = [], []
    for task,function in d.items():
        score, reward = score_env(task, function)
        scores.append(score)
        rewards.append(reward*100)
    df = pd.DataFrame()
    df['Task'] = d.keys()
    df['Score'] = scores
    df['Average Reward'] = rewards
    return df

df_scores = get_reward_df(scoring_function)
df_scores

Unnamed: 0,Task,Score,Average Reward
0,click-checkboxes-soft,0.0,52.143509
1,click-tab-2-hard,0.316667,41.139709
2,social-media,0.0,59.364056
3,email-inbox,0.0,64.893561
4,social-media-some,0.0,32.171208
5,tic-tac-toe,0.0,44.2535
6,use-autocomplete,0.0,15.833333
7,book-flight,0.083333,39.166667
8,choose-date,0.0,39.830508


In [117]:
alt.Chart(df_scores,title='Baseline Average Reward for MiniWob++ Tasks').mark_bar().encode(x='Average Reward',y='Task')

In [119]:
alt.Chart(df_scores[df_scores.Score>0],title='Baseline Score for Selected MiniWob++ Tasks').mark_bar(
).encode(x='Score',y='Task')

### Visual Inspection

In [79]:
env = list(envs_bagel.keys())[0]
env

'click-checkboxes-soft'

In [87]:
path = BASELINE_TRAJECTORY_PATH + '/' + env + '/' + env + '35.pkd'
trajectory = dill.load(open(path,'rb'))
trajectory

{'utterance': 'Select words similar to peculiar, gleeful, delicious, reply, hate and click Submit.',
 'reward': 0.0,
 'states': [{'time': 0,
   'action': '',
   'dom_elements': ({'ref': 1,
     'parent': 0,
     'left': array([0.], dtype=float32),
     'top': array([0.], dtype=float32),
     'width': array([500.], dtype=float32),
     'height': array([210.], dtype=float32),
     'tag': 'body',
     'text': '',
     'value': '',
     'id': '',
     'classes': '',
     'bg_color': array([0.33333334, 0.33333334, 0.33333334, 1.        ], dtype=float32),
     'fg_color': array([0., 0., 0., 1.], dtype=float32),
     'flags': array([1, 0, 0, 0], dtype=int8)},
    {'ref': 2,
     'parent': 1,
     'left': array([0.], dtype=float32),
     'top': array([0.], dtype=float32),
     'width': array([160.], dtype=float32),
     'height': array([210.], dtype=float32),
     'tag': 'div',
     'text': '',
     'value': '',
     'id': 'wrap',
     'classes': '',
     'bg_color': array([1., 1., 1., 1.], dt

In [88]:
[x['action'] for x in trajectory['states']]

['',
 'Click the checkbox next to the word unusual - CLICK_ELEMENT 14',
 'Click the checkbox next to the word cheerful - CLICK_ELEMENT 8',
 'Click the checkbox next to the word despise - CLICK_ELEMENT 12']

In [74]:
trajectory['states'][6]['dom_elements']

({'ref': 1,
  'parent': 0,
  'left': array([0.], dtype=float32),
  'top': array([0.], dtype=float32),
  'width': array([500.], dtype=float32),
  'height': array([210.], dtype=float32),
  'tag': 'body',
  'text': '',
  'value': '',
  'id': '',
  'classes': '',
  'bg_color': array([0.33333334, 0.33333334, 0.33333334, 1.        ], dtype=float32),
  'fg_color': array([0., 0., 0., 1.], dtype=float32),
  'flags': array([0, 0, 0, 0], dtype=int8)},
 {'ref': 2,
  'parent': 1,
  'left': array([0.], dtype=float32),
  'top': array([0.], dtype=float32),
  'width': array([160.], dtype=float32),
  'height': array([210.], dtype=float32),
  'tag': 'div',
  'text': '',
  'value': '',
  'id': 'wrap',
  'classes': '',
  'bg_color': array([1., 1., 1., 1.], dtype=float32),
  'fg_color': array([0., 0., 0., 1.], dtype=float32),
  'flags': array([0, 0, 0, 0], dtype=int8)},
 {'ref': 3,
  'parent': 2,
  'left': array([0.], dtype=float32),
  'top': array([50.], dtype=float32),
  'width': array([160.], dtype=float

### Manual trajectories - Visual Inspection

In [41]:
path = MANUAL_TRAJECTORY_PATH + '/' + env + '.json'
path

'sample_trajectories/use-autocomplete.json'

In [42]:
with open(path,'r') as f:

    manual_trajectory = json.load(f)

In [43]:
[x['action'] for x in manual_trajectory['states']]

[None,
 {'type': 'mousedown', 'x': 50, 'y': 75, 'timing': 1},
 {'type': 'mousedown', 'x': 50, 'y': 75, 'timing': 3},
 {'type': 'mouseup', 'x': 50, 'y': 76, 'timing': 1},
 {'type': 'mouseup', 'x': 50, 'y': 76, 'timing': 3},
 {'type': 'click', 'x': 50, 'y': 76, 'timing': 1},
 {'type': 'click', 'x': 50, 'y': 76, 'timing': 3},
 {'type': 'keydown', 'keyCode': 16, 'charCode': 0, 'timing': 1},
 {'type': 'keydown', 'keyCode': 16, 'charCode': 0, 'timing': 3},
 {'type': 'keydown', 'keyCode': 84, 'charCode': 0, 'timing': 1},
 {'type': 'keydown', 'keyCode': 84, 'charCode': 0, 'timing': 3},
 {'type': 'keypress', 'keyCode': 84, 'charCode': 84, 'timing': 1},
 {'type': 'keypress', 'keyCode': 84, 'charCode': 84, 'timing': 3},
 {'type': 'keyup', 'keyCode': 16, 'charCode': 0, 'timing': 1},
 {'type': 'keyup', 'keyCode': 16, 'charCode': 0, 'timing': 3},
 {'type': 'keyup', 'keyCode': 84, 'charCode': 0, 'timing': 1},
 {'type': 'keyup', 'keyCode': 84, 'charCode': 0, 'timing': 3},
 {'type': 'keydown', 'keyCode