In [1]:
import os
import sys

# Torch device string (CUDA device index 3); later passed to `agent.to(...)`
# and used as "agent.device" / "device" in the training and validation configs.
device = "cuda:3"

# Setup: add the Selenium Chrome driver to PATH and set the MINIWOB_BASE_URL environment variable to the directory containing the HTML task files

In [2]:
from miniwob.environment import MiniWoBEnvironment
from miniwob.screenshot import pil_to_numpy_array

# MiniWoB task used by every environment created in this notebook.
task_name = 'login-user'
# Location of the MiniWoB HTML task files; `os.environ.get` yields None when
# MINIWOB_BASE_URL is not set, so check the printed value before continuing.
base_url = os.environ.get('MINIWOB_BASE_URL')
print('BASE URL:', base_url)

BASE URL: file:///mnt/akostin/home/akostin/prjs/miniwob-plusplus/html/


In [3]:
# X_MAX / Y_MAX are not referenced by any other cell visible in this notebook.
X_MAX, Y_MAX = 160, 210
# Fixed token length that each DOM element's text is padded/truncated to.
DOM_TEXT_MAX_TOKENS = 8
# Fixed token length that the tokenized goal utterance is padded/truncated to.
GOAL_MAX_TOKENS = 32
# Number of "type goal word k" slots in the action space and action mask.
GOAL_MAX_WORDS = 25
# DOM_MAX_EDGES is not referenced by any other cell visible in this notebook.
DOM_MAX_EDGES = 20
# Number of DOM element slots per observation (shorter DOMs are padded) and
# number of "click element k" slots in the action space.
DOM_MAX_VERTICES = 20
# Magnitude of the per-step exploration bonus/penalty added by the wrapper.
EXPLORE_REWARD = 0.1

# Goal-word indices left unmasked in DiscreteActorNetwork's `words_mask` buffer
# (presumably the positions of the username and password in the utterance — TODO confirm).
username_idx = 4
passwd_idx = 10
# Positions in `dom_elements` that the wrapper treats as typeable inputs
# (presumably the username and password fields of 'login-user' — TODO confirm).
username_element_id = 1
password_element_id = 4

In [4]:
from miniwob.action import  MiniWoBElementClick, MiniWoBType
import numpy as np

# Probe one environment instance once to discover the DOM tag vocabulary.
# NOTE(review): only the tags present in this single reset are collected; any
# tag first seen later raises KeyError in `tag2id[...]` — confirm the task's
# DOM uses a fixed tag set.
env = MiniWoBEnvironment(task_name, seeds=[1], num_instances=1, base_url=base_url, headless=True)
try:
    state = env.reset(record_screenshots=True, seeds=[11])[0]
    print(len(state.tokens))
    tags = {elem.tag for elem in state.dom_elements}
finally:
    # Always release the browser instance, even if the reset above fails.
    env.close()

# Enumerate the tags in sorted order: iterating a set of strings directly gives
# an order that changes between interpreter sessions (hash randomization), which
# would silently remap the tag-embedding rows of a checkpoint trained in an
# earlier session.
tag2id = {t: i for i, t in enumerate(sorted(tags))}

20




# Gym miniwob wrapper

In [5]:
import gym
import torch
from miniwob.action import MiniWoBElementClick, MiniWoBType
from transformers import BertTokenizer
import numpy as np


class MiniWobClickElementWrapper(gym.Wrapper):
    """Expose a batched MiniWoB environment as dict-of-tensor observations with discrete actions.

    For every instance, action ``a`` means:

    * ``a < DOM_MAX_VERTICES``: click DOM element ``a`` of the latest observation;
    * otherwise: type goal word ``a - DOM_MAX_VERTICES`` of the latest observation.

    In ``action_mask`` a value of 1 marks an action as unavailable and 0 as
    available. Padding slots are always masked; typing is unmasked only while
    one of the two typeable elements is focused and has no value yet.

    Each step reward is shifted by ``+EXPLORE_REWARD`` when the DOM focus/value
    signature is new for the current episode and by ``-EXPLORE_REWARD`` when it
    was already seen.

    ``self.num_instances`` and ``self.instances`` are not set here; they are
    presumably forwarded to the wrapped MiniWoB environment by ``gym.Wrapper``.
    """

    def __init__(self, env):
        """Wrap ``env`` and declare the observation and action spaces."""
        super().__init__(env)
        self.env = env
        bert_model = 'bert-base-cased'

        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
        vocab_size = self.tokenizer.vocab_size
        n_actions = DOM_MAX_VERTICES + GOAL_MAX_WORDS
        self.observation_space = gym.spaces.Dict({
            "goal_state": gym.spaces.Box(0, vocab_size, (GOAL_MAX_TOKENS,), dtype=np.int64),
            "img_state": gym.spaces.Box(0, 255, (150, 150, 3), dtype=np.uint8),
            "dom_simple_feats": gym.spaces.Box(-1, 1, (DOM_MAX_VERTICES, 3), dtype=np.float32),
            "dom_text_tokens": gym.spaces.Box(0, vocab_size,
                                              (DOM_MAX_VERTICES, DOM_TEXT_MAX_TOKENS),
                                              dtype=np.int64),
            "dom_tags": gym.spaces.Box(0, len(tags)-1, (DOM_MAX_VERTICES,), dtype=np.int64),
            "n_goal_words": gym.spaces.Box(0, GOAL_MAX_WORDS, (1,), dtype=np.int64),
            "action_mask": gym.spaces.Box(0, 1, (n_actions,), dtype=np.int64)})

        # Per-instance bookkeeping:
        #   past_states - focus/value signatures already seen in the current episode
        #   curr_states - signature of the most recent observation
        #   inst_verts  - DOM elements of the most recent observation (click targets)
        #   text_tokens - goal words of the most recent observation (typing targets)
        self.past_states = [set() for _ in range(self.num_instances)]
        self.curr_states = [[0 for _ in range(DOM_MAX_VERTICES*2)] for _ in range(self.num_instances)]
        self.inst_verts = [None for _ in range(self.num_instances)]
        self.text_tokens = [None for _ in range(self.num_instances)]
        self.dom_keys = [key for key in self.observation_space if key.startswith("dom")] + \
                        ["n_goal_words"] + ["action_mask"]
        self.action_space = gym.spaces.Discrete(n_actions)

    @staticmethod
    def _state_signature(obs, i):
        """Return instance ``i``'s (focused, has-value) flags as a digit string.

        Padding rows hold -1, so cutting at the first '-' keeps only the flags
        of the real DOM elements.
        """
        flags = obs["dom_simple_feats"][i][:, :-1].flatten()
        return "".join([str(int(x)) for x in flags]).split('-')[0]

    def _ob_to_dom(self, ob, i):
        """Convert one MiniWoB state into the DOM tensors and action mask for instance ``i``.

        A ``None`` state yields an all-padding observation with every action
        masked. Every returned tensor carries a leading batch dimension of 1.
        """
        if ob is None:
            self.inst_verts[i] = None
            self.text_tokens[i] = None
            return {
                "dom_simple_feats": -torch.ones((1, DOM_MAX_VERTICES, 3), dtype=torch.float32),
                "dom_text_tokens": torch.zeros((1, DOM_MAX_VERTICES, DOM_TEXT_MAX_TOKENS), dtype=torch.int64),
                "dom_tags": torch.zeros((1, DOM_MAX_VERTICES), dtype=torch.int64),
                "n_goal_words": torch.zeros((1,1), dtype=torch.int64),
                "action_mask": torch.ones((1, DOM_MAX_VERTICES + GOAL_MAX_WORDS), dtype=torch.int64)
            }

        self.inst_verts[i] = ob.dom_elements
        self.text_tokens[i] = ob.tokens
        tokens = []
        feats = []
        tags_ = []
        for elem in ob.dom_elements:
            elem_text = elem.text if elem.text else ""
            elem_tokens = self.tokenizer(elem_text,
                                         padding='max_length',
                                         max_length=DOM_TEXT_MAX_TOKENS,
                                         truncation=True,
                                         return_tensors="pt")["input_ids"]
            tokens.append(elem_tokens)
            elem_feats = torch.tensor([elem.focused, bool(elem.value), elem.tampered],
                                      dtype=torch.float32)
            feats.append(elem_feats)
            tags_.append(torch.tensor([tag2id[elem.tag]], dtype=torch.int64))

        # Refresh the cached focus/value flags from THIS observation before the
        # mask is built. Previously the mask used the flags left behind by the
        # preceding observation, so typing became available one step after the
        # field was focused (and on reset it reflected the previous episode).
        self.curr_states[i] = "".join(
            str(int(flag)) for elem_feats in feats for flag in elem_feats[:-1]
        )

        delta_v = DOM_MAX_VERTICES - len(tokens)
        # 0 = clickable real element, 1 = padding slot.
        available_dom_click = [0 for _ in range(len(tokens))] + [1 for _ in range(delta_v)]

        if self.typeble_elem_focused(i) and not self.elem_have_value(i):
            n_words = len(self.text_tokens[i])
            available_type_tokens = [0 for _ in range(n_words)] + \
                                    [1 for _ in range(GOAL_MAX_WORDS - n_words)]
        else:
            available_type_tokens = [1 for _ in range(GOAL_MAX_WORDS)]
        available_actions = available_dom_click + available_type_tokens

        # Pad every per-element list up to DOM_MAX_VERTICES rows.
        tokens += [torch.zeros((1, DOM_TEXT_MAX_TOKENS), dtype=torch.int64) for _ in range(delta_v)]
        feats += [-torch.ones((3,), dtype=torch.float32) for _ in range(delta_v)]
        tags_ += [torch.zeros((1,), dtype=torch.int64) for _ in range(delta_v)]

        return {
            "dom_text_tokens": torch.cat(tokens, dim=0).unsqueeze(0),
            "dom_simple_feats": torch.stack(feats, dim=0).unsqueeze(0),
            "dom_tags": torch.cat(tags_, dim=0).unsqueeze(0),
            "n_goal_words": torch.tensor([len(self.text_tokens[i])]).unsqueeze(0),
            "action_mask": torch.tensor(available_actions).unsqueeze(0)
        }

    def typeble_elem_focused(self, i):
        """Return True when the focused element of instance ``i`` is one of the two typeable elements."""
        focused_elem = self.n_element_focused(i)
        return focused_elem in [username_element_id, password_element_id]

    def elem_have_value(self, i):
        """Return True when the focused element of instance ``i`` has a value; False if nothing is focused."""
        focused_elem = self.n_element_focused(i)
        if focused_elem is None:
            return False
        s = self.curr_states[i]
        # Odd positions of the signature hold the has-value flags.
        value = [int(s[k]) for k in range(1, len(s), 2)]
        return bool(value[focused_elem])

    def n_element_focused(self, i):
        """Return the index of the first focused element of instance ``i``, or None."""
        s = self.curr_states[i]
        # Even positions of the signature hold the focused flags.
        focus = [int(s[k]) for k in range(0, len(s), 2)]
        if 1 in focus:
            return focus.index(1)
        return None

    def _ob_to_token(self, ob, i):
        """Tokenize the goal words of ``ob`` into a (1, GOAL_MAX_TOKENS) tensor of token ids."""
        max_length = GOAL_MAX_TOKENS
        if ob is None:
            # int64 so the placeholder matches the tokenizer's id tensors and the
            # declared "goal_state" dtype when the rows are concatenated.
            return torch.zeros(1, max_length, dtype=torch.int64)
        return self.tokenizer(ob.tokens,
                              padding='max_length',
                              max_length=max_length, truncation=True,
                              return_tensors="pt",
                              is_split_into_words=True)['input_ids']

    def _ob_to_image(self, ob):
        """Return the screenshot of ``ob`` resized to 150x150 as a (1, 150, 150, 3) tensor."""
        if ob is None:
            # NOTE(review): this placeholder is float32; the dtype produced by
            # `pil_to_numpy_array` is not visible here — confirm they agree.
            return torch.zeros(1, 150, 150, 3)
        return torch.tensor(pil_to_numpy_array(ob.screenshot.resize([150, 150]))).unsqueeze(0)

    def _to_miniwob_actions(self, actions):
        """Translate a tensor of discrete action indices into one MiniWoB action per instance.

        Instances that are done, or whose last observation was ``None``, get
        ``None`` as their action.
        """
        # reshape(-1) rather than squeeze(): squeeze() collapses a single-instance
        # batch to a 0-dim tensor, which cannot be indexed with actions[i].
        actions = actions.reshape(-1)
        miniwob_actions = []

        for i in range(self.num_instances):
            if self.instances[i].get_metadata()['done'] or self.inst_verts[i] is None:
                miniwob_actions.append(None)
                continue
            v_n = actions[i].item()
            if v_n < DOM_MAX_VERTICES:
                element = self.inst_verts[i][v_n]
                miniwob_actions.append(MiniWoBElementClick(element))
            else:
                text = self.text_tokens[i][v_n - DOM_MAX_VERTICES]
                miniwob_actions.append(MiniWoBType(text))
        return miniwob_actions

    def get_explore_reward(self, obs):
        """Return, per instance, +EXPLORE_REWARD for an unseen state signature and -EXPLORE_REWARD otherwise."""
        explore_rewards = [0 for _ in range(self.num_instances)]
        for i, states in enumerate(self.past_states):
            if self._state_signature(obs, i) in states:
                explore_rewards[i] -= EXPLORE_REWARD
            else:
                explore_rewards[i] += EXPLORE_REWARD
        return explore_rewards

    def observation(self, obs):
        """Convert a list of MiniWoB states (entries may be None) into a dict of batched tensors."""
        goals = torch.cat([self._ob_to_token(obs[i], i) for i in range(len(obs))], dim=0)
        imgs = torch.cat([self._ob_to_image(ob) for ob in obs], dim=0)
        doms = [self._ob_to_dom(obs[i], i) for i in range(len(obs))]
        doms_states = {key: torch.cat([dom[key] for dom in doms], dim=0) for key in self.dom_keys}
        return {'img_state': imgs, 'goal_state': goals, **doms_states}

    def update_past_states(self, obs):
        """Record the state signature of ``obs`` as seen and as current, for every instance."""
        for i in range(len(self.past_states)):
            s = self._state_signature(obs, i)
            self.past_states[i].add(s)
            self.curr_states[i] = s

    def reset(self):
        """Reset the wrapped environment (recording screenshots) and return the first observation."""
        obs = self.observation(self.env.reset(record_screenshots=True))
        self.update_past_states(obs)
        return obs

    def step(self, actions):
        """Apply ``actions`` and return ``(obs, rewards, dones, infos)``.

        ``rewards`` and ``dones`` are column tensors with one row per instance.
        The exploration bonus is added to the environment rewards and also
        stored under ``infos[0]['exploration_rewards']``. Finished instances
        are restarted immediately and their seen-state sets are cleared.
        """
        miniwob_actions = self._to_miniwob_actions(actions)
        obs, rewards, dones, infos = self.env.step(miniwob_actions)
        obs = self.observation(obs)

        # The bonus must be computed before the new state is added to past_states.
        explore_rewards = self.get_explore_reward(obs)
        rewards = [a + b for a, b in zip(rewards, explore_rewards)]
        infos['n'][0]['exploration_rewards'] = explore_rewards
        self.update_past_states(obs)

        for i, instance in enumerate(self.env.instances):
            if instance.get_metadata()['done']:
                instance.begin_task()
                self.past_states[i] = set()

        return obs, torch.tensor([rewards]).T, torch.tensor([dones]).T, infos['n']


class EpisodeInfoWrapper(gym.Wrapper):
    """Attach per-episode statistics to the info dicts of a batched environment.

    When instance ``i`` reports done, ``infos[i]['episode']`` is set to
    ``{'r': ..., 'steps': ...}``, where ``r`` is the reward of that terminal
    step (rewards are not summed over the episode) and ``steps`` is the number
    of steps since the previous episode end. The mean of
    ``infos[0]['exploration_rewards']`` is stored on every step as
    ``infos[0]['mean_exploration_reward']``.
    """

    def __init__(self, env, n):
        """Wrap ``env``, which runs ``n`` parallel instances."""
        super().__init__(env)
        self.n = n
        self.episode_reward = np.zeros(n)
        self.episode_steps = np.zeros(n)

    def reset(self):
        """Zero the per-instance counters and reset the wrapped environment."""
        self.episode_reward = np.zeros(self.n)
        self.episode_steps = np.zeros(self.n)
        return self.env.reset()

    def step(self, actions):
        """Step the wrapped environment and annotate ``infos`` with episode statistics."""
        states, rewards, dones, infos = self.env.step(actions)

        head_info = infos[0]
        head_info["mean_exploration_reward"] = np.mean(head_info["exploration_rewards"])

        for idx, finished in enumerate(dones):
            self.episode_steps[idx] += 1
            if not finished:
                continue
            self.episode_reward[idx] = rewards[idx]
            infos[idx]['episode'] = {'r': self.episode_reward[idx], 'steps': self.episode_steps[idx]}
            # Start counting the next episode of this instance from zero.
            self.episode_steps[idx] = 0

        return states, rewards, dones, infos

In [6]:
from transformers import BertTokenizer, BertModel, BertConfig
import torch
from torch import nn
from torch.nn import Linear
from torch.nn.functional import relu


class BertEncoder(nn.Module):
    """Encode goal token ids with 'bert-base-cased', evaluated under ``torch.no_grad()``.

    The first element of the BERT output (one vector per token) is reduced
    along its feature axis to ``output_size`` values by a parameter-free
    adaptive average pool.
    """

    def __init__(self):
        super().__init__()
        bert_model = 'bert-base-cased'

        self.embed_dim = 768
        self.output_size = 64
        self.bert_encoder = BertModel.from_pretrained(bert_model)
        self.out_layer = nn.AdaptiveAvgPool1d(output_size=self.output_size)
        # The tokenizer is loaded only to look up the id of the padding token.
        self.pad_token = BertTokenizer.from_pretrained(bert_model).pad_token_id

    def forward(self, input_ids):
        """
        Encode a batch of token ids.

        Returns a pair ``(embeddings, pad_mask)``; ``pad_mask`` is True wherever
        ``input_ids`` equals the padding token id.
        """
        token_ids = input_ids.long()

        # NOTE(review): BERT is called without an attention mask, so padding
        # positions are attended to — confirm this is intended.
        with torch.no_grad():
            hidden = self.bert_encoder(token_ids)[0]
            pooled = self.out_layer(hidden)

        return pooled, token_ids == self.pad_token


class DOMBert(nn.Module):
    """Encode DOM element text with 'bert-base-cased', evaluated under ``torch.no_grad()``.

    The second element of the BERT output (one vector per sequence) is reduced
    to ``output_size`` = 64 - 7 values by a parameter-free adaptive average pool.
    """

    def __init__(self):
        super().__init__()
        bert_model = 'bert-base-cased'

        self.embed_dim = 768
        self.output_size = 64 - 7
        self.bert_encoder = BertModel.from_pretrained(bert_model)
        self.out_layer = nn.AdaptiveAvgPool1d(output_size=self.output_size)

    def forward(self, input_ids):
        """
        Encode a batch of token-id rows into one feature vector per row.
        """
        token_ids = input_ids.long()
        with torch.no_grad():
            sequence_embed = self.bert_encoder(token_ids)[1]
            pooled = self.out_layer(sequence_embed)
        return pooled

In [7]:
import torch
from torch import nn
from torch.nn import ModuleList, Parameter, TransformerEncoderLayer, TransformerEncoder
from torch.nn.functional import relu, softmax


# UNDIRECTED and SELF_LOOP are not referenced by any other cell visible in this notebook.
UNDIRECTED = True
SELF_LOOP = True
# Module-level tokenizer, loaded to expose the padding token id of the
# checkpoint that the encoders below also use.
bert_model = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_model)
pad_token_id = tokenizer.pad_token_id


class DOMEncoder(nn.Module):
    """Encode every DOM vertex into a 64-dimensional vector.

    Per vertex, the text embedding from ``DOMBert``, a 4-dimensional tag
    embedding and the three simple features are concatenated and passed through
    a stack of BatchNorm -> Linear -> ReLU blocks with residual connections.
    """

    def __init__(self):
        super(DOMEncoder, self).__init__()
        self.inp_dim = 64
        self.output_size = 64
        self.h_dim = 64
        self.n_h_layers = 1

        self.layers = ModuleList([nn.Linear(self.inp_dim, self.h_dim)] + \
                                 [nn.Linear(self.h_dim, self.h_dim) for _ in range(self.n_h_layers)] + \
                                 [nn.Linear(self.h_dim, self.output_size)])

        self.bn = nn.ModuleList([nn.BatchNorm1d(self.inp_dim)] + \
                                [nn.BatchNorm1d(self.h_dim) for _ in range(self.n_h_layers + 1)])

        self.text_encoder = DOMBert()
        self.tag_vectorizer = nn.Embedding(len(tags), 4)

    def forward(self, input):
        """Encode the DOM part of an observation dict.

        Reads ``dom_simple_feats``, ``dom_text_tokens`` and ``dom_tags`` from
        ``input`` and returns ``(out, mask)``: ``out`` has shape
        MAX_VERTS x B x DIM and ``mask`` (B x MAX_VERTS) is True for padding
        vertices, i.e. those whose first simple feature is -1.
        """
        mask = input["dom_simple_feats"][:, :, 0] == -1  # B x MAX_VERTS

        tokens = input["dom_text_tokens"]
        bs = tokens.shape[0]
        tokens = torch.flatten(tokens, end_dim=-2)  # B*MAX_VERTS x MAX_TOKENS
        text_feats = self.text_encoder(tokens)

        tag_feats = self.tag_vectorizer(input["dom_tags"].long())
        tag_feats = torch.flatten(tag_feats, end_dim=-2)  # B*MAX_VERTS x DIM_t

        feats = torch.flatten(input["dom_simple_feats"], end_dim=-2)  # B*MAX_VERTS x DIM_f
        feats = torch.cat([text_feats, tag_feats, feats], dim=-1)

        out = feats
        for layer, norm in zip(self.layers, self.bn):
            # `relu` is imported at the top of this cell; the previous `F.relu`
            # relied on an `F` alias that is only imported in a later cell.
            out = relu(layer(norm(out))) + out
        out = out.view(bs, DOM_MAX_VERTICES, -1).permute([1, 0, 2])  # MAX_VERTS x B x DIM
        return out, mask


class DOMGoalTransformer(nn.Module):
    """Summarize a padded sequence into one vector per batch element.

    A learned summary token is prepended to the sequence, the result is passed
    through a 4-layer pre-norm Transformer encoder, and the encoder output at
    the summary-token position is returned.
    """

    def __init__(self):
        super().__init__()
        self.inp_dim = 64
        self.output_size = self.inp_dim

        # Learned summary token, initialized uniformly in [-0.2, 0.2].
        self.embed_token = nn.Parameter(torch.empty(1, 1, self.inp_dim))
        nn.init.uniform_(self.embed_token, -0.2, 0.2)

        block = TransformerEncoderLayer(64, 4, dim_feedforward=64, norm_first=True)
        self.encoder = TransformerEncoder(block, 4)

    def forward(self, inp, mask):
        """Encode ``inp`` (SEQ x B x DIM) with padding ``mask`` (B x SEQ, True = ignore); returns B x DIM."""
        batch_size = inp.shape[1]

        # The summary token is never padding, so its mask column is all False.
        summary_visible = mask.new_zeros((mask.shape[0], 1), dtype=torch.bool)
        padding_mask = torch.cat([summary_visible, mask], dim=1)

        summary_token = self.embed_token.expand(-1, batch_size, -1)
        sequence = torch.cat([summary_token, inp], dim=0)

        encoded = self.encoder(sequence, src_key_padding_mask=padding_mask)
        return encoded[0]


class Encoder(nn.Module):
    """Joint state encoder: DOM vertices and goal tokens are fused by ``DOMGoalTransformer``."""

    def __init__(self):
        super().__init__()
        self.dom = DOMEncoder()
        self.goal = BertEncoder()
        self.tf = DOMGoalTransformer()
        self.output_size = 64

    def forward(self, input):
        """Encode an observation dict into one vector per batch element."""
        dom_seq, dom_padding = self.dom(input)
        goal_seq, goal_padding = self.goal(input["goal_state"])

        # Put the goal embeddings in the same sequence-first layout as the DOM ones.
        goal_seq = goal_seq.permute([1, 0, 2])

        sequence = torch.cat([dom_seq, goal_seq], dim=0)
        padding = torch.cat([dom_padding, goal_padding], dim=1)
        return self.tf(sequence, padding)

In [8]:
from rllr.models.ppo import FixedCategorical, ActorCriticNetwork, CriticNetwork
from torch.nn import functional as F


class DiscreteActorNetwork(nn.Module):
    """
    Actor is a policy network. Given state it evaluates
    probability of action given state or sample an action.

    Actions whose ``states["action_mask"]`` entry equals 1 receive a logit of
    -inf. A row in which every action is masked is treated as fully unmasked,
    so that the resulting distribution stays well defined.
    """

    def __init__(self, action_size, state_encoder, hidden_size):
        """
        Args:
            action_size: number of discrete actions (size of the output layer).
            state_encoder: module mapping an observation dict to a
                B x ``state_encoder.output_size`` tensor.
            hidden_size: width of the hidden layers; the residual connections
                in ``forward`` add the encoder output to the first hidden
                activation, so it has to equal ``state_encoder.output_size``.
        """
        super().__init__()
        self.state_encoder = state_encoder
        input_size = state_encoder.output_size
        self.h_dim = hidden_size
        self.inp_dim = input_size
        self.output_size = action_size
        self.n_h_layers = 1

        # Static mask leaving only the username/password goal words typeable.
        # It is not applied in forward(); it stays registered as a buffer so the
        # module's state is unchanged.
        dom_mask = torch.zeros((1, DOM_MAX_VERTICES), dtype=torch.bool)
        words_mask = torch.ones((1, GOAL_MAX_WORDS), dtype=torch.bool)
        words_mask[0, username_idx] = False
        words_mask[0, passwd_idx] = False
        words_mask = torch.cat([dom_mask, words_mask], dim=1)
        self.register_buffer("words_mask", words_mask)

        self.layers = ModuleList([nn.Linear(self.inp_dim, self.h_dim)] + \
                                 [nn.Linear(self.h_dim, self.h_dim) for _ in range(self.n_h_layers)] + \
                                 [nn.Linear(self.h_dim, self.output_size)])

        self.bn = nn.ModuleList([nn.BatchNorm1d(self.inp_dim)] + \
                                [nn.BatchNorm1d(self.h_dim) for _ in range(self.n_h_layers+1)])

    def forward(self, states):
        """Return a ``FixedCategorical`` distribution over actions for a batch of observations."""
        states_encoding = self.state_encoder(states)
        action_mask = states["action_mask"]  # BxA

        # Work on a derived boolean mask instead of writing into
        # states["action_mask"]: the previous in-place edit altered the caller's
        # observation tensor.
        blocked = action_mask == 1
        fully_blocked = (action_mask != 0).all(dim=1, keepdim=True)
        blocked = blocked & ~fully_blocked

        out = states_encoding
        for layer, norm in zip(self.layers[:-1], self.bn[:-1]):
            out = F.relu(layer(norm(out))) + out
        out = self.bn[-1](out)
        logits = self.layers[-1](out)

        logits = logits.double().masked_fill(blocked, -float("inf"))
        return FixedCategorical(logits=F.log_softmax(logits, dim=1))


class ACN(ActorCriticNetwork):
    """Actor-critic network whose actor is the masked ``DiscreteActorNetwork``."""

    def __init__(self, action_space, actor_state_encoder, critic_state_encoder, actor_hidden_size, critic_hidden_size):
        """
        Args:
            action_space: must be a ``gym.spaces.Discrete`` space.
            actor_state_encoder: state encoder used by the actor.
            critic_state_encoder: state encoder used by the critic.
            actor_hidden_size: hidden width of the actor head.
            critic_hidden_size: hidden width of the critic head.

        Raises:
            TypeError: if ``action_space`` is not a discrete space.
        """
        # Deliberately skips ActorCriticNetwork.__init__ and initializes the
        # next class in the MRO, so actor and critic are built here instead.
        super(ActorCriticNetwork, self).__init__()
        if isinstance(action_space, gym.spaces.Discrete):
            self.actor = DiscreteActorNetwork(action_space.n, actor_state_encoder, actor_hidden_size)
        else:
            # Raising a bare string is itself a TypeError in Python 3 and hid
            # the intended message; raise a real exception carrying it.
            raise TypeError(f'{action_space} not supported')
        self.critic = CriticNetwork(critic_state_encoder, critic_hidden_size)

        def init_params(m):
            """Give every Linear layer unit-norm random weight rows and a zero bias."""
            classname = m.__class__.__name__
            if classname.find("Linear") != -1:
                m.weight.data.normal_(0, 1)
                m.weight.data *= 1 / torch.sqrt(m.weight.data.pow(2).sum(1, keepdim=True))
                if m.bias is not None:
                    m.bias.data.fill_(0)

        self.apply(init_params)

# Train agent

In [9]:
import numpy as np
from rllr.env.vec_wrappers import make_vec_envs
from stable_baselines3.common.vec_env import DummyVecEnv


def get_envs(n, **kwargs):
    """Create ``n`` parallel MiniWoB instances of ``task_name`` wrapped for training.

    Extra keyword arguments are forwarded to ``MiniWoBEnvironment``.
    """
    raw_env = MiniWoBEnvironment(task_name, seeds=range(n), num_instances=n, base_url=base_url, **kwargs)
    return EpisodeInfoWrapper(MiniWobClickElementWrapper(raw_env), n)

In [10]:
# Number of parallel MiniWoB instances; also used as "training.n_processes".
n_instances = 4
envs = get_envs(n_instances, headless=True)

In [11]:
from rllr.models.encoders import GoalStateEncoder, SimpleCNN

# Building the encoder loads 'bert-base-cased' for both the DOM text and goal branches.
encoder = Encoder()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predicti

In [12]:
from rllr.algo import PPO
from rllr.models.ppo import ActorCriticNetwork

hidden_size = 64
# The same `encoder` instance is passed as both the actor's and the critic's state encoder.
policy = ACN(envs.action_space, encoder, encoder, hidden_size, hidden_size)

# Keyword arguments forwarded to the PPO constructor.
agent_conf = {
        "clip_param": 0.2,
        "ppo_epoch": 4,
        "num_mini_batch": 4,
        "value_loss_coef": 0.5,
        "entropy_coef": 0.01,
        "lr": 0.001,
        "eps": 1e-5,
        "max_grad_norm": 0.5,
}


agent = PPO(policy, **agent_conf)
agent.to(device)

<rllr.algo.ppo.PPO at 0x7f6868472ca0>

In [13]:
from tqdm import trange
import time
from collections import deque
import torch
import numpy as np

from rllr.buffer.rollout import RolloutStorage


def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    """Linearly anneal the learning rate of every parameter group.

    The rate equals ``initial_lr`` at epoch 0 and reaches 0 at
    ``epoch == total_num_epochs``.
    """
    progress = epoch / float(total_num_epochs)
    decayed_lr = initial_lr - (initial_lr * progress)
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr


def train_ppo(env, agent, conf):
    """
    Train ``agent`` on ``env`` with PPO, logging statistics as it goes.

    ``conf`` is a flat dict read through the keys 'training.n_steps',
    'training.n_processes', 'training.n_env_steps', 'training.verbose',
    'agent.device', 'agent.lr', 'agent.gamma', 'agent.gae_lambda' and
    'outputs.path'. A statistics line is appended to '<outputs.path>.txt' on
    every reported update, and the whole agent is saved to '<outputs.path>_<j>'
    every 50 updates.
    """
    n_steps = conf['training.n_steps']
    n_processes = conf['training.n_processes']
    target_device = conf['agent.device']
    out_path = conf['outputs.path']

    rollouts = RolloutStorage(n_steps, n_processes, env.observation_space, env.action_space)
    obs = env.reset()
    rollouts.set_first_obs(obs)
    rollouts.to(target_device)

    start = time.time()
    num_updates = int(conf['training.n_env_steps'] // n_steps // n_processes)

    # Sliding windows over the most recent episodes / steps, used for reporting only.
    episode_rewards = deque(maxlen=20)
    exploration_rewards = deque(maxlen=100)
    episode_steps = deque(maxlen=20)

    for j in trange(num_updates):
        update_linear_schedule(agent.optimizer, j, num_updates, conf['agent.lr'])

        # Collect one rollout of n_steps transitions from every instance.
        for _ in range(n_steps):
            obs = {name: tensor.to(target_device) for name, tensor in obs.items()}
            value, action, action_log_prob = agent.act(obs)
            obs, reward, done, infos = env.step(action)

            exploration_rewards.append(infos[0]["mean_exploration_reward"])
            for info in infos:
                if 'episode' not in info:
                    continue
                episode_rewards.append(info['episode']['r'])
                episode_steps.append(info['episode']['steps'])

            # A zero mask marks the end of an episode for that instance.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks)

        next_value = agent.get_value(rollouts.get_last_obs())
        rollouts.compute_returns(next_value, conf['agent.gamma'], conf['agent.gae_lambda'])

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        should_report = j % conf['training.verbose'] == 0 and len(episode_rewards) > 1
        if should_report:
            total_num_steps = (j + 1) * n_processes * n_steps
            end = time.time()
            report = (f'Updates {j}, '
                      f'num timesteps {total_num_steps}, '
                      f'FPS {int(total_num_steps / (end - start))} \n'
                      f'Last {len(episode_rewards)} training episodes: '
                      f'mean/median reward {np.mean(episode_rewards):.2f}/{np.median(episode_rewards):.2f}, '
                      f'min/max reward {np.min(episode_rewards):.2f}/{np.max(episode_rewards):.2f}\n'
                      f'dist_entropy {dist_entropy:.2f}, '
                      f'value_loss {value_loss:.2f}, '
                      f'action_loss {action_loss:.2f}, '
                      f'explor_rew {np.mean(exploration_rewards):.6f} '
                      f'mean_episode_steps {np.mean(episode_steps):.2f}')
            print(report)

            # One ';'-separated record per reported update.
            record = (total_num_steps, np.mean(episode_rewards),
                      np.median(episode_rewards), np.mean(exploration_rewards),
                      dist_entropy, np.mean(episode_steps))
            with open(out_path+".txt", 'a') as log_file:
                log_file.write(f"%d;%.4f;%.4f;%.6f;%.4f;%.4f\n" % record)

        if j % 50 == 0:
            torch.save(agent, out_path+"_"+str(j))

In [14]:
# Flat configuration dict read by train_ppo through the keys below.
train_conf = {
    "agent.lr": 0.001,
    "agent.device": device,
    "agent.gamma": 0.99,
    "agent.gae_lambda": 0.95,
    "training.n_env_steps": 400000,
    "training.n_steps": 100,
    "training.n_processes": n_instances,
    "training.verbose": 1,
    "outputs.path": "./miniwob_login"
}

In [15]:
#train_ppo(envs, agent, train_conf)

In [16]:
# Close the environments created for training.
envs.close()



In [42]:
# Create a new set of environments for validation (the earlier ones were closed).
n_instances = 4
envs = get_envs(n_instances, headless=True)

In [43]:
from tqdm import trange
import time
from collections import deque
import numpy as np
import torch


def validate_ppo(env, conf):
    """Run a saved agent on ``env`` until enough episodes finish, then print reward statistics.

    Args:
        env: batched environment as returned by ``get_envs``.
        conf: dict with the keys
            'model_path' - file written by ``torch.save(agent, ...)``,
            'n_episodes' - number of finished episodes to collect,
            'device'     - torch device the agent and observations are moved to.

    Actions are sampled (``deterministic=False``), so the printed numbers vary
    between runs.
    """
    # map_location lets a checkpoint saved on one device (e.g. "cuda:3") load on
    # whatever device this run uses.
    # NOTE: torch.load unpickles arbitrary Python objects; only load checkpoints
    # from a trusted source.
    agent = torch.load(conf['model_path'], map_location=conf['device'])
    agent.to(conf['device'])
    obs = env.reset()

    episode_rewards = deque(maxlen=conf['n_episodes'])
    episode_steps = deque(maxlen=conf['n_episodes'])

    while len(episode_rewards) < conf['n_episodes']:
        obs = {k: v.to(conf['device']) for k, v in obs.items()}
        _, action, _ = agent.act(obs, deterministic=False)
        obs, _, _, infos = env.step(action)

        for info in infos:
            if 'episode' in info:
                episode_rewards.append(info['episode']['r'])
                episode_steps.append(info['episode']['steps'])

    print(f"mean reward: %.2f\tmedian reward: %.2f\tmax reward: %.2f\tmin reward: %.2f\tmean steps per episode: %.2f" % \
          (np.mean(episode_rewards), np.median(episode_rewards), np.max(episode_rewards), np.min(episode_rewards), np.mean(episode_steps)))

In [50]:
# Configuration read by validate_ppo: checkpoint file, number of finished
# episodes to collect, and the device to run on.
conf = {
    "model_path": "explore_solution_with_typeble_masking/miniwob_login_1600",
    "n_episodes": 100,
    "device": device,
}

In [3]:
# Evaluate the checkpoint named in `conf` on the validation environments.
validate_ppo(envs, conf)

In [52]:
# Close the validation environments.
envs.close()

