In [139]:
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

%matplotlib inline
# Notebook-wide matplotlib defaults; every later plotting cell inherits them.
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'  # default interpolation for displayed images
plt.rcParams['image.cmap'] = 'gray'  # default colormap for displayed images

# Applied after the rcParams above, so the style sheet may override some of them.
plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [142]:
import base64
import glob
import io
from IPython.display import HTML
from IPython import display 

def show_video():
    """Embed the first recorded episode video found under ``video/`` in the notebook.

    Looks for ``video/*.mp4`` relative to the current working directory, reads the
    first match returned by ``glob`` and displays it as a base64-encoded HTML
    ``<video>`` element. Prints "Could not find video" when no file matches.
    """
    mp4list = glob.glob('video/*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    # Read-only binary mode is all that is needed; the context manager guarantees
    # the handle is closed (the previous version leaked it and required write access).
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    display.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

# Flatten image and remove walls

In [5]:
import gym
from gym import spaces
from gym_minigrid.minigrid import OBJECT_TO_IDX, COLOR_TO_IDX

# max_env_steps = 100

class FlatObsWrapper(gym.core.ObservationWrapper):
    """Observation wrapper that returns the whole grid encoding as one flat vector.

    The outermost ring of cells (the border walls) is stripped, and the agent is
    written into the encoding at its current position before flattening.
    """

    def __init__(self, env):
        super().__init__(env)

        # The border walls are always present, so they are left out of the
        # observation. Each remaining cell contributes 3 values, and the result
        # is exposed as a single flat vector.
        inner_cells = (self.env.width - 2) * (self.env.height - 2)
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(inner_cells * 3,),
            dtype='uint8'
        )
        # self.unwrapped.max_steps = max_env_steps

    def observation(self, obs):
        """Build the flat observation from the underlying env; `obs` itself is unused."""
        base_env = self.unwrapped
        grid = base_env.grid.encode()

        # Overwrite the agent's cell with (agent type, red, facing direction).
        agent_cell = np.array([
            OBJECT_TO_IDX['agent'],
            COLOR_TO_IDX['red'],
            base_env.agent_dir
        ])
        pos = base_env.agent_pos
        grid[pos[0]][pos[1]] = agent_cell

        # Drop the first/last index along both grid axes, then flatten.
        return grid[1:-1, 1:-1].ravel()

    def render(self, *args, **kwargs):
        """Render through the unwrapped env with `highlight` forced to False."""
        kwargs['highlight'] = False
        return self.unwrapped.render(*args, **kwargs)

In [1739]:
def to_coords(state):
    """Return the grid coordinates of the first cell whose type code is 10 or 8.

    `state` is a flat encoding with 3 values per cell; only every third value
    (the first value of each cell) is inspected. The cells are treated as an
    n x n square scanned in row-major order, and the match is reported as
    ``np.array([i + 1, j + 1])``. Returns None when no cell matches.
    """
    cell_types = np.asarray(state)[0:-1:3]
    side = int(np.sqrt(len(cell_types)))

    # Only the leading side*side entries take part in the scan.
    scanned = cell_types[:side * side]
    hits = np.flatnonzero((scanned == 10) | (scanned == 8))
    if hits.size == 0:
        return None

    row, col = divmod(int(hits[0]), side)
    return np.array([row + 1, col + 1])

class RandomPosAndGoalWrapper(gym.Wrapper):
    """Goal-conditioned wrapper with a fixed goal cell, a fixed start cell and a shaped reward.

    On every reset the wrapper records `goal_state`: the observation produced with
    the agent placed on the goal cell. The agent is then moved to the start cell.
    The per-step reward is the decrease in Euclidean distance to the goal.

    Despite the class name, both positions are currently deterministic. Earlier
    randomised variants were commented out; pass different `goal_pos` / `init_pos`
    values to experiment.

    Args:
        env: environment to wrap. It must provide `observation(obs)` (e.g.
            `FlatObsWrapper`), and its unwrapped env must provide `grid`,
            `agent_pos` and `gen_obs()`.
        goal_pos: cell assigned to `agent_pos` when the goal observation is generated.
        init_pos: cell assigned to `agent_pos` at the start of every episode.
    """

    def __init__(self, env, goal_pos=(6, 6), init_pos=(1, 1)):
        super().__init__(env)
        self.goal_state = None
        self.grid_size = env.unwrapped.grid.encode().shape[0]
        self.goal_cell = np.array(goal_pos)
        self.start_cell = np.array(init_pos)

    def reset(self):
        """Regenerate `goal_state`, reset the wrapped env and return the start observation."""
        base_env = self.env.unwrapped

        # Goal encoding: observation generated with the agent placed on the goal cell.
        # Fresh copies are assigned so the configured cells can never be mutated
        # through the environment's `agent_pos`.
        base_env.agent_pos = self.goal_cell.copy()
        self.goal_state = self.env.observation(base_env.gen_obs())

        # Start the episode proper, then override the start position.
        self.env.reset()
        base_env.agent_pos = self.start_cell.copy()
        return self.env.observation(base_env.gen_obs())

    def step(self, action):
        """Step the wrapped env, replacing its reward and done flag.

        Returns:
            (next_state, reward, done, info) where
            reward is `dist(current, goal) - dist(next, goal)`, or `1 - sqrt(2)`
            when that difference is exactly zero (no progress), and
            done is True when the located position equals the goal position or
            `step_count >= max_steps`.
        """
        state = self.env.observation(self.env.unwrapped.gen_obs())
        next_state, _, _, info = self.env.step(action)

        cur_pos = to_coords(state)
        next_pos = to_coords(next_state)
        goal_pos = to_coords(self.goal_state)

        progress = np.linalg.norm(cur_pos - goal_pos) - np.linalg.norm(next_pos - goal_pos)
        # A step that does not change the distance is penalised rather than left at 0.
        reward = 1 - np.sqrt(2) if progress == 0 else progress

        # `step_count` / `max_steps` are resolved through the wrapper chain.
        done = bool((next_pos == goal_pos).all() or (self.step_count >= self.max_steps))

        return next_state, reward, done, info
    
            
from gym.wrappers import Monitor

def wrap_env(env):
    """Return `env` wrapped in a gym `Monitor` that writes to './video' with force=True."""
    return Monitor(env, './video', force=True)

def gen_wrapped_env(env_name):
    """Create the gym environment `env_name` and apply the notebook's wrapper stack.

    Wrapping order, innermost first: FlatObsWrapper, RandomPosAndGoalWrapper,
    then the Monitor added by `wrap_env`.
    """
    base_env = gym.make(env_name)
    flat_env = FlatObsWrapper(base_env)
    goal_env = RandomPosAndGoalWrapper(flat_env)
    return wrap_env(goal_env)

In [1740]:
import matplotlib.pyplot as plt
%matplotlib inline

def run_episode(env, agent, train_mode=True, show_steps=False):
    """
    A helper function for running a single episode.

    Args:
     env - environment exposing reset(), step(action), close() and a `goal_state`
           attribute that is valid after reset()
     agent - object exposing act(state, goal_state),
             update(state, goal_state, action, reward, next_state, done) and reset_episode()
     train_mode - when False, prints the start/goal coordinates and shows the
                  recorded video after the episode
     show_steps - when True (and train_mode is False), renders every step inline

    Returns:
     (score, steps) - sum of rewards and number of steps taken in the episode
    """

    state = env.reset()
    # The goal encoding is produced by reset(); read it from the environment
    # instead of relying on a `goal_state` global left over in the kernel.
    goal_state = env.goal_state

    if not train_mode:
        # NOTE(review): `expore` looks like a typo for `explore`; left unchanged because
        # the agent class that reads this flag is not visible here - confirm.
        agent.expore = False
        print("From ", to_coords(state), "to", to_coords(goal_state))
    score = 0
    done = False

    steps = 0
    while not done:
        steps += 1
        action = agent.act(state, goal_state)
        next_state, reward, done, _ = env.step(action)
        # NOTE(review): the agent is updated even when train_mode is False - confirm intended.
        agent.update(state, goal_state, action, reward, next_state, done)
        score += reward
        state = next_state

        if (not train_mode) and show_steps:
            img = env.render('rgb_array')
            plt.imshow(img)
            plt.show()

    agent.reset_episode()
    env.close()

    if not train_mode:
        show_video()

    return score, steps

def run_episodes_and_display(env, agent, n_episodes=1000):
    """
    Runs a series of training episodes and displays the agent's performance.

    Prints the average score of every completed block of 100 episodes, then
    plots the per-episode score and the per-episode step count.
    """

    report_every = 100
    window_total = 0
    episode_scores, episode_steps = [], []

    for episode_no in range(1, n_episodes + 1):
        ep_score, ep_steps = run_episode(env, agent, train_mode=True)
        episode_scores.append(ep_score)
        episode_steps.append(ep_steps)
        window_total += ep_score

        if episode_no % report_every != 0:
            continue
        # End of a reporting window: print its mean and start a new window.
        print("Episode: {}. Average score: {}".format(episode_no, window_total / report_every))
        window_total = 0

    # One figure per series: scores first, then step counts.
    for series, y_label in ((episode_scores, 'Max Score'), (episode_steps, 'Steps')):
        plt.figure()
        plt.plot(np.arange(len(series)), series)
        plt.ylabel(y_label)
        plt.xlabel('Episode #')
        plt.show()

In [1765]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.categorical import Categorical


class ActorNetwork(nn.Module):
    """
    Actor is a goal-conditioned policy network. Given a state and a goal state it
    either samples an action or evaluates the log-probability of given actions.

    The goal state and the current state are embedded by two separate two-layer
    branches (`layer_m*` and `layer_w*`); the concatenated features feed a linear
    layer whose softmax defines a categorical distribution over actions.
    """
    def __init__(self, state_size, action_size, seed=42, hidden_size1=64, hidden_size2=64):
        super(ActorNetwork, self).__init__()
        # Note: this reseeds torch's global RNG as a side effect.
        self.seed = torch.manual_seed(seed)
        self.layer_m1 = nn.Linear(state_size, hidden_size1)
        self.layer_m2 = nn.Linear(hidden_size1, hidden_size2)

        self.layer_w1 = nn.Linear(state_size, hidden_size1)
        self.layer_w2 = nn.Linear(hidden_size1, hidden_size2)

        self.mu = nn.Linear(2*hidden_size2, action_size)

    def forward(self, states, goal_states, actions=None):
        """Sample actions, or score the supplied ones.

        Args:
         states - batch of states, shape [batch x state_size]
         goal_states - batch of goal states, shape [batch x state_size]
         actions - optional action indices, shape [batch] or [batch x 1];
                   sampled from the policy when None

        Returns:
         actions - the sampled (shape [batch]) or supplied actions, detached
         action_log_prob - log-probability of each action, shape [batch x 1]
         entropy - entropy of each action distribution, shape [batch]
        """
        x_m = F.relu(self.layer_m1(goal_states))
        x_m = F.relu(self.layer_m2(x_m))

        x_w = F.relu(self.layer_w1(states))
        x_w = F.relu(self.layer_w2(x_w))
        x = torch.cat((x_m, x_w), 1)

        probs = F.softmax(self.mu(x), dim=1)
        action_dist = Categorical(probs)
        if actions is None:
            actions = action_dist.sample()

        # Categorical.log_prob broadcasts its argument against the batch shape, so a
        # [batch x 1] column would silently produce a [batch x batch] matrix.
        # Flatten such a column to one index per row before scoring.
        action_index = actions
        if action_index.dim() == 2 and action_index.shape[-1] == 1:
            action_index = action_index.squeeze(-1)

        action_log_prob = action_dist.log_prob(action_index)
        entropy = action_dist.entropy()
        return actions.detach(), action_log_prob.unsqueeze(1), entropy
    
class CriticNetwork(nn.Module):
    """
    Goal-conditioned critic: maps a (state, goal state) pair to one scalar value.

    Mirrors the actor's layout: one two-layer branch for the goal state
    (`layer_m*`), one for the current state (`layer_w*`), and a linear head
    (`layer3`) on the concatenated features.
    """

    def __init__(self, state_size, seed=42, hidden_size1=64, hidden_size2=64):
        super().__init__()
        # Note: this reseeds torch's global RNG as a side effect.
        self.seed = torch.manual_seed(seed)

        # Goal-state branch.
        self.layer_m1 = nn.Linear(state_size, hidden_size1)
        self.layer_m2 = nn.Linear(hidden_size1, hidden_size2)

        # Current-state branch.
        self.layer_w1 = nn.Linear(state_size, hidden_size1)
        self.layer_w2 = nn.Linear(hidden_size1, hidden_size2)

        # Value head over both branches.
        self.layer3 = nn.Linear(2 * hidden_size2, 1)

    def forward(self, states, goal_states):
        """Return value estimates of shape [batch x 1] for batched states and goal states."""
        goal_features = self.layer_m2(self.layer_m1(goal_states).relu()).relu()
        state_features = self.layer_w2(self.layer_w1(states).relu()).relu()
        joint = torch.cat([goal_features, state_features], dim=1)
        return self.layer3(joint)
    
class ActorCritic(nn.Module):
    """
    Actor-critic model: bundles the policy network (`actor`) and the value
    network (`critic`), both conditioned on a state and a goal state.
    """

    def __init__(self, state_size, action_size):
        super(ActorCritic, self).__init__()
        self.actor = ActorNetwork(state_size, action_size)
        self.critic = CriticNetwork(state_size)

    def act(self, states, goal_states):
        """Sample actions from the policy.

        Args:
         states - a batch [batch x state_size] or a single state [state_size]
         goal_states - goal states with the same layout as `states`

        Returns:
         actions - one action per row for batched input; a single (0-dim) action
                   when a single unbatched state was given
        """
        # `dim` is a method: the previous `states.dim==1` compared the bound method
        # with an int and was therefore always False.
        single_state = states.dim() == 1
        if single_state:
            # The networks concatenate along dim 1, so they need a batch axis.
            states = states.unsqueeze(0)
        if goal_states.dim() == 1:
            goal_states = goal_states.unsqueeze(0)

        actions, _, _ = self.actor(states, goal_states, None)
        return actions[0] if single_state else actions

    def evaluate(self, states, goal_states, actions):
        """Score `actions` under the current policy and estimate state values.

        Returns:
         values - critic output for each (state, goal state) pair
         logprobs - log-probability of each action under the actor
         entropy - entropy of each action distribution
        """
        _, logprobs, entropy = self.actor(states, goal_states, actions)
        values = self.critic(states, goal_states)
        return values, logprobs, entropy
    

class Memory():
    """
    Rollout buffer: parallel lists holding one entry per collected time step.

    Each entry is indexed by actor along its first axis, so `get_trajectory`
    can slice out the trajectory of a single actor.
    """

    def __init__(self):
        self.actions = []
        self.states = []
        self.goal_states = []
        self.next_states = []
        self.rewards = []
        self.dones = []

    def clear(self):
        """Empty every buffer in place (the list objects themselves are kept)."""
        for buffer in (self.states, self.goal_states, self.next_states,
                       self.actions, self.rewards, self.dones):
            buffer.clear()

    def get_trajectory(self, actor_i):
        """Return the stored trajectory of actor `actor_i`.

        States, goal states, actions and next states are returned as float
        tensors on the module-level `device`; rewards and dones stay numpy
        arrays reshaped to a single column.
        """
        def per_actor(buffer):
            # Pick this actor's row out of every stored time step.
            return np.array([step[actor_i] for step in buffer])

        def as_tensor(buffer):
            return torch.from_numpy(per_actor(buffer)).to(device).float()

        actions = as_tensor(self.actions)
        states = as_tensor(self.states)
        goal_states = as_tensor(self.goal_states)
        next_states = as_tensor(self.next_states)
        rewards = per_actor(self.rewards).reshape(-1, 1)
        dones = per_actor(self.dones).reshape(-1, 1)
        return states, goal_states, actions, rewards, next_states, dones
    
class PPOAgent():
    """Goal-conditioned PPO agent using a clipped surrogate objective.

    Transitions are accumulated in `self.memory`; once `T` of them have been
    collected the policy is optimised for `K_epochs` steps, `policy_old` is
    synchronised with `policy`, and the memory is cleared.

    Relies on a module-level `device` that must be defined before the agent is
    created (it is not defined in this block).
    """

    def __init__(self, state_size, action_size, n_actors=1, actor_critic=None, T=128, K_epochs=10, lr=5e-4,
                 lamb=0.8, gamma=0.97, eps=0.2, c1=0.5, c2=0.01):
        """Initializes agent object

        Args:
         action_size - action space dimensions
         state_size - state space dimensions
         n_actors - number of actors, equal to the number of distributed environments
         actor_critic - pretrained actor-critic network
         T - time steps to collect before updating the agent
         K_epochs - number of optimisation steps per update
         lr - learning rate for Adam optimizer
         lamb - smoothing parameter for generalized advantage estimator
         gamma - decay
         eps - clipping threshold
         c1 - weight for critic loss
         c2 - weight for entropy loss

        """
        self.policy = ActorCritic(state_size, action_size).to(device)
        if actor_critic is not None:
            self.policy.load_state_dict(actor_critic.state_dict())

        # `policy_old` is the behaviour policy used for acting and as the reference
        # in the probability ratio; it starts as an exact copy of `policy`.
        self.policy_old = ActorCritic(state_size, action_size).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.memory = Memory()
        self.n_actors = n_actors
        self.T = T
        self.K_epochs = K_epochs
        self.c1 = c1
        self.c2 = c2
        self.lamb = lamb
        self.gamma = gamma
        self.epsilon = eps
        self.mse_loss = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

    def act(self, states, goal_states):
        """Takes actions given batch of states

        Args:
         states - a batch of states (a single 1-D state is treated as a batch of one)
         goal_states - goal states matching `states`

        Returns:
         actions - numpy array with one action per state, sampled from `policy_old`
        """
        states = torch.from_numpy(np.vstack([states])).float().to(device)
        goal_states = torch.from_numpy(np.vstack([goal_states])).float().to(device)
        with torch.no_grad():
            actions = self.policy_old.act(states, goal_states)
            return actions.detach().cpu().numpy()

    def update(self, states, goal_states, actions, rewards, next_states, dones):
        """Stores one transition and, once `T` are collected, optimises the policy

        Args:
         states - states for parallel agents, shape [n_agents x state_size]
         goal_states - goal states for parallel agents, shape [n_agents x state_size]
         actions - actions for parallel agents, shape [n_agents x action_size]
         rewards - rewards for parallel agents, shape [n_agents x 1]
         next_states - next states for parallel agents, shape [n_agents x state_size]
         dones - episode finishing flags for parallel agents, shape [n_agents x 1]

        A single un-batched transition (1-D `states`) is accepted and treated as
        one agent.
        """

        if np.ndim(states) == 1:
            # Wrap in a list so a 1-D state becomes one row [1 x state_size].
            # np.vstack(states) would instead stack the individual scalars into a
            # [state_size x 1] column, and the per-actor slice in Memory would then
            # keep only the first feature.
            states = np.vstack([states])
            goal_states = np.vstack([goal_states])
            actions = np.vstack([actions])
            rewards = np.vstack([rewards])
            dones = np.vstack([dones])
            next_states = np.vstack([next_states])

        # Always store the incoming transition first; previously the transition that
        # arrived when the buffer was already full was silently dropped.
        self.memory.actions.append(actions)
        self.memory.states.append(states)
        self.memory.goal_states.append(goal_states)
        self.memory.rewards.append(rewards)
        self.memory.next_states.append(next_states)
        self.memory.dones.append(dones)

        # Keep collecting until T time steps are available.
        if len(self.memory.states) < self.T:
            return

        # Optimize
        for _ in range(self.K_epochs):
            loss = self._compute_loss()
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.memory.clear()

    def save(self, f):
        """Saves policy network to file

        Args:
         f - output file
        """
        torch.save(self.policy.state_dict(), f)

    def _compute_loss(self):
        """Returns the PPO loss averaged over actors for the stored trajectories.

        Per actor: -mean(clipped surrogate) + c1 * value MSE - c2 * mean entropy.
        """
        loss = 0
        for actor_i in range(self.n_actors):

            states, goal_states, actions, rewards, next_states, dones = self.memory.get_trajectory(actor_i)

            # Discrete actions are stored as a [T x 1] column. Categorical.log_prob
            # broadcasts its argument against the batch shape, so the column must be
            # flattened to [T] to get one log-probability per time step.
            if actions.dim() == 2 and actions.shape[1] == 1:
                actions = actions.squeeze(1)

            # Rewards are standardised over the stored trajectory before use.
            rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-5)
            rewards_to_go = self._compute_rewards_to_go(rewards, dones)

            values, logprobs, S = self.policy.evaluate(states, goal_states, actions)

            with torch.no_grad():
                values_old, logprobs_old, _ = self.policy_old.evaluate(states, goal_states, actions)
                values_next_old, _, _ = self.policy_old.evaluate(next_states, goal_states, None)
                values_old = values_old.detach().cpu().numpy()
                values_next_old = values_next_old.detach().cpu().numpy()

            ratios = torch.exp(logprobs - logprobs_old.detach())
            advantages = self._compute_advantages(rewards, values_old, values_next_old, dones)

            # Compute surrogate loss with clipping
            s1 = ratios * advantages
            s2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * advantages
            L_clip = torch.min(s1, s2)

            # Compute MSE loss for value functions
            L_vf = self.mse_loss(values, rewards_to_go)

            # Combine losses
            loss += -L_clip.mean() + self.c1*L_vf - self.c2*S.mean()

        return loss/self.n_actors

    def _compute_advantages(self, rewards, values, next_values, dones):
        """Returns advantages accumulated backwards from one-step TD errors.

        Both the bootstrap value and the running sum are zeroed at steps where
        `dones` is set, so nothing leaks across episode boundaries.
        """
        td_errors = rewards + self.gamma*next_values*(1-dones) - values
        A, advantages = 0, []
        for t in reversed(range(len(td_errors))):
            A = td_errors[t] + (self.lamb * self.gamma)*A*(1-dones[t])
            advantages.insert(0, A)
        return torch.from_numpy(np.array(advantages)).float().to(device)

    def _compute_rewards_to_go(self, rewards, dones):
        """Returns discounted returns, shape [T x 1], restarting at every done step."""
        rewards_to_go = []
        R = 0
        for reward, done in zip(reversed(rewards), reversed(dones)):
            R = reward + self.gamma * R * (1-done)
            rewards_to_go.insert(0, R)
        return torch.from_numpy(np.array(rewards_to_go)).float().reshape(-1, 1).to(device)

    def reset_episode(self):
        """End-of-episode hook; PPO keeps no per-episode state, so this only returns True."""
        return True

In [None]:
%%time
# Training cell: build the wrapped environment, create a PPO agent and train it.
env_name = 'MiniGrid-Empty-8x8-v0'
env = gen_wrapped_env(env_name)
# Length of the flat observation vector; used as the network input size.
state_size = env.observation_space.shape[0]
# Size of the policy's output layer, so sampled actions are indices 0..2.
# NOTE(review): presumably restricts the agent to the first three environment
# actions - confirm against the environment's action enumeration.
action_size = 3
# agent = DQNAgentGoal(state_size, action_size)
agent = PPOAgent(state_size, action_size)
run_episodes_and_display(env, agent, n_episodes=2000)

In [1750]:
# Evaluation cell: run one episode with train_mode=False (prints start/goal, shows the video).
# NOTE(review): `dqn_agent` is not defined in any visible cell, and `env_name` comes from
# the training cell - this cell depends on earlier kernel state; confirm before Run All.
env = gen_wrapped_env(env_name)
run_episode(env, dqn_agent, train_mode=False, show_steps=False)

From  [1 1] to [6 6]


(-32.55129855222072, 256)

In [1713]:
from torch.distributions.categorical import Categorical

# Scratch check of the Categorical API used by ActorNetwork, on a random batch of
# 2 rows with 3 action scores each.
inp = torch.randn(2, 3)
probs = F.softmax(inp, dim=1
                 )

action_dist = Categorical(probs)
actions = action_dist.sample()  # one sampled action index per row
action_log_prob = action_dist.log_prob(actions)  # log-probability of each sampled action
entropy = action_dist.entropy()  # one entropy value per row
entropy

tensor([1.0818, 0.8707])