## CS 330 Homework 3 Installation

The following code blocks will install the required libraries.


## Setup for Google Drive and Required Libraries


In [1]:
#@title Mount Google Drive
#@markdown Your work will be stored in a folder called `cs330_fall2020` by default to prevent Colab instance timeouts 
#@markdown from deleting your edits and requiring you to redownload the mujoco library. Feel free to use this if you want to write out plots.

import os
from google.colab import drive
drive.mount('/content/gdrive')

#@title set up mount symlink

DRIVE_PATH = '/content/gdrive/My\ Drive/cs330_fall2020'
DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
if not os.path.exists(DRIVE_PYTHON_PATH):
  %mkdir $DRIVE_PATH

## the space in `My Drive` causes some issues,
## make a symlink to avoid this
SYM_PATH = '/content/cs330_fall2020'
if not os.path.exists(SYM_PATH):
  !ln -s $DRIVE_PATH $SYM_PATH

Mounted at /content/gdrive


In [2]:
#@title Install Requirements
#@markdown Requirements for the assignment and display drivers

# Robot sim
# gym is pinned to 0.15.4; per the captured pip output this also pins
# pyglet<=1.3.2 and cloudpickle~=1.2.0, replacing the preinstalled versions.
!pip install gym==0.15.4
!pip install pygame

# Various things for render
!apt-get install python-opengl -y
!apt install xvfb -y

# Rendering Environment
!pip install pyvirtualdisplay
# NOTE(review): `piglet` is a different PyPI package from `pyglet`, the
# renderer gym pulls in above - confirm this line is intended.
!pip install piglet
!sudo apt-get install -y xvfb ffmpeg
!pip install imageio
!pip install PILLOW

Collecting gym==0.15.4
  Downloading gym-0.15.4.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 4.1 MB/s 
Collecting pyglet<=1.3.2,>=1.2.0
  Downloading pyglet-1.3.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 33.1 MB/s 
[?25hCollecting cloudpickle~=1.2.0
  Downloading cloudpickle-1.2.2-py2.py3-none-any.whl (25 kB)
Building wheels for collected packages: gym
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.15.4-py3-none-any.whl size=1648483 sha256=dcdc453f025e0b1a303cae7a8e07779fd07aaa7c8f58b2cccda607741cf6cd4c
  Stored in directory: /root/.cache/pip/wheels/27/97/51/3adbfe67f40bce89b8eba2d3b8f42ec1c9f9c1e6305a73510d
Successfully built gym
Installing collected packages: pyglet, cloudpickle, gym
  Attempting uninstall: pyglet
    Found existing installation: pyglet 1.5.0
    Uninstalling pyglet-1.5.0:
      Successfully uninstalled pyglet-1.5.0
  Attempting uninstall: cloudpickle
    Fo

In [3]:
#@title Download Mujoco from an online repository

# MuJoCo lives under the symlinked course folder (SYM_PATH is set by the
# drive-mount cell), so the download persists in Google Drive.
MJC_PATH = '{}/mujoco'.format(SYM_PATH)
if not os.path.exists(MJC_PATH):
  %mkdir $MJC_PATH
# Later cells in this notebook run relative to this directory.
%cd $MJC_PATH
# Download and unpack only once; the archive is renamed to `mujoco200`,
# which is the directory name the environment variables set later point at.
if not os.path.exists(os.path.join(MJC_PATH, 'mujoco200')):
  !wget -q https://www.roboti.us/download/mujoco200_linux.zip
  !unzip -q mujoco200_linux.zip
  %mv mujoco200_linux mujoco200
  %rm mujoco200_linux.zip

/content/gdrive/My Drive/cs330_fall2020/mujoco


In [4]:
#@title Important: ACTION Required BEFORE running this cell
#@markdown Place the mujoco key we have given you into a text file called mjkey.txt 
#@markdown and ensure that the mujoco key is in the Google Drive path `cs330_fall2020/mujoco`.

import os

os.environ['LD_LIBRARY_PATH'] += ':{}/mujoco200/bin'.format(MJC_PATH)
os.environ['MUJOCO_PY_MUJOCO_PATH'] = '{}/mujoco200'.format(MJC_PATH)
os.environ['MUJOCO_PY_MJKEY_PATH'] = '{}/mjkey.txt'.format(MJC_PATH)

## installation on colab does not find *.so files
## in LD_LIBRARY_PATH, copy over manually instead
!cp $MJC_PATH/mujoco200/bin/*.so /usr/lib/x86_64-linux-gnu/

In [5]:
#@title Important system updates for mujoco-py
# Refresh the package index, then install the build toolchain and the
# OpenGL / GLFW / OSMesa / X11 development packages listed below.
# Output of the install command is discarded (`> /dev/null 2>&1`), so a
# failed install is silent here and only shows up when mujoco-py is built.
!apt update 
!apt install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        gnupg2 \
        make \
        cmake \
        ffmpeg \
        swig \
        libz-dev \
        unzip \
        zlib1g-dev \
        libglfw3 \
        libglfw3-dev \
        libxrandr2 \
        libxinerama-dev \
        libxi6 \
        libxcursor-dev \
        libgl1-mesa-dev \
        libgl1-mesa-glx \
        libglew-dev \
        libosmesa6-dev \
        lsb-release \
        ack-grep \
        patchelf \
        wget \
        xpra \
        xserver-xorg-dev \
        xvfb \
        python-opengl \
        ffmpeg > /dev/null 2>&1

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [67.9 kB]
Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:13 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:14 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Ge

In [6]:
#@title Clone and install mujoco-py
#@markdown Remember that you need to put the key in the appropriate location as described above
%cd $MJC_PATH
# Clone only when the checkout is missing, so re-running the cell is safe.
if not os.path.exists('mujoco-py'):
  !git clone https://github.com/openai/mujoco-py.git
%cd mujoco-py
# Editable install from the checkout stored in Drive.
%pip install -e .

## cythonize at the first import
import mujoco_py

/content/gdrive/My Drive/cs330_fall2020/mujoco
/content/gdrive/My Drive/cs330_fall2020/mujoco/mujoco-py
Obtaining file:///content/gdrive/My%20Drive/cs330_fall2020/mujoco/mujoco-py
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting glfw>=1.4.0
  Using cached glfw-2.2.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (204 kB)
Collecting fasteners~=0.15
  Using cached fasteners-0.16.3-py2.py3-none-any.whl (28 kB)
Installing collected packages: glfw, fasteners, mujoco-py
  Running setup.py develop for mujoco-py
Successfully installed fasteners-0.16.3 glfw-2.2.0 mujoco-py-2.0.2.13


In [7]:
#@title Clone and install multiworld
%cd $SYM_PATH
!git clone https://github.com/vitchyr/multiworld.git

%cd multiworld
%pip install -e .


/content/gdrive/My Drive/cs330_fall2020
fatal: destination path 'multiworld' already exists and is not an empty directory.
/content/gdrive/My Drive/cs330_fall2020/multiworld
Obtaining file:///content/gdrive/My%20Drive/cs330_fall2020/multiworld
Installing collected packages: multiworld
  Running setup.py develop for multiworld
Successfully installed multiworld-0.0.0


In [8]:
#@title Sets up virtual display
from pyvirtualdisplay import Display
# Start an invisible (visible=0) 1400x900 virtual display for rendering.
# The name `display` is a kernel global holding the started Display object.
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7fb6b4ac5f10>

In [9]:
#@title Check imports and add helper functions for display

import os
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) # error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1

def show_video():
  '''Embed the first recorded episode video in the notebook output.

  Looks for `video/*.mp4` relative to the current working directory, base64
  encodes the first match and displays it in an inline HTML <video> tag.
  Prints "Could not find video" when no mp4 file is present.
  '''
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode inside a context manager: the handle is closed
    # deterministically and no write permission on the file is required.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  '''Wrap `env` in a gym Monitor that writes to ./video.

  force=True is passed to Monitor for the ./video directory. Returns the
  wrapped environment.
  '''
  return Monitor(env, './video', force=True)

In [10]:
#@title After running, you should see a video play
# Select matplotlib's non-interactive Agg backend.
matplotlib.use('Agg')

# Smoke test of the MuJoCo install: wrap Ant-v2 in the video Monitor.
env = wrap_env(gym.make("Ant-v2"))

observation = env.reset()
# Take up to 10 random actions, rendering each frame off-screen.
for i in range(10):
    env.render(mode='rgb_array')
    obs, rew, term, _ = env.step(env.action_space.sample() ) 
    if term:
      break;
            
# Close the wrapped env before looking for the recorded mp4.
env.close()
print('Loading video...')
show_video()

Loading video...


In [11]:
#@title Random seed is set to be fixed
import random

import numpy as np
import tensorflow

# Seed every random source this notebook draws from, not only TensorFlow:
# BitFlipEnv samples states and goals with np.random, and Buffer samples
# minibatches with the stdlib `random` module.
SEED = 330
tensorflow.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# BitFlip Goal Conditioned RL






## BitFlipEnv

Familiarize yourself with what the bit flip environment does and what each method does.

You do *NOT* need to modify the following cell.

In [None]:
class BitFlipEnv():
    '''Bit flipping environment for goal-conditioned reinforcement learning.

    The environment is a 1D vector of binary values (state vector).
    At each step, the actor flips a single bit (0 to 1 or 1 to 0).
    The goal is to flip bits until the state vector matches the
    goal vector (also a 1D vector of binary values). At each step,
    the actor receives a reward of -1 if the state and goal vector
    do not match and a reward of 0 if they match.

    Internally the state and goal vector are numpy arrays, which
    can be inspected through the show_goal and show_state methods.'''

    def __init__(self, num_bits, verbose = False):
        '''Initialize new instance of BitFlip class.
        inputs: num_bits - number of bits in the environment; must
                be a positive integer
                verbose - prints state and goal vector after each
                          step if True
        raises: ValueError if num_bits is not a positive integer'''

        # Check the type before comparing, so non-numeric input is reported
        # as an invalid argument instead of failing inside the comparison.
        # bool is excluded explicitly because it is a subclass of int.
        is_integer = (isinstance(num_bits, (int, np.integer))
                      and not isinstance(num_bits, bool))
        if not is_integer or num_bits <= 0:
            # Raise instead of printing and returning: a bare return would
            # hand back an object with none of its attributes set.
            raise ValueError("Invalid number of bits -  must be positive integer")

        # number of bits in the environment
        self.num_bits = num_bits
        # randomly set the state vector
        self.state_vector = np.random.randint(0, 2, num_bits)
        # randomly set the goal vector
        self.goal_vector = np.random.randint(0, 2, num_bits)
        # whether to print debugging info
        self.verbose = verbose
        # TODO set dimensions of observation space
        self.observation_space = self.state_vector
        # TODO create action space; may use gym type
        self.action_space = num_bits
        # space of the goal vector
        self.goal_space = self.goal_vector
        # number of steps taken
        self.steps = 0

    def show_goal(self):
        '''Returns the goal as a numpy array. Used for debugging.'''
        return self.goal_vector

    def show_state(self):
        '''Returns the state as a numpy array. Used for debugging.'''
        return self.state_vector

    def reset(self):
        '''Resets the environment and the step counter.

        outputs: (state_vector, goal_vector) as numpy arrays. Copies are
                 returned because step() mutates the internal state in
                 place; this matches step(), which also returns a copy.'''

        # randomly reset both the state and the goal vectors
        self.state_vector = np.random.randint(0, 2, self.num_bits)
        self.goal_vector = np.random.randint(0, 2, self.num_bits)
        self.steps = 0

        return np.copy(self.state_vector), np.copy(self.goal_vector)

    def step(self, action):
        '''Take a step and flip one of the bits.

        inputs: action - integer index of the bit to flip
        outputs: state - copy of the new state_vector (numpy array)
                 reward - -1 if state != goal and 0 if state == goal
                 done - boolean value indicating if the goal has been reached
                 steps - number of steps taken since the last reset
        raises: ValueError if action is outside [0, num_bits - 1]'''

        # Validate before touching any state, so a rejected action neither
        # counts as a step nor flips a bit.
        if action < 0 or action >= self.num_bits:
            raise ValueError(
                "Invalid action! Must be integer ranging from 0 to num_bits-1")

        self.steps += 1

        # flip the bit with index action
        if self.state_vector[action] == 1:
            self.state_vector[action] = 0
        else:
            self.state_vector[action] = 1

        # the episode is solved exactly when every bit matches the goal
        done = bool(np.array_equal(self.state_vector, self.goal_vector))
        reward = 0 if done else -1

        # print additional info if verbose mode is on
        if self.verbose:
            print("Bit flipped:   ", action)
            print("Goal vector:   ", self.goal_vector)
            print("Updated state: ", self.state_vector)
            print("Reward:        ", reward)

        # return the state as a copy; the info slot carries the step count
        return np.copy(self.state_vector), reward, done, self.steps


## Buffer
Familiarize yourself with what the buffer does.

You do *NOT* need to modify the following cell.

In [None]:
import numpy as np
import random
from collections import deque 

class Buffer(object):
    '''Bounded FIFO replay buffer of (state, action, reward, next_state) tuples.'''

    def __init__(self, size, sample_size):
        # size: maximum number of stored transitions
        # sample_size: number of transitions drawn by sample()
        self.size = size
        self.sample_size = sample_size
        self.buffer = deque()

    def add(self, state, action, reward, next_state):
        '''Append one transition, evicting the oldest once over capacity.'''
        transition = (state, action, reward, next_state)
        self.buffer.append(transition)

        if len(self.buffer) > self.size:
            self.buffer.popleft()

    def sample(self):
        '''Return a batch as four arrays: state, action, reward, next_state.

        Draws sample_size transitions without replacement; while fewer than
        sample_size transitions are stored, the whole buffer is returned in
        insertion order. state and next_state are reshaped to
        (batch, -1).'''
        if len(self.buffer) >= self.sample_size:
            batch = random.sample(self.buffer, self.sample_size)
        else:
            batch = self.buffer

        batch_len = len(batch)

        def column(field):
            # gather one field of every transition into an array
            return np.array([transition[field] for transition in batch])

        state = np.reshape(column(0), [batch_len, -1])
        action = column(1)
        reward = column(2)
        next_state = np.reshape(column(3), [batch_len, -1])

        return state, action, reward, next_state


## BitFlip Goal-Conditioned RL and Training

Implement the changes you need for Problems 1-3 here in the cells below.

In [None]:
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt


class Model(tf.keras.Model):
  '''Two-layer MLP: a 256-unit ReLU layer followed by a linear output layer.

  The output has `num_bits` units; flip_bits uses this network as the
  Q-policy and Q-target, taking the argmax over outputs as the action.
  '''

  def __init__(self, num_bits):
    super().__init__()

    hidden_units = 256
    self.dense1 = tf.keras.layers.Dense(hidden_units, activation=tf.nn.relu)
    self.out = tf.keras.layers.Dense(num_bits, activation=None)

  def call(self, inputs):
    '''Forward pass: hidden ReLU layer, then the linear output layer.'''
    hidden = self.dense1(inputs)
    outputs = self.out(hidden)
    return outputs


In [None]:
# ************   Helper functions    ************ #

def updateTarget(model, target_model, tau=0.95) :
    '''Soft-update target_model towards model.

    Each target weight becomes tau * target + (1 - tau) * model, so tau=0.0
    copies the model weights outright. Modifies target_model in place.
    '''
    online = model.get_weights()
    lagged = target_model.get_weights()
    blended = [tau * lagged[k] + (1 - tau) * online[k]
               for k in range(len(online))]
    target_model.set_weights(blended)

def solve_environment(num_bits, model, bit_env, state, goal_state, total_reward):
    '''Roll out the greedy policy for one goal-conditioned episode.

    inputs: num_bits - number of steps taken in the episode (one per bit)
        model - DQN to run prediction on
        bit_env - environment for bitflip
        state - current goal-conditioned state (state bits, then goal bits)
        goal_state - desired state; its first num_bits entries are appended
                     to every next state
        total_reward - cumulative reward so far
    outputs: (succeeded, episode_experience, total_reward), where
        episode_experience is a list of
        (state, action, reward, next_state, goal_state) tuples
    '''
    transitions = []
    reached_goal = False
    current = state

    # The episode always runs for num_bits steps, also after the goal has
    # been reached; reached_goal records whether any step ended on the goal.
    for _ in range(num_bits):
        # greedy action from the network's prediction on a batch of one
        q_values = model(np.expand_dims(current, axis=0))
        action = np.argmax(q_values, axis=1)[0]  # action as an integer

        env_state, reward, done, steps = bit_env.step(action)
        # re-attach the goal bits so the next state stays goal-conditioned
        following = np.concatenate([env_state, goal_state[:num_bits]])

        transitions.append((current, action, reward, following, goal_state))
        total_reward += reward
        current = following

        if done:
            reached_goal = True

    return reached_goal, transitions, total_reward

def solve_environment_no_goal(num_bits, model, bit_env, state, goal_state, total_reward):
    '''Roll out the greedy policy for one episode without goal conditioning.

    The state returned by the environment is used as the next state as-is;
    goal_state is only stored in the recorded transitions.

    outputs: (succeeded, episode_experience, total_reward), where
        episode_experience is a list of
        (state, action, reward, next_state, goal_state) tuples
    '''
    transitions = []
    reached_goal = False
    current = state

    # the episode always runs for num_bits steps
    for _ in range(num_bits):
        # forward pass on a batch of one; the action stays a length-1 array
        q_values = model(np.expand_dims(current, axis=0))
        action = np.argmax(q_values, axis = 1)

        following, reward, done, _ = bit_env.step(action)

        transitions.append((current, action, reward, following, goal_state))
        total_reward += reward
        current = following

        # remember whether any step of the episode ended on the goal
        if done:
            reached_goal = True

    return reached_goal, transitions, total_reward


def create_her_states_reward(experience, idx, s, s_, num_bits):
    '''
    Relabel one transition with a goal taken from the episode itself.

    The new goal is the next state stored in experience[idx]. The state and
    next state are rewritten to be conditioned on that goal, and the reward
    is recomputed against it.

    inputs: experience - list of transitions from the last episode
            idx - index of the transition whose next state becomes the goal
            s - current state
            s_ - next state
            num_bits - size of new goal
    outputs: a tri-tuple of state, reward and next state
    '''
    _state, _action, _reward, achieved, _goal = experience[idx]
    # the goal entry pairs the achieved bits with themselves
    relabeled_goal = create_her_state(achieved, achieved, num_bits)
    relabeled_state = create_her_state(s, achieved, num_bits)
    relabeled_next = create_her_state(s_, achieved, num_bits)
    new_reward = set_her_reward(relabeled_next, relabeled_goal)
    return relabeled_state, new_reward, relabeled_next

def create_her_state(state, goal_state, num_bits):
    '''Return a copy of `state` whose entries from index num_bits onwards are
    replaced by the first num_bits entries of `goal_state`.

    The input arrays are not modified.'''
    relabeled = np.copy(state)
    new_goal_bits = goal_state[:num_bits]
    relabeled[num_bits:] = new_goal_bits
    return relabeled

def set_her_reward(next_state, goal_state):
    '''Return 0 when next_state equals goal_state element-wise, else -1.'''
    reached_goal = np.equal(next_state, goal_state).all()
    return 0 if reached_goal else -1


def update_replay_buffer(num_bits, num_relabeled, replay_buffer, episode_experience, HER):
    '''Store an episode in the replay buffer, optionally with HER relabeling.

    Every real transition is added first. Depending on HER, relabeled copies
    of that transition are added afterwards:
      'None'   - nothing is added
      'final'  - one copy, goal taken from the last transition of the episode
      'future' - up to num_relabeled copies, goals sampled without
                 replacement from timesteps t .. num_bits - 1
      'random' - up to num_relabeled copies, goals sampled without
                 replacement from the whole episode
    Any other value prints a warning for each transition and adds nothing.

    inputs: num_bits - number of transitions read from episode_experience
            num_relabeled - maximum number of relabeled goals per transition
            replay_buffer - the buffer to store past experience in
            episode_experience - list of transitions from the last episode
            HER - type of hindsight experience replay to use
    modifies: replay_buffer
    outputs: None'''

    for t in range(num_bits):
        s, a, r, s_, _goal = episode_experience[t]
        # the real transition is always stored
        replay_buffer.add(s, a, r, s_)

        if HER == 'None':
            continue

        # choose which transitions of the episode supply the relabeled goals
        if HER == 'final':
            goal_indices = [-1]
        elif HER == 'future':
            # cap at the number of timesteps remaining in the episode
            goal_count = min(num_bits - t, num_relabeled)
            goal_indices = random.sample(range(t, num_bits), goal_count)
        elif HER == 'random':
            goal_count = min(num_bits, num_relabeled)
            goal_indices = random.sample(range(num_bits), goal_count)
        else:
            print("Invalid value for Her flag - HER not used")
            continue

        for goal_idx in goal_indices:
            state, r_, next_state = create_her_states_reward(
                episode_experience, goal_idx, s, s_, num_bits)
            replay_buffer.add(state, a, r_, next_state)
    return



In [None]:

# ************   Main training loop    ************ #


def flip_bits(num_bits, num_epochs, buffer_size = 1e6, batch_size = 128, 
              num_episodes = 16, num_relabeled = 4, gamma = 0.98, log_interval=5, opt_steps=40, HER = "None"):
    '''Main loop for running in the bit flipping environment. The DQN is
    trained over num_epochs. In each epoch, the agent runs in the environment
    num_episodes number of times and then takes opt_steps gradient steps on
    minibatches from the replay buffer. The Q-target network is soft-updated
    towards the Q-policy network at the end of each epoch. Within one episode,
    Q-policy attempts to solve the environment and is limited to the same
    number of steps as the size of the environment.

    inputs: num_bits - size of the bit vector (and number of actions)
            num_epochs - number of training epochs
            buffer_size - capacity of the replay buffer
            batch_size - minibatch size sampled from the replay buffer
            num_episodes - episodes collected per epoch
            num_relabeled - relabeled goals per transition when HER is used
            gamma - discount factor
            log_interval - print statistics every log_interval epochs
            opt_steps - gradient steps per epoch
            HER - string specifying whether to use HER
                  ('None', 'final', 'future' or 'random')
    outputs: list with the mean episode success rate of every epoch'''

    print("Running bit flip environment with %d bits and HER policy: %s" %(num_bits, HER))

    # create bit flipping environment and replay buffer
    bit_env = BitFlipEnv(num_bits)
    replay_buffer = Buffer(buffer_size,batch_size)

    # set up Q-policy (model) and Q-target (target_model)
    model = Model(num_bits)
    optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3)
    target_model = Model(num_bits)

    # Run one forward pass on a goal-conditioned input so both networks
    # build their weights before they are copied. reset() returns
    # (state, goal); binding the goal to any other name leaves goal_state
    # unassigned and the concatenation below raises UnboundLocalError.
    state, goal_state = bit_env.reset()
    state = np.concatenate([state, goal_state])  # state: (n,) -> (2*n,)
    inputs = np.expand_dims(state, axis=0)  # (1, 2 * n)
    model(inputs)
    target_model(inputs)

    # start by making Q-target and Q-policy the same
    updateTarget(model, target_model, tau=0.0)

    success_rate = []                # success rate for each epoch
    
    for i in range(num_epochs):
        # Run for a fixed number of epochs

        total_reward = 0.0           # total reward for the epoch
        successes = []               # success flag for each episode of the epoch
        losses = []                  # loss of each optimization step of the epoch

        for k in range(num_episodes):
            # Run in the environment for num_episodes  
            state, goal_state = bit_env.reset()             # reset the environment     
            # goal-conditioned input: state bits followed by goal bits
            state = np.concatenate([state, goal_state])
            goal_state = np.concatenate([goal_state, goal_state])
            # attempt to solve the environment (solve_environment_no_goal is
            # the variant without goal conditioning)
            succeeded, episode_experience, total_reward = solve_environment(num_bits, model, bit_env, state, goal_state, total_reward)
            successes.append(succeeded)                     # track whether we succeeded in environment 
            update_replay_buffer(num_bits, num_relabeled, replay_buffer, episode_experience, HER)   # add to the replay buffer; use specified  HER policy

        for k in range(opt_steps):
            # optimize the Q-policy network

            # sample from the replay buffer
            state,action,reward,next_state = replay_buffer.sample()

            # Bootstrapped target from the target network. It is computed in
            # numpy, so no gradient flows through it, and it is clipped to
            # the range of attainable returns [-1 / (1 - gamma), 0].
            target_net_Q = target_model(next_state)
            target_reward = np.clip(np.reshape(reward,[-1]) + gamma * np.reshape(np.max(target_net_Q,axis = -1),[-1]),-1. / (1 - gamma), 0)

            with tf.GradientTape() as tape:
              # Q-value of the action actually taken, selected by one-hot mask
              model_predict = model(state)
              model_action_taken = np.reshape(action,[-1])
              action_one_hot = tf.one_hot(model_action_taken, num_bits)
              Q_val = tf.reduce_sum(model_predict * action_one_hot, axis=1)
              loss = tf.reduce_mean(tf.square(Q_val - target_reward))
            losses.append(loss)
            
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            
        # soft update: blend Q-policy into Q-target using updateTarget's default tau
        updateTarget(model, target_model)
        success_rate.append(np.mean(successes))       # append mean success rate for this epoch

        if i % log_interval == 0:
            print('Epoch: %d  Cumulative reward: %f  Success rate: %.4f Mean loss: %.4f' % (i, total_reward, np.mean(successes), np.mean(losses)))
                
    return success_rate

In [None]:
# Baseline run: 7 bits, 150 epochs, no hindsight relabeling.
success_rate  = flip_bits(num_bits=7, num_epochs=150, HER='None') 

Running bit flip environment with 7 bits and HER policy: None
Epoch: 0  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.1603
Epoch: 5  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0051
Epoch: 10  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0029
Epoch: 15  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0024
Epoch: 20  Cumulative reward: -110.000000  Success rate: 0.0625 Mean loss: 0.0048
Epoch: 25  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0037
Epoch: 30  Cumulative reward: -109.000000  Success rate: 0.0625 Mean loss: 0.0066
Epoch: 35  Cumulative reward: -109.000000  Success rate: 0.0625 Mean loss: 0.0048
Epoch: 40  Cumulative reward: -108.000000  Success rate: 0.0625 Mean loss: 0.0072
Epoch: 45  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0063
Epoch: 50  Cumulative reward: -110.000000  Success rate: 0.0625 Mean loss: 0.0070
Epoch: 55  Cumulative reward: -101.000

In [None]:
from IPython.display import HTML
from plotly import graph_objs as go
def plot_success(exp_to_accuracies, text, epochs_per_point=1):
    '''Build a plotly figure of success rate against epoch.

    inputs: exp_to_accuracies - dict mapping an experiment name to its list
                                of success rates
            text - figure title
            epochs_per_point - number of epochs between consecutive entries
                               of each list. flip_bits records one entry per
                               epoch, hence the default of 1; pass 5 to get
                               the previous fixed five-epoch spacing.
    outputs: a plotly Figure with one spline trace per experiment
    '''
    # one trace per experiment; x is the epoch at which each point was recorded
    data = []
    for experiment, accuracies in exp_to_accuracies.items():
        steps = [epochs_per_point * x for x in range(len(accuracies))]
        data.append(go.Scatter(x=steps, y=accuracies, line_shape='spline', name=experiment))

    # both axis titles share the same font settings
    axis_title_font = dict(
        family='Courier New, monospace',
        size=18,
        color='#7f7f7f'
    )

    # Applies a custom layout
    layout = go.Layout(
        title=go.layout.Title(
            text=text,
            x=0.5
        ),
        xaxis=go.layout.XAxis(
            title=go.layout.xaxis.Title(
                text='Epoch',
                font=dict(axis_title_font)
            )
        ),
        yaxis=go.layout.YAxis(
            title=go.layout.yaxis.Title(
                text='Success Rate',
                font=dict(axis_title_font)
            )
        )
    )

    return go.Figure(data=data, layout=layout)
    

In [None]:
# Sanity check: flip_bits returns a list with one success rate per epoch.
type(success_rate), len(success_rate)

(list, 150)

In [None]:
# Sample commands have been provided to you below
# run with type of HER specified
# 7-bit comparison over 250 epochs: no relabeling vs. 'final' relabeling.
success_rate  = flip_bits(num_bits=7, num_epochs=250, HER='None') 
success_rate_final = flip_bits(num_bits=7, num_epochs=250, HER='final')
# pass success rate for each run as first argument and labels as second list

Running bit flip environment with 7 bits and HER policy: None
Epoch: 0  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.1779
Epoch: 5  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0053
Epoch: 10  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0024
Epoch: 15  Cumulative reward: -109.000000  Success rate: 0.0625 Mean loss: 0.0042
Epoch: 20  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0033
Epoch: 25  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0022
Epoch: 30  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0041
Epoch: 35  Cumulative reward: -109.000000  Success rate: 0.0625 Mean loss: 0.0027
Epoch: 40  Cumulative reward: -112.000000  Success rate: 0.0000 Mean loss: 0.0038
Epoch: 45  Cumulative reward: -109.000000  Success rate: 0.0625 Mean loss: 0.0042
Epoch: 50  Cumulative reward: -109.000000  Success rate: 0.0625 Mean loss: 0.0045
Epoch: 55  Cumulative reward: -109.000

In [None]:
# Map each trace label to the success rates of the matching 7-bit run.
exp_to_accuracies = {
    "bitflip_7_her_none": success_rate,
    "bitflip_7_her_final": success_rate_final
}
text = 'Bitflip with 7 bits'
fig = plot_success(exp_to_accuracies, text)
# Render the plotly figure as HTML in the cell output.
HTML(fig.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# 15-bit comparison over 500 epochs: no relabeling vs. 'final' relabeling.
# Note: this rebinds success_rate / success_rate_final from the 7-bit runs.
success_rate = flip_bits(num_bits=15, num_epochs=500, HER='None')
success_rate_final = flip_bits(num_bits=15, num_epochs=500, HER='final')

Running bit flip environment with 15 bits and HER policy: None
Epoch: 0  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.1258
Epoch: 5  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0053
Epoch: 10  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0025
Epoch: 15  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0017
Epoch: 20  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0013
Epoch: 25  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0013
Epoch: 30  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0011
Epoch: 35  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0012
Epoch: 40  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0010
Epoch: 45  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0010
Epoch: 50  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0010
Epoch: 55  Cumulative reward: -240.00

In [None]:
# Map each trace label to the success rates of the matching 15-bit run.
exp_to_accuracies = {
    "bitflip_15_her_none": success_rate,
    "bitflip_15_her_final": success_rate_final
}
text = 'Bitflip with 15 bits'
fig = plot_success(exp_to_accuracies, text)
# Render the plotly figure as HTML in the cell output.
HTML(fig.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# 25-bit comparison over 1000 epochs: no relabeling vs. 'final' relabeling.
# Note: this rebinds success_rate / success_rate_final from the earlier runs.
success_rate = flip_bits(num_bits=25, num_epochs=1000, HER='None')
success_rate_final = flip_bits(num_bits=25, num_epochs=1000, HER='final')

Running bit flip environment with 25 bits and HER policy: None
Epoch: 0  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.1010
Epoch: 5  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.0062
Epoch: 10  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.0025
Epoch: 15  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.0020
Epoch: 20  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.0016
Epoch: 25  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.0014
Epoch: 30  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.0014
Epoch: 35  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.0015
Epoch: 40  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.0013
Epoch: 45  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.0016
Epoch: 50  Cumulative reward: -400.000000  Success rate: 0.0000 Mean loss: 0.0012
Epoch: 55  Cumulative reward: -400.00

In [None]:
# Map each trace label to the success rates of the matching 25-bit run.
exp_to_accuracies = {
    "bitflip_25_her_none": success_rate,
    "bitflip_25_her_final": success_rate_final
}
text = 'Bitflip with 25 bits'
fig = plot_success(exp_to_accuracies, text)
# Render the plotly figure as HTML in the cell output.
HTML(fig.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# 15-bit comparison over 500 epochs across all four relabeling settings.
# The 'None' and 'final' runs are repeated here, rebinding the earlier results.
success_rate = flip_bits(num_bits=15, num_epochs=500, HER='None')
success_rate_final = flip_bits(num_bits=15, num_epochs=500, HER='final')
success_rate_future = flip_bits(num_bits=15, num_epochs=500, HER='future')
success_rate_random = flip_bits(num_bits=15, num_epochs=500, HER='random')

Running bit flip environment with 15 bits and HER policy: None
Epoch: 0  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.1270
Epoch: 5  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0032
Epoch: 10  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0020
Epoch: 15  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0012
Epoch: 20  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0013
Epoch: 25  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0013
Epoch: 30  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0012
Epoch: 35  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0011
Epoch: 40  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0011
Epoch: 45  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0012
Epoch: 50  Cumulative reward: -240.000000  Success rate: 0.0000 Mean loss: 0.0012
Epoch: 55  Cumulative reward: -240.00

In [None]:
# Map each trace label to the success rates of the matching 15-bit run.
exp_to_accuracies = {
    "bitflip_15_her_none": success_rate,
    "bitflip_15_her_final": success_rate_final,
    "bitflip_15_her_future": success_rate_future,
    "bitflip_15_her_random": success_rate_random,
}
text = 'Bitflip with 15 bits'
fig = plot_success(exp_to_accuracies, text)
# Render the plotly figure as HTML in the cell output.
HTML(fig.to_html())

Output hidden; open in https://colab.research.google.com to view.

# Sawyer Environment Goal-Conditioned RL

In [12]:
#@title Buffer
#@markdown Same as the Buffer class before but placed here in the event you want to run the sections separately
import numpy as np
import random
from collections import deque 

class Buffer(object) :
    '''Fixed-capacity FIFO replay buffer of (state, action, reward, next_state) transitions.

    Once the buffer is full, adding a transition discards the oldest one.
    '''

    def __init__(self,size,sample_size):
        '''
        inputs: size - maximum number of transitions kept; may be a float such as 1e6
                sample_size - number of transitions returned by sample()
        '''
        self.size = size
        self.sample_size = sample_size
        # A bounded deque evicts its oldest entry automatically on overflow.
        # maxlen must be a non-negative int, while callers pass floats like 1e6.
        try:
            maxlen = max(int(size), 0)
        except OverflowError:
            # infinite capacity requested -> unbounded buffer
            maxlen = None
        self.buffer = deque(maxlen=maxlen)

    def add(self,state,action,reward,next_state) :
        '''Append one transition; the oldest transition is dropped when the buffer is full.'''
        self.buffer.append((state,action,reward,next_state))

    def sample(self) :
        '''Return a batch as four arrays: state, action, reward, next_state.

        While fewer than sample_size transitions are stored, every stored
        transition is returned (in insertion order); otherwise sample_size
        transitions are drawn without replacement. state and next_state are
        reshaped to (batch, -1).

        raises: ValueError if the buffer is empty
        '''
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay buffer")

        if len(self.buffer) < self.sample_size:
            samples = self.buffer
        else:
            samples = random.sample(self.buffer,self.sample_size)

        # transpose the list of transitions into one tuple per field
        states, actions, rewards, next_states = zip(*samples)
        batch = len(states)

        state = np.reshape(np.array(states),[batch,-1])
        action = np.array(actions)
        reward = np.array(rewards)
        next_state = np.reshape(np.array(next_states),[batch,-1])

        return state, action, reward, next_state



In [13]:
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import multiworld
import glfw

# Make multiworld's environments available before any of them is created below
# (presumably this registers them with gym so gym.make can resolve the Sawyer id - TODO confirm).
multiworld.register_all_envs()   

class Model(tf.keras.Model):
  '''Small fully connected Q-network.

  A 256-unit ReLU hidden layer followed by a linear layer that emits one
  value per action (num_act outputs).
  '''

  def __init__(self, num_act):
    super().__init__()

    n_hidden = 256
    # hidden layer
    self.dense1 = tf.keras.layers.Dense(n_hidden, activation=tf.nn.relu)
    # linear output layer, one unit per action
    self.out = tf.keras.layers.Dense(num_act, activation=None)

  def call(self, inputs):
    '''Forward pass: inputs -> hidden layer -> per-action outputs.'''
    hidden = self.dense1(inputs)
    per_action = self.out(hidden)
    return per_action

pygame 2.0.1 (SDL 2.0.14, Python 3.7.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [34]:
# ************   Helper functions    ************ #
# Globals

# NUM_DIM is not referenced by the code visible in this section
# (presumably the dimensionality of the (x, y) arm position - TODO confirm).
NUM_DIM = 2
# number of discrete actions: size of the Q-network output and of take_action's action table
NUM_ACT = 4
# take_action reports done once the step reward exceeds this value
done_threshold = -0.01
# a single wrapped environment instance, reachable under both names (Sawyer_Env and env)
Sawyer_Env = env = wrap_env(gym.make('SawyerReachXYEnv-v1'))

def updateTarget(model, target_model, tau=0.95) :
    '''Blend the Q-policy weights into the Q-target network, in place.

    Every target weight becomes tau * target + (1 - tau) * policy, so tau is the
    fraction of the old target that is kept; tau=0.0 copies the policy weights.

    inputs:   model - network whose weights are read (get_weights)
              target_model - network whose weights are read and overwritten
              tau - fraction of the current target weights to retain
    modifies: target_model
    outputs:  None'''
    online = model.get_weights()
    current = target_model.get_weights()
    keep, take = tau, 1 - tau
    blended = [keep * current[k] + take * w for k, w in enumerate(online)]
    target_model.set_weights(blended)

def take_action(action, render):
    '''Apply one discrete action chosen by the Q-network to the Sawyer arm.

    The done flag returned here is not simply the environment's own flag:
    because the actions are discretized the arm may never hit the goal exactly,
    so done is also set to True whenever the step reward is above
    done_threshold.

    inputs:  action - integer (0 to NUM_ACT-1) selected by the Q-network
             render - if truthy, render the environment after the step
    outputs: next_state - first two entries of the returned observation
                          (the arm's (x, y) location)
             reward - reward returned by the Sawyer environment
             done - environment done flag, or True if reward > done_threshold
             info - info object returned by the environment step'''

    # discrete index -> 2-D command for the arm; there MUST be NUM_ACT entries
    moves = dict(enumerate(([-1, 0], [1, 0], [0, -1], [0, 1])))
    command = np.array(moves[action], dtype=np.float32)

    # step the environment, optionally rendering the result
    ob, reward, done, info = Sawyer_Env.step(command)
    if render:
        Sawyer_Env.render(mode='rgb_array')

    # "close enough" to the goal counts as solved
    if reward > done_threshold:
        done = True

    # only the first two observation entries are used as the state
    return ob['observation'][0:2], reward, done, info

def solve_environment(model, state, goal_state, total_reward, steps_per_episode, render):
    '''Roll out the current goal-conditioned greedy policy for one episode.

    The episode always runs for steps_per_episode steps; it is not cut short
    when the goal is reached.

    inputs:  model - Q-network; called on the state with a leading batch axis
             state - goal-conditioned start state (run_sawyer builds it as
                     observation followed by goal)
             goal_state - goal vector whose first half is appended to every new
                          observation (run_sawyer passes the goal repeated twice)
             total_reward - running reward total to add this episode's rewards to
             steps_per_episode - number of environment steps to take
             render - forwarded to take_action
    outputs: succeeded - True if any step reported done
             episode_experience - list of (state, action, reward, next_state, goal_state)
             total_reward - updated running total'''
    episode_experience = []
    succeeded = False
    # the first half of goal_state is the actual goal
    goal_part = goal_state[:int(goal_state.size / 2)]

    for _ in range(steps_per_episode):
        # greedy action from the Q-network (batch of one, float32)
        batch = np.array(np.expand_dims(state, axis=0), dtype=np.float32)
        q_values = model(batch)
        action = np.argmax(q_values, axis=1)[0]

        # act, then re-attach the goal to the new observation
        observed, reward, done, _ = take_action(action, render)
        next_state = np.concatenate([observed, goal_part])

        # record the transition and advance
        episode_experience.append((state, action, reward, next_state, goal_state))
        total_reward += reward
        state = next_state

        if done:
            succeeded = True

    return succeeded, episode_experience, total_reward


def solve_environment_no_goal(model, state, goal_state, total_reward, steps_per_episode, render):
    '''Roll out the current greedy policy for one episode without goal conditioning.

    The state is fed to the network as-is; goal_state is only stored alongside
    each transition. The episode always runs for steps_per_episode steps, after
    which the module-level env's stats recorder is saved and flagged as done.

    outputs: succeeded - True if any step reported done
             episode_experience - list of (state, action, reward, next_state, goal_state)
             total_reward - the passed-in total plus this episode's rewards'''
    episode_experience = []
    succeeded = False

    for _ in range(steps_per_episode):
        # greedy action from the Q-network (batch of one, float32)
        batch = np.array(np.expand_dims(state, axis=0), dtype=np.float32)
        q_values = model(batch)
        action = np.argmax(q_values, axis=1)[0]
        next_state, reward, done, _ = take_action(action, render)

        # record the transition and advance
        episode_experience.append((state, action, reward, next_state, goal_state))
        total_reward += reward
        state = next_state

        if done:
            succeeded = True

    # the loop never breaks, so this always runs once the episode is over
    env.stats_recorder.save_complete()
    env.stats_recorder.done = True

    return succeeded, episode_experience, total_reward


def create_sawyer_her_states_reward(experience, idx, s, s_, vec_orig_size):
    '''
    Relabel one transition with a goal taken from the episode itself.

    The next state stored at experience[idx] supplies the new goal; the state,
    next state and reward of the transition (s, s_) are then rebuilt for it.

    inputs: experience - list of transitions from the last episode
            idx - index of the transition whose next state becomes the new goal
            s - current state
            s_ - next state
            vec_orig_size - size of the new goal (length of the non-goal part of a state)
    outputs: a tri-tuple of (state, reward, next state), all relative to the new goal
    '''
    _, _, _, achieved, _ = experience[idx]
    # the achieved state relabeled with itself is the reference "at the goal" vector
    goal_state = create_her_state(achieved, achieved, vec_orig_size)
    # swap the goal part of both ends of the transition
    state = create_her_state(s, achieved, vec_orig_size)
    next_state = create_her_state(s_, achieved, vec_orig_size)
    return state, set_sawyer_her_reward(next_state, goal_state), next_state


def create_her_state(state, goal_state, vec_orig_size):
    '''Return a copy of state whose entries from vec_orig_size onward are replaced
    by the first vec_orig_size entries of goal_state. The input is not modified.'''
    relabeled = np.array(state, copy=True)
    new_goal = goal_state[:vec_orig_size]
    relabeled[vec_orig_size:] = new_goal
    return relabeled

def set_sawyer_her_reward(next_state, goal_state):
    '''Reward for a relabeled transition: the negative Euclidean distance
    between next_state and goal_state.'''
    offset = next_state - goal_state
    squared_distance = np.sum(np.square(offset))
    return -np.sqrt(squared_distance)

def update_replay_buffer(steps_per_episode, num_relabeled, replay_buffer, episode_experience, HER):
    '''adds past experience to the replay buffer. Training is done with episodes from the replay
    buffer. When HER is used, relabeled copies of each transition are added as well:
    one per step for 'final', up to num_relabeled per step for 'future' and 'random'.

    inputs:    steps_per_episode - number of transitions to read from episode_experience
               num_relabeled - maximum number of relabeled goals per step
               replay_buffer - buffer exposing add(state, action, reward, next_state)
               episode_experience - list of (state, action, reward, next_state, goal)
                                    transitions from the last episode
               HER - 'None', 'final', 'future' or 'random'; any other value is
                     reported once and treated as 'None'
    modifies:  replay_buffer
    outputs:   None'''
    strategy = HER
    if strategy not in ('None', 'final', 'future', 'random'):
        # warn once per call instead of once per transition
        print("Invalid value for Her flag - HER not used")
        strategy = 'None'

    for t in range(steps_per_episode) :
        # copy the actual experience from episode_experience to replay_buffer
        s,a,r,s_,g = episode_experience[t]
        replay_buffer.add(s,a,r,s_)

        if strategy == 'None':
            # HER not being used, nothing to relabel
            continue

        # pick the transitions whose next state will serve as the new goal
        if strategy == 'final':
            # final - relabel based on the last transition of the episode
            goal_indices = [-1]
        elif strategy == 'future':
            # future - relabel with randomly selected timesteps between t and the
            # end of the episode; cap the count at the number of timesteps left
            num_goals = min(steps_per_episode - t , num_relabeled)
            goal_indices = random.sample(range(t, steps_per_episode), num_goals)
        else:
            # random - relabel with randomly selected timesteps from the whole episode
            num_goals = min(steps_per_episode , num_relabeled)
            goal_indices = random.sample(range(steps_per_episode), num_goals)

        # states carry the goal in their second half, so half of g is the goal size
        vec_orig_size = int(g.size / 2)
        for g_idx in goal_indices:
            her_s, her_r, her_s_ = create_sawyer_her_states_reward(episode_experience, g_idx, s, s_, vec_orig_size)
            replay_buffer.add(her_s, a, her_r, her_s_)
    return


In [24]:
# ************   Main Training Loop    ************ #

def run_sawyer(num_epochs, buffer_size=1e6, batch_size=128, 
               num_episodes=16, num_relabeled=4, gamma=0.98, log_interval=5, opt_steps=40,
               steps_per_episode=50, render=False, HER="None"):
    '''Train a goal-conditioned DQN on the Sawyer reach environment.

    Each epoch runs num_episodes episodes with the current Q-policy, stores the
    transitions (plus HER relabels, depending on the HER flag) in a replay
    buffer, performs opt_steps gradient steps on sampled batches, and then
    blends the Q-policy weights into the Q-target network. Every episode lasts
    steps_per_episode environment steps.

    inputs: num_epochs - number of training epochs
            buffer_size - replay buffer capacity
            batch_size - number of transitions sampled per gradient step
            num_episodes - episodes collected per epoch
            num_relabeled - relabeled goals per step, passed to update_replay_buffer
            gamma - discount factor used in the Q-target
            log_interval - statistics are printed every log_interval epochs
            opt_steps - gradient steps per epoch
            steps_per_episode - environment steps per episode
            render - passed through to solve_environment
            HER - string specifying whether to use HER
    outputs: success_rate - list holding the mean episode success of every epoch'''
    # create the replay buffer (the environment is the module-level Sawyer_Env)
    replay_buffer = Buffer(buffer_size,batch_size)

    # set up Q-policy (model) and Q-target (target_model)
    model = Model(NUM_ACT)
    target_model = Model(NUM_ACT)

    optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3)

    # ======================== TODO modify code ========================
    # modify to be goal conditioned
    # One reset is used only to obtain a correctly shaped input for the two networks.
    reset_state = Sawyer_Env.reset()  
    state = reset_state['observation'][:2]          # look up the state
    goal_state = reset_state['desired_goal'][:2]
    # goal conditioning: the network input is the observation followed by the goal;
    # goal_state is stored as the goal repeated twice so it has the same length
    state = np.concatenate([state, goal_state])
    goal_state = np.concatenate([goal_state, goal_state])
    # call both networks once on a batch of one so that their weights exist
    # before updateTarget reads and writes them
    inputs = np.expand_dims(state, axis=0)
    model(inputs)
    target_model(inputs)

    # start by making Q-target and Q-policy the same (tau=0.0 keeps nothing of the old target)
    updateTarget(model, target_model, tau=0.0)

    # ========================      END TODO       ========================

    total_loss = []                  # training loss for each epoch (never filled in below)
    success_rate = []                # success rate for each epoch
    
    for i in range(num_epochs):
        # Run for a fixed number of epochs

        total_reward = 0.0           # total reward for the epoch
        successes = []               # record success rate for each episode of the epoch
        losses = []                  # loss of every optimization step in this epoch

        for k in range(num_episodes):
            reset_state = Sawyer_Env.reset()                # reset the environment
            state = reset_state['observation'][:2]          # look up the state
            goal_state = reset_state['desired_goal'][:2]    # look up the goal

            # attempt to solve the environment
            # ======================== TODO modify code ========================
            # modify to be goal conditioned
            # same layout as above: state = [observation, goal], goal_state = [goal, goal]
            state = np.concatenate([state, goal_state])
            goal_state = np.concatenate([goal_state, goal_state])
            # non-goal-conditioned variant, kept for reference:
            # succeeded, episode_experience, total_reward = solve_environment_no_goal(model, state, goal_state, total_reward, steps_per_episode, render)
            succeeded, episode_experience, total_reward = solve_environment(model, state, goal_state, total_reward, steps_per_episode, render)
            # ========================      END TODO       ========================

            successes.append(succeeded)                     # track whether we succeeded in environment 
            update_replay_buffer(steps_per_episode, num_relabeled, replay_buffer, episode_experience, HER)   # add to the replay buffer; use specified  HER policy
            # env is the same object as Sawyer_Env; it is closed after every episode
            env.close() 
            glfw.terminate()
        for k in range(opt_steps):
            # optimize the Q-policy network

            # sample from the replay buffer
            state,action,reward,next_state = replay_buffer.sample()
            state = np.array(state, dtype=np.float32) 
            next_state = np.array(next_state, dtype=np.float32) 
            # forward pass through target network   

            with tf.GradientTape() as tape:
              target_net_Q = target_model(next_state)
              # Q-target: reward + gamma * max_a Q_target(next_state, a), clipped to
              # [-1 / (1 - gamma), 0]. It is built with NumPy ops, so it enters the
              # loss as a plain array rather than a traced tensor.
              target_reward = np.clip(np.reshape(reward,[-1]) + gamma * np.reshape(np.max(target_net_Q,axis = -1),[-1]),-1. / (1 - gamma), 0)
              # calculate loss: mean squared error on the Q-value of the action taken
              model_predict = model(state)
              model_action_taken = np.reshape(action,[-1])
              action_one_hot = tf.one_hot(model_action_taken, NUM_ACT)
              # the one-hot mask selects Q(state, action taken) from the per-action outputs
              Q_val = tf.reduce_sum(model_predict * action_one_hot, axis=1)
              loss = tf.reduce_mean(tf.square(Q_val - target_reward))
              losses.append(loss)
            
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            
        # soft update: with updateTarget's default tau the target keeps 95% of its old weights
        updateTarget(model, target_model)               # blend Q-policy into Q-target      
        success_rate.append(np.mean(successes))       # append mean success rate for this epoch

        if i % log_interval == 0:
            print('Epoch: %d  Cumulative reward: %f  Success rate: %.4f Mean loss: %.4f' % (i, total_reward, np.mean(successes), np.mean(losses)))
    return success_rate

In [25]:
from IPython.display import HTML
from plotly import graph_objs as go
def plot_success(exp_to_accuracies, text, epochs_per_point=5):
    '''Build a plotly figure comparing the success-rate curves of several experiments.

    inputs:  exp_to_accuracies - dict mapping an experiment name (used as the
                                 legend label) to its sequence of success rates
             text - figure title
             epochs_per_point - number of epochs between consecutive entries of
                                each sequence. Defaults to 5, the spacing that
                                used to be hard-coded; pass 1 for sequences that
                                hold one value per epoch (as run_sawyer returns),
                                otherwise the 'Epoch' axis is stretched 5x
    outputs: fig - plotly Figure; the caller is responsible for displaying it
    '''
    # one smoothed line per experiment
    data = []
    for experiment, accuracies in exp_to_accuracies.items():
        steps = [epochs_per_point * x for x in range(len(accuracies))]
        data.append(go.Scatter(x=steps, y=accuracies, line_shape='spline', name=experiment))

    def axis_title_font():
        # fresh dict per axis so the two axis titles never share state
        return dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )

    # Applies a custom layout: centered title and labelled axes
    layout = go.Layout(
        title=go.layout.Title(
            text=text,
            x=0.5
        ),
        xaxis=go.layout.XAxis(
            title=go.layout.xaxis.Title(
                text='Epoch',
                font=axis_title_font()
            )
        ),
        yaxis=go.layout.YAxis(
            title=go.layout.yaxis.Title(
                text='Success Rate',
                font=axis_title_font()
            )
        )
    )

    # the figure is created once, from the finished traces and layout
    fig = go.Figure(data=data, layout=layout)
    return fig
    

In [19]:
#@title Runs
#@markdown run on Sawyer Environment (no goal-conditioned and no HER)
# Train for 150 epochs with HER disabled and keep the per-epoch success rates for plotting.
# NOTE(review): whether this run is goal-conditioned depends on which definition of
# run_sawyer was active when the cell ran - confirm before relying on the saved output.
success_rate = run_sawyer(num_epochs=150, HER = "None")

Epoch: 0  Cumulative reward: -107.565840  Success rate: 0.0625 Mean loss: 0.0048
Epoch: 5  Cumulative reward: -128.197736  Success rate: 0.0625 Mean loss: 0.0052
Epoch: 10  Cumulative reward: -143.172667  Success rate: 0.0000 Mean loss: 0.0041
Epoch: 15  Cumulative reward: -124.509480  Success rate: 0.0625 Mean loss: 0.0040
Epoch: 20  Cumulative reward: -112.911280  Success rate: 0.0000 Mean loss: 0.0037
Epoch: 25  Cumulative reward: -118.367723  Success rate: 0.0000 Mean loss: 0.0037
Epoch: 30  Cumulative reward: -110.712553  Success rate: 0.0000 Mean loss: 0.0037
Epoch: 35  Cumulative reward: -118.898256  Success rate: 0.0625 Mean loss: 0.0035
Epoch: 40  Cumulative reward: -144.065362  Success rate: 0.0000 Mean loss: 0.0036
Epoch: 45  Cumulative reward: -110.821143  Success rate: 0.0625 Mean loss: 0.0035
Epoch: 50  Cumulative reward: -129.904861  Success rate: 0.0000 Mean loss: 0.0035
Epoch: 55  Cumulative reward: -126.549859  Success rate: 0.0625 Mean loss: 0.0034
Epoch: 60  Cumulat

In [None]:
# If you chose to render:
# show_video()

In [22]:
# Single curve: the run without HER.
exp_to_accuracies = {
    "sawyer_none": success_rate,
}
# Figure title
text = 'Sawyer'
# NOTE(review): plot_success spaces points 5 apart on the 'Epoch' axis, while run_sawyer
# returns one value per epoch, so the x-values shown are 5x the epoch index.
fig = plot_success(exp_to_accuracies, text)
# Last expression of the cell: embed the figure's HTML in the cell output.
HTML(fig.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [32]:
#@title Goal Conditioned runs
#@markdown runs on Sawyer Environment with 4 variations of HER
#@markdown * **None**- do nothing
#@markdown * **Final**- for each sample in episode create another sample where the last state (in episode) is set to be the new goal
#@markdown * **Future**- for each sample in episode choose randomly num_relabeled "tasks" where the goals are drawn from future states in episode
#@markdown * **Random**- for each sample in episode choose randomly num_relabeled "tasks" where the goals are drawn from all states in episode
# Train one agent per HER setting for 150 epochs each; every result is kept under
# its own name for the comparison plot.
# NOTE(review): the saved output of this cell ends with a NameError (cause not shown),
# and the 'future' and 'random' runs are repeated in the following cell.
success_rate = run_sawyer(num_epochs=150, HER = "None")  # , render=True)
# show_video()
success_rate_final = run_sawyer(num_epochs=150, HER = "final")  # , render=True)
# show_video()
success_rate_future = run_sawyer(num_epochs=150, HER = "future")  # , render=True)
# show_video()
success_rate_random = run_sawyer(num_epochs=150, HER = "random")  # , render=True)
# show_video()

Epoch: 0  Cumulative reward: -165.408168  Success rate: 0.0625 Mean loss: 0.0088
Epoch: 5  Cumulative reward: -112.240136  Success rate: 0.0000 Mean loss: 0.0007
Epoch: 10  Cumulative reward: -110.875117  Success rate: 0.0000 Mean loss: 0.0004
Epoch: 15  Cumulative reward: -107.054125  Success rate: 0.0000 Mean loss: 0.0003
Epoch: 20  Cumulative reward: -91.146846  Success rate: 0.0000 Mean loss: 0.0002
Epoch: 25  Cumulative reward: -73.888707  Success rate: 0.0000 Mean loss: 0.0002
Epoch: 30  Cumulative reward: -77.863155  Success rate: 0.0625 Mean loss: 0.0002
Epoch: 35  Cumulative reward: -58.276404  Success rate: 0.0625 Mean loss: 0.0002
Epoch: 40  Cumulative reward: -57.413492  Success rate: 0.1250 Mean loss: 0.0001
Epoch: 45  Cumulative reward: -57.647278  Success rate: 0.0000 Mean loss: 0.0001
Epoch: 50  Cumulative reward: -56.713131  Success rate: 0.0625 Mean loss: 0.0001
Epoch: 55  Cumulative reward: -51.096889  Success rate: 0.1250 Mean loss: 0.0001
Epoch: 60  Cumulative rewa

NameError: ignored

In [35]:
# Run only the 'future' and 'random' HER settings (150 epochs each); the results
# are read by the comparison plot below.
success_rate_future = run_sawyer(num_epochs=150, HER = "future")  # , render=True)
# show_video()
success_rate_random = run_sawyer(num_epochs=150, HER = "random")  # , render=True)

Epoch: 0  Cumulative reward: -110.252157  Success rate: 0.0000 Mean loss: 0.0018
Epoch: 5  Cumulative reward: -74.104295  Success rate: 0.0000 Mean loss: 0.0006
Epoch: 10  Cumulative reward: -69.927301  Success rate: 0.1250 Mean loss: 0.0003
Epoch: 15  Cumulative reward: -67.824209  Success rate: 0.1250 Mean loss: 0.0002
Epoch: 20  Cumulative reward: -76.929618  Success rate: 0.1250 Mean loss: 0.0001
Epoch: 25  Cumulative reward: -77.338564  Success rate: 0.0625 Mean loss: 0.0001
Epoch: 30  Cumulative reward: -68.177702  Success rate: 0.0625 Mean loss: 0.0001
Epoch: 35  Cumulative reward: -60.446472  Success rate: 0.0625 Mean loss: 0.0001
Epoch: 40  Cumulative reward: -82.217323  Success rate: 0.3750 Mean loss: 0.0001
Epoch: 45  Cumulative reward: -49.072385  Success rate: 0.2500 Mean loss: 0.0001
Epoch: 50  Cumulative reward: -61.398574  Success rate: 0.3125 Mean loss: 0.0001
Epoch: 55  Cumulative reward: -54.703290  Success rate: 0.1250 Mean loss: 0.0001
Epoch: 60  Cumulative reward:

In [36]:
# Map each legend label to the result of the matching goal-conditioned run.
exp_to_accuracies = {
    "sawyer_her_none": success_rate,
    "sawyer_her_final": success_rate_final,
    "sawyer_her_future": success_rate_future,
    "sawyer_her_random": success_rate_random,
}
# Figure title
text = 'Sawyer (Goal-conditioned)'
# NOTE(review): plot_success spaces points 5 apart on the 'Epoch' axis, while run_sawyer
# returns one value per epoch, so the x-values shown are 5x the epoch index.
fig = plot_success(exp_to_accuracies, text)
# Last expression of the cell: embed the figure's HTML in the cell output.
HTML(fig.to_html())

Output hidden; open in https://colab.research.google.com to view.

# Plotting Code

We've provided some sample plotting code for you. Feel free to customize it per the assignment specifications. The code will not be graded.

In [None]:
# !pip install plotly

In [None]:
from IPython.display import HTML
from plotly import graph_objs as go

# Sample plotting code (replace successes where necessary)
# Maps each legend label to the sequence of success rates to draw.
exp_to_accuracies = {
    "bitflip_7_her_none": success_rate,
    "bitflip_7_her_final": success_rate_final
}

# One smoothed line per experiment; consecutive points are placed 5 apart on the epoch axis
data = []
for experiment, accuracies in exp_to_accuracies.items():
  steps = [5 * x for x in range(len(accuracies))]
  data.append(go.Scatter(x=steps, y=accuracies, line_shape='spline', name=experiment))

# Applies a custom layout: centered title and labelled axes
layout = go.Layout(
    title=go.layout.Title(
        text='Bitflip with 7 bits',
        x=0.5
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='Epoch',
            font=dict(
                family='Courier New, monospace',
                size=18,
                color='#7f7f7f'
            )
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Success Rate',
            font=dict(
                family='Courier New, monospace',
                size=18,
                color='#7f7f7f'
            )
        )
    )
)

# Creates the Figure from the finished traces and layout
fig = go.Figure(data=data, layout=layout)

# Last expression of the cell: embed the figure's HTML in the cell output.
HTML(fig.to_html())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
!pwd

/content/gdrive/My Drive/cs330_fall2020/multiworld
