# **Installation**

In [1]:
#@title Mount your Google Drive
#@markdown Your work will be stored in a folder called `neurips22_outreach_rl4dm`

import os
from google.colab import drive

# Expose the user's Google Drive inside the Colab VM under /content/gdrive;
# the following cells build their paths on top of this mount point.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
#@title Set up mount symlink

# Shell-escaped spelling of the workspace folder on Drive (the backslash
# protects the space in `My Drive`). A raw string keeps the backslash literal
# without relying on Python tolerating the unrecognised escape sequence `\ `.
DRIVE_PATH = r'/content/gdrive/My\ Drive/neurips22_outreach_rl4dm'
# Same folder as a plain path, which is what Python's file APIs expect.
DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
if not os.path.exists(DRIVE_PYTHON_PATH):
  # os.mkdir (rather than a shell `mkdir`) needs no escaping and raises
  # immediately if the parent is missing, e.g. when Drive is not mounted.
  os.mkdir(DRIVE_PYTHON_PATH)

## the space in `My Drive` causes some issues,
## make a symlink to avoid this
SYM_PATH = '/content/neurips22_outreach_rl4dm'
if not os.path.exists(SYM_PATH):
  # Equivalent to `ln -s <drive folder> <SYM_PATH>`, without shell quoting.
  os.symlink(DRIVE_PYTHON_PATH, SYM_PATH)

In [3]:
#@title apt install requirements
!apt update 
!apt install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        gnupg2 \
        make \
        cmake \
        ffmpeg \
        swig \
        libz-dev \
        unzip \
        zlib1g-dev \
        libglfw3 \
        libglfw3-dev \
        libxrandr2 \
        libxinerama-dev \
        libxi6 \
        libxcursor-dev \
        libgl1-mesa-dev \
        libgl1-mesa-glx \
        libglew-dev \
        libosmesa6-dev \
        lsb-release \
        ack-grep \
        patchelf \
        wget \
        xpra \
        xserver-xorg-dev \
        xvfb \
        python-opengl \
        ffmpeg

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Packages [1,038 kB]
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:1

In [4]:
#@title Clone the repo

%cd $SYM_PATH
!git clone https://github.com/chrisyrniu/neurips22_outreach_robot_learning_for_decision_making.git
%cd neurips22_outreach_robot_learning_for_decision_making
%pip install -r requirements_colab.txt
%pip install gym[box2d]==0.25.2

/content/gdrive/My Drive/neurips22_outreach_rl4dm
fatal: destination path 'neurips22_outreach_robot_learning_for_decision_making' already exists and is not an empty directory.
/content/gdrive/My Drive/neurips22_outreach_rl4dm/neurips22_outreach_robot_learning_for_decision_making
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mujoco==2.2.0
  Downloading mujoco-2.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.0 MB/s 
Collecting tensorboardX==2.5.1
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 66.9 MB/s 
[?25hCollecting moviepy==1.0.3
  Downloading moviepy-1.0.3.tar.gz (388 kB)
[K     |████████████████████████████████| 388 kB 69.6 MB/s 
[?25hCollecting pyvirtualdisplay==3.0
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Collecting swig==4.0.2
  Downloading swig-4.0.2-py2

In [5]:
#@title Set up virtual display

from pyvirtualdisplay import Display

# Start a non-visible (off-screen) X display of 1400x900 pixels.
# NOTE(review): the name `display` shadows IPython's built-in `display()`
# helper in this notebook's namespace; it is kept because later cells may use it.
display = Display(size=(1400, 900), visible=False)
display.start()

<pyvirtualdisplay.display.Display at 0x7f7d96504690>

# Train an agent with reinforcement learning in several minutes!

In [6]:
#@title First, let's visualize a random agent before training
#@markdown The cheetah can barely move forward!

import gym
from colab_utils import (
    wrap_env,
    show_video
)

env_name = "HalfCheetah-v4"
# NOTE(review): wrap_env comes from the repo's colab_utils; presumably it adds
# video recording around the environment -- confirm there.
env = wrap_env(gym.make(env_name, render_mode='rgb_array'))

# try/finally guarantees the environment is closed even if a step raises
# part-way through the rollout.
try:
    obs = env.reset()
    # Roll out at most 400 steps, acting with randomly sampled actions.
    for step in range(400):
        env.render()
        # Old gym step API: a 4-tuple with a single termination flag.
        obs, rew, term, _ = env.step(env.action_space.sample())
        if term:
            break
finally:
    env.close()

print('Loading video...')
show_video()

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


Loading video...


In [7]:
#@title Then, we will use reinforcement learning to learn to control each joint of the cheetah and make it run!
#@markdown Let's tweak some parameters that impact the learning performance.

#@markdown Learning rate represents the step size when you update your model. Your model will learn nothing when your step is too small or too large!
#@markdown You might want to pick a learning rate from {1, 0.1, 0.01, 0.001, 0.0001}.
# Forwarded to run_sac.py as both --actor-lr and --critic-lr by the training cell.
learning_rate = 0.001 #@param

#@markdown After each training epoch, your model will be evaluated and you can read the evaluation results!
#@markdown You could set it as 2 or 3 when you tune the learning rate.
#@markdown After finding a good learning rate, you could let the training run more epochs to achieve a better performance!
# Forwarded to run_sac.py as --epoch by the training cell.
num_epochs = 3 #@param

import torch
# True when PyTorch can see a CUDA device; the visualization cell reads this
# flag to choose the device the trained policy is loaded on.
cuda = torch.cuda.is_available()

In [8]:
#@title Now, the training program is ready to set off! We will use the Soft Actor-Critic algorithm.
#@markdown Please note that the random agent normally gets an episode reward around worse than -100.
!python run_sac.py --task $env_name --actor-lr $learning_rate --critic-lr $learning_rate --epoch $num_epochs

Observations shape: (17,)
Actions shape: (6,)
Action range: -1.0 1.0
  import imp
Epoch #1: 5001it [01:14, 66.75it/s, env_step=5000, len=1000, loss/actor=-16.557, loss/critic1=0.803, loss/critic2=0.783, n/ep=1, n/st=1, rew=-166.38]              
Epoch #1: test_reward: 29.862251 ± 19.342667, best_reward: 29.862251 ± 19.342667 in #1
Epoch #2: 5001it [01:16, 65.57it/s, env_step=10000, len=1000, loss/actor=-29.651, loss/critic1=0.861, loss/critic2=0.820, n/ep=1, n/st=1, rew=874.71]
Epoch #2: test_reward: 1103.191238 ± 295.126104, best_reward: 1103.191238 ± 295.126104 in #2
Epoch #3: 5001it [01:20, 61.76it/s, env_step=15000, len=1000, loss/actor=-43.573, loss/critic1=1.211, loss/critic2=1.130, n/ep=1, n/st=1, rew=1523.57]
Epoch #3: test_reward: 2040.465243 ± 77.928906, best_reward: 2040.465243 ± 77.928906 in #3
{'best_result': '2040.47 ± 77.93',
 'best_reward': 2040.4652429831215,
 'duration': '244.24s',
 'test_episode': 40,
 'test_speed': '3343.69 step/s',
 'test_step': 40000,
 'test_time'

In [11]:
#@title Let's visualize your trained agent!
#@markdown The best model was automatically saved for you during training, and you can find it in the "/log" folder. The model file name is "policy.pth".

#@markdown Please copy the path to your model to the following blank (e.g., /content/neurips22_outreach_rl4dm/neurips22_outreach_robot_learning_for_decision_making/log/HalfCheetah-v4/sac/0/221125-112556/policy.pth).

model_path = "/content/neurips22_outreach_rl4dm/neurips22_outreach_robot_learning_for_decision_making/log/HalfCheetah-v4/sac/0/221125-135305/policy.pth" #@param {type: "string"}
#@markdown The following code will automatically pick the GPU if one is available and fall back to the CPU otherwise.

from load_sac import load_sac
from tianshou.data import Batch, to_numpy
import numpy as np

# `cuda` was computed in the parameter cell via torch.cuda.is_available().
device = 'cuda' if cuda else 'cpu'
policy = load_sac(model_path, env_name, device)
env = wrap_env(gym.make(env_name, render_mode='rgb_array'))
rewards = []

# try/finally guarantees the environment is closed even if the policy or a
# step raises part-way through the rollout.
try:
    obs = env.reset()
    for i in range(400):
        # Add a leading batch dimension before handing the observation to the policy.
        obs_batch = np.array(obs).reshape(1, -1)
        # NOTE(review): `info` is filled with the observation as a placeholder,
        # exactly as before; presumably the policy only needs the field to exist.
        batch = Batch(obs=obs_batch, info=obs_batch)
        result = policy(batch)
        act = to_numpy(result.act)
        # Map the policy output to the environment's action range and drop the
        # batch dimension again.
        act = policy.map_action(act).reshape(-1)
        # Old gym step API: a 4-tuple with a single termination flag.
        obs, rew, term, _ = env.step(act)
        rewards.append(rew)
        if term:
            break
finally:
    env.close()

# print('tested single episode reward', np.array(rewards).sum())
print('Loading video...')
show_video()



Loaded agent from:  /content/neurips22_outreach_rl4dm/neurips22_outreach_robot_learning_for_decision_making/log/HalfCheetah-v4/sac/0/221125-135305/policy.pth


  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  f"Overwriting existing videos at {self.video_folder} folder "


Loading video...
