# **Installation**

In [None]:
#@title Install system requirements
%%capture
!apt update 
!apt install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        gnupg2 \
        make \
        cmake \
        ffmpeg \
        swig \
        libz-dev \
        unzip \
        zlib1g-dev \
        libglfw3 \
        libglfw3-dev \
        libxrandr2 \
        libxinerama-dev \
        libxi6 \
        libxcursor-dev \
        libgl1-mesa-dev \
        libgl1-mesa-glx \
        libglew-dev \
        libosmesa6-dev \
        lsb-release \
        ack-grep \
        patchelf \
        wget \
        xpra \
        xserver-xorg-dev \
        xvfb \
        python-opengl \
        ffmpeg

In [None]:
#@title Clone the repo and install
%%capture
import os
SYM_PATH = '/content/neurips22_outreach_rl4dm'
if not os.path.exists(SYM_PATH):
  %mkdir $SYM_PATH
%cd $SYM_PATH
!git clone https://github.com/chrisyrniu/neurips22_outreach_robot_learning_for_decision_making.git
%cd neurips22_outreach_robot_learning_for_decision_making
%pip install -r requirements_colab.txt
%pip install gym[box2d]==0.25.2

In [None]:
#@title Set up virtual display
%%capture
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

# **Train an agent with reinforcement learning in several minutes!**

In [None]:
#@title First, let's visualize a random agent before training.
#@markdown The cheetah can barely move forward!

import gym
from colab_utils import show_video, wrap_env

env_name = "HalfCheetah-v4"
env = wrap_env(gym.make(env_name, render_mode='rgb_array'))

# Roll out at most 200 randomly sampled actions, stopping early if the
# environment reports that the episode has ended.
max_steps = 200
env.reset()
steps_taken = 0
while steps_taken < max_steps:
    env.render()
    random_action = env.action_space.sample()
    next_obs, reward, episode_over, step_info = env.step(random_action)
    if episode_over:
        break
    steps_taken += 1

# Close the environment before asking for the video to be displayed.
env.close()
print('Loading video...')
show_video()

In [None]:
#@title Then, we will use reinforcement learning to learn to control each joint of the cheetah and make it run!
#@markdown Let's tweak some parameters that impact the learning performance.

#@markdown Learning rate represents the "updating speed" of your model.
#@markdown Your model will learn nothing when your learning rate is too small or too large!
#@markdown You might want to pick a learning rate from {0.01, 0.001, 0.0001, 0.00001}.
# Passed to run_sac.py as both --actor-lr and --critic-lr by the training cell.
learning_rate = 0.001 #@param

#@markdown Here, each epoch includes 5000 environment steps.
#@markdown For each step, the agent will receive an observation, execute an action, and receive a reward signal.
#@markdown After each training epoch, your model will be evaluated and you can read the evaluation results!

#@markdown You could set it to 2 or 3 while you tune the learning rate.
#@markdown After finding a good learning rate, you could let the training run for more epochs to achieve better performance!
# Passed to run_sac.py as --epoch by the training cell.
num_epochs = 3 #@param

In [None]:
#@title Now, the training program is ready to set off! We will use the Soft Actor-Critic algorithm.
#@markdown We will use tensorboard to monitor the training process.

#@markdown You could check "test/reward" or "train/reward" to tell if this run is good!
# Load the TensorBoard notebook extension, then open a dashboard that reads the
# relative "log" directory (resolved against the current working directory).
# NOTE(review): presumably run_sac.py writes its event files there — confirm.
%load_ext tensorboard
%tensorboard --logdir log

In [None]:
#@markdown Click here to start training!

#@markdown You could also directly read information from the current run in the output lines.
import torch
cuda = torch.cuda.is_available()
print('Use GPU') if cuda else print('Use CPU')
!python run_sac.py --task $env_name --actor-lr $learning_rate --critic-lr $learning_rate --epoch $num_epochs

In [None]:
#@title Let's visualize your trained agent!
#@markdown The best model was automatically saved for you during training, and you can find it in the "/log" folder. The model file name is "policy.pth".

#@markdown Please copy the path to your model to the following blank.

model_path = "" #@param {type: "string"}

import os

from load_sac import load_sac
from tianshou.data import Batch, to_numpy
import numpy as np
import torch

# Fail early with a readable message: the form field defaults to an empty
# string, and a pasted path can easily carry stray whitespace.
model_path = model_path.strip()
if not model_path or not os.path.exists(model_path):
    raise FileNotFoundError(
        f'Model file not found: {model_path!r}. Copy the path of "policy.pth" '
        'from the "log" folder into the model_path field and run this cell again.'
    )

cuda = torch.cuda.is_available()
device = 'cuda' if cuda else 'cpu'
policy = load_sac(model_path, env_name, device)
env = wrap_env(gym.make(env_name, render_mode='rgb_array'))
rewards = []

try:
    obs = env.reset()
    for i in range(200):
        # The policy is called on a batch, so give the observation a leading
        # axis of size 1.
        obs = np.array(obs).reshape(1, -1)
        # NOTE(review): `info` is filled with the observation itself —
        # presumably just a placeholder for a required field; confirm.
        obs = Batch(obs=obs, info=obs)
        result = policy(obs)
        act = to_numpy(result.act)
        # Flatten the mapped action back to 1-D before stepping the env.
        # NOTE(review): map_action presumably rescales the raw policy output
        # to the environment's action bounds — confirm against tianshou.
        act = policy.map_action(act).reshape(-1)
        obs, rew, term, _ = env.step(act)
        rewards.append(rew)
        if term:
            break
finally:
    # Always close the environment, even if the rollout raised.
    env.close()

# print('tested single episode reward', np.array(rewards).sum())
print('Loading video...')
show_video()

