# **Behavioral Cloning**

## Install & Import packages/libraries

In [1]:
# Environment setup cell: refreshes the apt index, then installs gym,
# JSAnimation (from a fresh git clone), python-opengl, xvfb, pyvirtualdisplay,
# pyglet and matplotlib 2.0.
# REMEMBER TO RESTART RUNTIME AFTER RUNNING THIS CELL!
# NOTE(review): gym is installed without a version pin, yet later cells import
# gym.wrappers.Monitor and create "Pendulum-v0". Consider pinning a gym release
# that still provides both - TODO confirm the exact version.
!apt-get update

!pip install gym
# Clone JSAnimation, enter the clone and install it with its setup.py.
!git clone https://github.com/jakevdp/JSAnimation
%cd JSAnimation
!python setup.py install

# Remaining system/Python packages (pyvirtualdisplay is imported by a later cell).
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install pyglet
!pip install matplotlib==2.0
# Leave the JSAnimation clone again (back to its parent directory).
%cd ..

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:9 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:11 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [68.5 kB]
Hit:12 http://ppa.launchpad.net/cran/

/content


In [1]:
from IPython.display import HTML, display
from gym.wrappers import Monitor
import base64
import glob
import io

from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import numpy as np
import pickle
import gzip
import gym
%matplotlib inline

def show_video():
    """Embed a recorded MP4 from the ``video/`` directory in the cell output.

    Searches ``video/*.mp4`` relative to the current working directory. When
    at least one file matches, the first match returned by ``glob`` is read,
    base64-encoded and displayed as an inline HTML ``<video>`` element.
    When nothing matches, ``'No video found.'`` is printed instead.

    Returns:
        None.

    Raises:
        OSError: if the matched file cannot be opened or read.
    """
    mp4list = glob.glob('video/*.mp4')
    if not mp4list:
        print('No video found.')
        return

    # Read-only binary mode is all that is needed (write access would fail on
    # read-only recordings); the context manager closes the handle promptly.
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    display(HTML(data='''<video alt="test" autoplay
                     loop controls style="height: 400px;">
                     <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                     </video>'''.format(encoded.decode('ascii'))))

def wrap_env(env):
    """Return ``env`` wrapped in a ``Monitor`` pointed at ``./video``.

    ``force=True`` is passed through to ``Monitor``.
    """
    return Monitor(env, './video', force=True)

## Pendulum Visualization

In [2]:
# Create the environment and report its observation and action spaces
env_name = "Pendulum-v0"
env = gym.make(env_name)

obs_space = env.observation_space
action_space = env.action_space

# Both spaces are reported with the same four lines; only the header differs.
for header, space in (('observation space: {}', obs_space),
                      ('action space: {}', action_space)):
    print(header.format(space))
    print('dimension: {}'.format(space.shape[0]))
    print('high: {}'.format(space.high))
    print('low: {}\n'.format(space.low))

observation space: Box(-8.0, 8.0, (3,), float32)
dimension: 3
high: [1. 1. 8.]
low: [-1. -1. -8.]

action space: Box(-2.0, 2.0, (1,), float32)
dimension: 1
high: [2.]
low: [-2.]



In [3]:
# Create a virtual display and start it (later cells render into it)
from pyvirtualdisplay import Display
virtual_display = Display()
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7fcf9a45fc50>

In [4]:
# Visualize gym environment: run up to 100 randomly sampled actions in a
# Monitor-wrapped environment, accumulate the reward, then show the video.
env = wrap_env(gym.make(env_name))
obs = env.reset()
total_reward = 0

for t in range(100):
    action = env.action_space.sample() # Random action
    # Try inserting values by uncommenting the following line to see what each action value performs! [-2.0, 2.0]
    # action = np.array([-2.0])
    obs, reward, done, info = env.step(action)
    # Render environment to virtual display
    env.render(mode='rgb_array')
    total_reward += reward
    if done: # done becomes true when environment terminates
        break

# Close the environment before looking for the recording
# (presumably closing finalizes the video file - TODO confirm).
env.close()
print('Total Reward: {:.2f}'.format(total_reward))
show_video()

Total Reward: -640.06


## Pendulum Expert Behavior

In [5]:
# Remember to upload the expert demonstration file first!
# The code below opens "./Pendulum-v0_expert_demo.pkl".
# We then check how well the expert performed in those demonstrations.

env_name = "Pendulum-v0"

# Load demonstrations
# NOTE(review): pickle.load can execute arbitrary code - only load a trusted file.
demo_path = './' + env_name + '_expert_demo.pkl'
with open(demo_path, 'rb') as demo_file:
    demos = pickle.load(demo_file)
demos = shuffle(demos)

# Expert's performance: summed rewards per demonstration, averaged over all of them
episode_returns = [np.sum(demo['rewards']) for demo in demos]
exp_ret = np.mean(episode_returns)
print("Expert's Average Cumulative Rewards: {:.3f}".format(exp_ret))

Expert's Average Cumulative Rewards: -79.331


In [7]:
# Flatten the demonstrations into per-timestep observations and actions.
# Both lists are indexed by the timesteps of each demo's 'observes' entry.
demo_observations = np.asarray([
    demo['observes'][t_idx]
    for demo in demos
    for t_idx in range(len(demo['observes']))
])
demo_actions = np.asarray([
    demo['actions'][t_idx]
    for demo in demos
    for t_idx in range(len(demo['observes']))
])

# Shuffle observations and actions together to break temporal correlation
demo_observations, demo_actions = shuffle(demo_observations, demo_actions)
print('There are a total of {} demonstrations!'.format(len(demo_observations)))

######## TO DO #########
# Choose number of demos you want to use!
num_selected = 200
demo_observations = demo_observations[:num_selected, :]
demo_actions = demo_actions[:num_selected, :]
########################

# Sanity-check the shapes: 3 observation values and 1 action value per sample
n = demo_observations.shape[0]
assert demo_observations.shape == (n, 3)
assert demo_actions.shape == (n, 1)

# Report the final dataset dimensions
print("Observation data: {}".format(demo_observations.shape))
print("Action data: {}".format(demo_actions.shape))

There are a total of 20000 demonstrations!
Observation data: (200, 3)
Action data: (200, 1)


## Gaussian Process Regression

In [8]:
# We will use GPR to learn from these expert demonstrations by setting them as targets!
# Inputs are the standardized observations; targets are the expert actions.
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, RBF

###### TO DO #######
init_lambda = 10.0  # initial value handed to the RBF kernel
init_beta = 2.0     # initial value handed to the ConstantKernel
init_sigma = 0.04   # handed to GaussianProcessRegressor as `alpha`
####################

# The tuples are the bounds given to each kernel for hyperparameter optimization.
kernel = ConstantKernel(init_beta, (1e-3, 1e3)) * RBF(init_lambda, (1e-2, 1e6))
# NOTE(review): no random_state is set, so the 20 optimizer restarts may differ
# between runs - consider passing one for reproducibility.
gp = GaussianProcessRegressor(kernel=kernel, alpha=init_sigma, n_restarts_optimizer=20)

###### TO DO #######
# Standardize each observation dimension using statistics of the selected demos.
demo_obs_mean = np.mean(demo_observations, axis=0, keepdims=True)
demo_obs_std = np.std(demo_observations, axis=0, keepdims=True)
# A dimension with zero spread (constant feature, or a single selected sample)
# would divide by zero and feed NaNs to the GP; leave such dimensions unscaled.
demo_obs_std = np.where(demo_obs_std > 0, demo_obs_std, 1.0)
nz_demo_observations = (demo_observations - demo_obs_mean) / demo_obs_std
####################

assert demo_obs_mean.shape == (1, 3)
assert demo_obs_std.shape == (1, 3)
n = len(demo_observations)
assert nz_demo_observations.shape == (n, 3)

gp.fit(nz_demo_observations, demo_actions)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


GaussianProcessRegressor(alpha=0.04, copy_X_train=True,
                         kernel=1.41**2 * RBF(length_scale=10),
                         n_restarts_optimizer=20, normalize_y=False,
                         optimizer='fmin_l_bfgs_b', random_state=None)

In [9]:
# Create and start a virtual display for the rollout below
# NOTE(review): an earlier cell already runs this same import and
# Display().start(); this call starts another Display instance.
from pyvirtualdisplay import Display
rollout_display = Display()
rollout_display.start()

<pyvirtualdisplay.display.Display at 0x7fcf88a0a590>

In [10]:
# Visualize cloned behavior
env = wrap_env(gym.make(env_name))
obs = env.reset()
total_reward = 0

for t in range(10000):
    obs = np.reshape(obs, [1, -1])

    ####### TO DO #######
    nz_obs = (obs - demo_obs_mean) / demo_obs_std
    action = gp.predict(nz_obs)
    obs, reward, done, info = env.step(action)
    #####################

    # Render environment to virtual display
    env.render(mode='rgb_array')
    total_reward += reward[0]
    if done: # done becomes true when environment terminates
        break

env.close()

In [11]:
# Screenshot the following and submit!
# MODIFY THE FOLLOWING
######################################
# Each entry is printed on its own line, in order.
report_lines = (
    '\n',
    '=============================',
    'Name: 박건도',
    'Student ID: 2017-11362',
    'Total Reward: {:.2f}'.format(total_reward),
    '=============================',
)
for report_line in report_lines:
    print(report_line)
######################################
show_video()



Name: 박건도
Student ID: 2017-11362
Total Reward: -129.56
