# Augmented Random Search (ARS): simple random search for RL

https://arxiv.org/pdf/1803.07055.pdf

https://github.com/jietan/ARS

using PyBullet:
https://github.com/bulletphysics/bullet3/issues/1718
    
https://github.com/YungKC/bullet3/blob/master/examples/pybullet/gym/pybullet_envs/ARS/ars.py

In [None]:
import os
import sys
import inspect

# Put the directory two levels above this file's directory at the front of
# sys.path so modules located there can be imported.
# NOTE(review): inside a notebook kernel inspect.getfile() may not return a real
# file path, in which case the resolved directories are only nominal - confirm.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
# Use the sys module directly; `os.sys` only works because os happens to import sys.
sys.path.insert(0, parentdir)

# Importing the libraries
import os
import numpy as np
import gym
from gym import wrappers
import pybullet_envs
import time
import multiprocessing as mp
from multiprocessing import Process, Pipe
import argparse
import pickle


In [None]:
# Setting the Hyper Parameters
class Hp():
    """Container for the hyper-parameters of the random-search training run.

    Every value can be overridden through a keyword argument; the defaults are
    the values that used to be hard-coded.

    Attributes:
        nb_steps: number of training iterations.
        episode_length: maximum number of environment steps per rollout.
        learning_rate: step size applied to the policy update.
        nb_directions: number of random perturbations sampled per iteration.
        nb_best_directions: number of top-scoring perturbations to keep.
        nb_cur_best_directions: currently active number of kept perturbations;
            initialised to ``nb_best_directions``.
        noise: scale applied to a perturbation before adding it to the weights.
        seed: seed for the random number generator.
        env_name: Gym environment identifier.

    Raises:
        ValueError: if ``nb_best_directions`` exceeds ``nb_directions``.
    """

    def __init__(self,
                 nb_steps=10000,
                 episode_length=5000,
                 learning_rate=0.02,
                 nb_directions=16,
                 nb_best_directions=16,
                 noise=0.03,
                 seed=1,
                 env_name='HalfCheetahBulletEnv-v0'):
        # Explicit check instead of `assert`, which disappears under `python -O`.
        if nb_best_directions > nb_directions:
            raise ValueError(
                "nb_best_directions (%d) must not exceed nb_directions (%d)"
                % (nb_best_directions, nb_directions))
        self.nb_steps = nb_steps
        self.episode_length = episode_length
        self.learning_rate = learning_rate
        self.nb_directions = nb_directions
        self.nb_best_directions = nb_best_directions
        self.nb_cur_best_directions = self.nb_best_directions
        self.noise = noise
        self.seed = seed
        self.env_name = env_name

In [None]:
# Multiprocessing worker: explores the policy along one specific direction, for one episode per request

# Message identifiers exchanged over the pipe between the trainer and a worker.
_RESET = 1
_CLOSE = 2
_EXPLORE = 3


def ExploreWorker(rank, childPipe, envname):
    """Worker-process loop: serve rollout requests received over a pipe.

    The worker builds its own environment and its own ``Normalizer`` and then
    handles ``[message, payload]`` requests until told to stop:

    * ``_RESET``   - reset the environment and reply ``["reset ok"]``.
    * ``_EXPLORE`` - run ``explore`` with the policy, hyper-parameters,
      direction and delta found at payload indices 1-4 and reply
      ``[sum_rewards]``. payload[0] (a normalizer sent by the trainer) is
      ignored; the worker-local normalizer is used instead.
    * ``_CLOSE``   - reply ``["close ok"]`` and exit.

    Any other message is ignored. The loop also ends on ``EOFError`` or
    ``KeyboardInterrupt`` while waiting for a request.

    Args:
        rank: index of this worker; not used by the loop itself.
        childPipe: worker end of a ``multiprocessing.Pipe``.
        envname: Gym environment identifier passed to ``gym.make``.
    """
    env = gym.make(envname)
    try:
        nb_inputs = env.observation_space.shape[0]
        normalizer = Normalizer(nb_inputs)  # use our local normalizer
        env.reset()
        while True:
            try:
                # Only block for short times to have keyboard exceptions be raised.
                if not childPipe.poll(0.001):
                    continue
                message, payload = childPipe.recv()
            except (EOFError, KeyboardInterrupt):
                break

            if message == _RESET:
                env.reset()
                childPipe.send(["reset ok"])
            elif message == _EXPLORE:
                policy = payload[1]
                hp = payload[2]
                direction = payload[3]
                delta = payload[4]
                sum_rewards = explore(env, normalizer, policy, direction, delta, hp)
                childPipe.send([sum_rewards])
            elif message == _CLOSE:
                childPipe.send(["close ok"])
                break
    finally:
        # Release the pipe and the environment on every exit path, including
        # an exception raised while a rollout is running.
        try:
            childPipe.close()
        finally:
            env.close()


In [None]:
# Normalizing the states

class Normalizer():
    """Running per-feature mean and variance used to standardise observations.

    Statistics are updated one sample at a time with Welford's online
    algorithm. The variance is the population variance (sum of squared
    deviations divided by the sample count), floored at 1e-2.
    """

    def __init__(self, nb_inputs):
        # One slot per input feature; `n` is kept as an array as well.
        self.n, self.mean, self.mean_diff, self.var = (
            np.zeros(nb_inputs) for _ in range(4)
        )

    def observe(self, x):
        """Fold the observation *x* into the running statistics."""
        self.n += 1.
        # Deviation from the mean *before* this sample is absorbed.
        offset_old = x - self.mean
        self.mean += offset_old / self.n
        # Deviation from the mean *after* this sample is absorbed.
        offset_new = x - self.mean
        self.mean_diff += offset_old * offset_new
        self.var = np.clip(self.mean_diff / self.n, 1e-2, None)

    def normalize(self, inputs):
        """Return *inputs* shifted by the running mean and scaled by the running std."""
        centred = inputs - self.mean
        return centred / np.sqrt(self.var)

In [None]:
# Building the AI

class Policy():
    """Linear policy ``action = clip(theta . input, -1, 1)`` trained by random search.

    Attributes:
        env_name: environment identifier stored alongside the weights.
        theta: weight matrix of shape ``(output_size, input_size)``, initially zero.
    """

    def __init__(self, input_size, output_size, env_name):
        self.env_name = env_name
        self.theta = np.zeros((output_size, input_size))

    @staticmethod
    def _resolve_hp(hp):
        """Return *hp*, falling back to the module-level ``hp`` when it is None."""
        if hp is not None:
            return hp
        try:
            return globals()["hp"]
        except KeyError:
            raise NameError("name 'hp' is not defined") from None

    def evaluate(self, input, delta, direction, hp):
        """Compute the clipped action for *input*.

        Args:
            input: observation vector.
            delta: perturbation with the same shape as ``theta``; unused when
                *direction* is None.
            direction: None for the unperturbed weights, "positive" for
                ``theta + hp.noise * delta``; any other value uses
                ``theta - hp.noise * delta``.
            hp: hyper-parameters; only ``hp.noise`` is read.

        Returns:
            The action, clipped element-wise to [-1, 1].
        """
        if direction is None:
            return np.clip(self.theta.dot(input), -1.0, 1.0)
        elif direction == "positive":
            return np.clip((self.theta + hp.noise*delta).dot(input), -1.0, 1.0)
        else:
            return np.clip((self.theta - hp.noise*delta).dot(input), -1.0, 1.0)

    def sample_deltas(self, hp=None):
        """Draw ``hp.nb_directions`` standard-normal perturbations shaped like ``theta``.

        Args:
            hp: hyper-parameters. When omitted, the module-level ``hp`` is used,
                which is what this method always did before.
        """
        hp = self._resolve_hp(hp)
        return [np.random.randn(*self.theta.shape) for _ in range(hp.nb_directions)]

    def update(self, rollouts, sigma_r, hp=None):
        """Apply one update step to ``theta`` in place.

        Args:
            rollouts: iterable of ``(r_pos, r_neg, delta)`` tuples.
            sigma_r: standard deviation of the collected rewards, used to
                scale the step.
            hp: hyper-parameters (``learning_rate``, ``nb_cur_best_directions``).
                When omitted, the module-level ``hp`` is used.
        """
        hp = self._resolve_hp(hp)
        # With zero reward spread the division would turn theta into NaN (or
        # raise ZeroDivisionError); leave the weights untouched instead.
        if sigma_r == 0:
            return
        step = np.zeros(self.theta.shape)
        for r_pos, r_neg, d in rollouts:
            step += (r_pos - r_neg) * d
        self.theta += hp.learning_rate / (hp.nb_cur_best_directions * sigma_r) * step
                

In [None]:
# Exploring the policy on one specific direction and over one episode

def explore(env, normalizer, policy, direction, delta, hp):
    """Run one episode with the (optionally perturbed) policy and return its reward sum.

    Each observation is first fed to ``normalizer.observe`` and then normalised
    before the policy sees it. Every per-step reward is clamped to [-1, 1]
    before being accumulated. The episode stops when the environment reports
    ``done`` or after ``hp.episode_length`` steps, whichever comes first.
    """
    total_reward = 0
    steps_taken = 0.
    finished = False
    observation = env.reset()
    while not finished and steps_taken < hp.episode_length:
        normalizer.observe(observation)
        scaled = normalizer.normalize(observation)
        chosen_action = policy.evaluate(scaled, delta, direction, hp)
        observation, reward, finished, _ = env.step(chosen_action)
        # Clamp the step reward to [-1, 1].
        if reward > 1:
            reward = 1
        elif reward < -1:
            reward = -1
        total_reward += reward
        steps_taken += 1
    return total_reward

In [None]:
# Training the AI

def _collect_rewards(env, policy, normalizer, hp, parentPipes, direction, deltas):
    """Return one episode reward per perturbation for the given direction.

    With worker pipes, all requests are sent first and the replies are then
    read in the same order; without them the rollouts run sequentially here.
    """
    if parentPipes:
        for k in range(hp.nb_directions):
            parentPipes[k].send([_EXPLORE, [normalizer, policy, hp, direction, deltas[k]]])
        return [parentPipes[k].recv()[0] for k in range(hp.nb_directions)]
    return [explore(env, normalizer, policy, direction, deltas[k], hp)
            for k in range(hp.nb_directions)]


def train(env, policy, normalizer, hp, parentPipes):
    """Run the random-search training loop for ``hp.nb_steps`` iterations.

    Each iteration samples perturbations, evaluates them in the positive and
    negative direction, keeps the ``hp.nb_cur_best_directions`` best ones
    (ranked by ``max(r_pos, r_neg)``), updates the policy, then evaluates and
    logs the unperturbed policy.

    Relies on two module-level names that must exist before calling: the open
    ``log_file`` and the checkpoint directory ``LOGDIR``.

    Args:
        env: environment used for local rollouts and for the evaluation run.
        policy: policy providing ``sample_deltas`` and ``update``.
        normalizer: observation normalizer used for local rollouts. It is also
            sent to the workers, although ``ExploreWorker`` in this file
            ignores it and uses its own.
        hp: hyper-parameters.
        parentPipes: list with one pipe per direction, or a falsy value to run
            every rollout in this process.
    """
    for step in range(hp.nb_steps):

        # Sample the perturbations and evaluate them in both directions.
        deltas = policy.sample_deltas()
        positive_rewards = _collect_rewards(
            env, policy, normalizer, hp, parentPipes, "positive", deltas)
        negative_rewards = _collect_rewards(
            env, policy, normalizer, hp, parentPipes, "negative", deltas)

        # Standard deviation over all positive and negative rewards.
        all_rewards = np.array(positive_rewards + negative_rewards)
        sigma_r = all_rewards.std()

        # Rank the directions by max(r_pos, r_neg) and keep the best ones.
        scores = [max(r_pos, r_neg)
                  for r_pos, r_neg in zip(positive_rewards, negative_rewards)]
        order = sorted(range(len(scores)), key=lambda k: -scores[k])
        order = order[:hp.nb_cur_best_directions]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

        # Update the policy. With zero reward spread every reward is identical,
        # and dividing by sigma_r would fill theta with NaN, so skip the update.
        if sigma_r > 0:
            policy.update(rollouts, sigma_r)

        # Evaluate and report the unperturbed policy after the update.
        reward_evaluation = explore(env, normalizer, policy, None, None, hp)
        print('Step:', step, 'Reward:', reward_evaluation, 'Sigma:', sigma_r)

        score = round(reward_evaluation)
        sigma = round(sigma_r)

        log_file.write(f"{step}\t{score}\t{sigma}\n")
        log_file.flush()
        if step%100 == 0:
            # Checkpoint every 100 steps; `with` closes the file even if dump fails.
            with open(f"{LOGDIR}/policy_{hp.env_name}_{step}_{score}.pkl", 'wb') as pkl_file:
                pickle.dump((policy, normalizer, hp), pkl_file)

        # Kai - test new logic (disabled experiment, kept for reference):
        # if step%10 == 0:
        #     hp.nb_cur_best_directions = max(hp.nb_cur_best_directions, 6)

        
                


In [None]:
# Running the main code

def mkdir(base, name):
    """Return ``base/name``, creating it (and any missing parents) if nothing exists there."""
    target = os.path.join(base, name)
    if os.path.exists(target):
        return target
    os.makedirs(target)
    return target


In [None]:
mp.freeze_support()

# Disabled command-line parsing, kept for reference; the constants below are
# used instead.
'''
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='Gym environment name', type=str, default='HalfCheetahBulletEnv-v0')
parser.add_argument('--seed', help='RNG seed', type=int, default=1)
parser.add_argument('--render', help='OpenGL Visualizer', type=int, default=0)
parser.add_argument('--movie',help='rgb_array gym movie',type=int, default=0)
parser.add_argument('--steps', help='Number of steps', type=int, default=10000)
parser.add_argument('--policy', help='Starting policy file (npy)', type=str, default='')
parser.add_argument('--logdir', help='Directory root to log policy files (npy)', type=str, default='.')
parser.add_argument('--mp', help='Enable multiprocessing', type=int, default=1)

args = parser.parse_args()
'''
# Run configuration.
STEPS = 20000
MULTI_PROCESSING = 1
RENDER = 0
MOVIE = 0

hp = Hp()
hp.env_name = 'HalfCheetahBulletEnv-v0'
hp.seed = 1
hp.nb_steps = STEPS
hp.learning_rate = 0.02
hp.nb_directions = 32
hp.nb_best_directions = 16
hp.nb_cur_best_directions = hp.nb_best_directions
hp.noise = 0.03

print("seed = ", hp.seed)
np.random.seed(hp.seed)

# POLICY = 'policy/policy_HalfCheetahBulletEnv-v0_10000_989.npy'
# POLICY = 'policy/kai_1200_968.npy'
LOGDIR = 'policy'
LOG_FILENAME = f'{LOGDIR}/policy_{hp.env_name}.dat'

# LOAD_POLICY_FILENAME = f'{LOGDIR}/policy_HalfCheetahBulletEnv-v0_10000_989.pkl'
LOAD_POLICY_FILENAME = None

# Start one worker process per direction, each connected through its own pipe.
parentPipes = None
processes = []
if MULTI_PROCESSING:
    num_processes = hp.nb_directions
    childPipes = []
    parentPipes = []

    for pr in range(num_processes):
        parentPipe, childPipe = Pipe()
        parentPipes.append(parentPipe)
        childPipes.append(childPipe)

    for rank in range(num_processes):
        p = mp.Process(target=ExploreWorker, args=(rank, childPipes[rank], hp.env_name))
        p.start()
        processes.append(p)

try:
    work_dir = mkdir('exp', 'brs')
    monitor_dir = mkdir(work_dir, 'monitor')
    env = gym.make(hp.env_name)
    if RENDER:
        env.render(mode = "human")
    if MOVIE:
        env = wrappers.Monitor(env, monitor_dir, force = True)
    nb_inputs = env.observation_space.shape[0]
    nb_outputs = env.action_space.shape[0]

    policy = Policy(nb_inputs, nb_outputs, hp.env_name)
    normalizer = Normalizer(nb_inputs)

    if LOAD_POLICY_FILENAME is not None:
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load checkpoints that come from a trusted source.
        try:
            with open(LOAD_POLICY_FILENAME, "rb") as f:
                (policy, normalizer, _) = pickle.load(f)
        except Exception as err:
            # Best effort: keep the freshly initialised policy and normalizer.
            print("Failed to read policy data:", LOAD_POLICY_FILENAME, err)

    # The log file and the checkpoints written by train() live in LOGDIR, so
    # make sure it exists before opening anything inside it.
    os.makedirs(LOGDIR, exist_ok=True)

    # train() reads the module-level name `log_file`.
    with open(LOG_FILENAME, 'w') as log_file:
        log_file.write("steps\tscore\tsigma\n")

        print("start training")
        train(env, policy, normalizer, hp, parentPipes)
finally:
    # Shut the workers down even when setup or training fails.
    if MULTI_PROCESSING:
        for parentPipe in parentPipes:
            try:
                parentPipe.send([_CLOSE, "pay2"])
            except OSError:
                # The worker end is already gone; nothing left to notify.
                pass

        for p in processes:
            p.join()

In [None]:

from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline



In [None]:
# Replay a saved policy and render it inline, one frame per environment step.

#LOAD_POLICY_FILENAME = f'{LOGDIR}/kai_policy_HalfCheetahBulletEnv-v0_3000_979.pkl'
LOAD_POLICY_FILENAME = f'{LOGDIR}/kai_policy_HalfCheetahBulletEnv-v0_10000_989.pkl'

# SECURITY: pickle.load executes arbitrary code from the file; only load
# checkpoints that come from a trusted source.
try:
    with open(LOAD_POLICY_FILENAME, "rb") as f:
        (policy, normalizer, _) = pickle.load(f)
except Exception as err:
    # Best effort: fall back to whatever `policy` and `normalizer` are already defined.
    print("Failed to read policy data:", LOAD_POLICY_FILENAME, err)

env = gym.make(hp.env_name)

# Kai - optionally reset the normalizer. Seems to have a harder time at the beginning
# nb_inputs = env.observation_space.shape[0]
# normalizer = Normalizer(nb_inputs)

state = env.reset()
img = plt.imshow(env.render(mode='rgb_array')) # only call this once
for _ in range(1000):
    img.set_data(env.render(mode='rgb_array')) # just update the data
    display.display(plt.gcf())
    display.clear_output(wait=True)

    normalizer.observe(state)
    state = normalizer.normalize(state)
    action = policy.evaluate(state, None, None, hp)
    state, reward, done, _ = env.step(action)
    # Stop once the episode has ended rather than stepping a finished environment.
    if done:
        break