In [1]:
import os

import gym
import matplotlib.pyplot as plt
import numpy as np
from gym import wrappers

%matplotlib inline

In [2]:
# Gym environment id shared by the scratch `env` below and the training entry point.
ENV_NAME = 'BipedalWalker-v2'
# Module-level environment instance; later cells use it to inspect the observation space.
env = gym.make(ENV_NAME)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
class Hp:
    """Plain container for the settings of one training run.

    Every constructor argument is stored unchanged on an attribute of the
    same name:

    episode_length  -- upper bound on the number of steps in a rollout
    learning_rate   -- step size intended for the policy update
    num_deltas      -- how many random directions are sampled per iteration
    num_best_deltas -- how many of those directions are kept for the update
    noise           -- scale applied to a sampled direction during exploration
    seed            -- seed for NumPy's global random generator
    env_name        -- Gym environment id
    record_every    -- evaluation episodes between two video recordings

    Raises AssertionError when num_best_deltas exceeds num_deltas.
    """

    def __init__(self,
                 episode_length=2000,
                 learning_rate=0.02,
                 num_deltas=16,
                 num_best_deltas=16,
                 noise=0.03,
                 seed=1,
                 env_name='BipedalWalker-v2',
                 record_every=50):
        # Cannot keep more directions than were sampled.
        assert num_best_deltas <= num_deltas

        # Environment and bookkeeping
        self.env_name = env_name
        self.seed = seed
        self.record_every = record_every
        self.episode_length = episode_length

        # Search settings
        self.num_deltas = num_deltas
        self.num_best_deltas = num_best_deltas
        self.noise = noise
        self.learning_rate = learning_rate

In [4]:
class Normalizer():
    """Running per-feature mean/variance tracker used to standardise inputs.

    Statistics are accumulated one sample at a time (Welford-style update), so
    no history of samples is stored.

    Attributes:
        n: number of samples seen so far.
        M: running mean, one entry per feature.
        S: running sum of squared deviations from the mean, one entry per feature.
    """

    def __init__(self, nb_inputs):
        """Start with zero samples and all-zero statistics for `nb_inputs` features."""
        self.n = 0
        self.M = np.zeros(nb_inputs)
        self.S = np.zeros(nb_inputs)

    def update(self, x):
        """Fold sample `x` into the running statistics and return it normalised.

        The returned value is normalised with the statistics that already
        include `x`.
        """
        self.n += 1.0
        M_ = self.M + (x - self.M) / self.n
        # Uses both the old and the new mean, which keeps the update numerically stable.
        self.S += (x - self.M) * (x - M_)
        self.M = M_

        return self.normalized_x(x)

    @property
    def mean(self):
        """Running mean, one entry per feature."""
        return self.M

    @property
    def var(self):
        """Population variance (S / n), one entry per feature.

        Always an array shaped like `S`; all zeros until at least two samples
        have been seen, so callers never have to special-case a scalar.
        """
        if self.n > 1:
            return self.S / self.n
        return np.zeros_like(self.S)

    @property
    def std(self):
        """Standard deviation, one entry per feature (square root of `var`)."""
        return np.sqrt(self.var)

    def normalized_x(self, x):
        """Return `x` centred on the running mean and scaled by the running std.

        The std is floored at 1e-2 so features with (almost) no spread do not
        cause a division by zero. The statistics are not modified.
        """
        return (x - self.M) / (self.std.clip(min=1e-2))

In [6]:
class PerceptronModel():
    """Linear policy `action = W @ x` trained with random-search style updates.

    Attributes:
        W: weight matrix of shape (action_size, x_size), initialised to zero.
        learning_rate: step size applied in `update`.
        number_of_deltas: how many perturbations `sample_deltas` draws.
    """

    def __init__(self, x_size, action_size, learning_rate = 0.1, number_of_deltas=10):
        """Create a zero-initialised model with `x_size` inputs and `action_size` outputs."""
        self.W = np.zeros((action_size, x_size))
        self.learning_rate = learning_rate
        # Honour the argument instead of silently forcing a single perturbation.
        self.number_of_deltas = number_of_deltas

    def predict(self,x):
        """Return the model output `W @ x` for input vector `x`."""
        return self.W.dot(x)

    def sample_deltas(self):
        """Draw `number_of_deltas` standard-normal perturbations, each shaped like `W`."""
        return np.random.randn(self.number_of_deltas,*self.W.shape)

    def update(self, rollouts, sigma_rewards):
        """Move `W` along the reward-weighted mean of the explored directions.

        Args:
            rollouts: tuple `(r_pos, r_neg, deltas)`; `r_pos[i]` / `r_neg[i]`
                are the rewards obtained with `+deltas[i]` / `-deltas[i]`, and
                each `deltas[i]` is shaped like `W`.
            sigma_rewards: standard deviation of the rewards, used to scale
                the step.

        The update is skipped when there are no directions or when
        `sigma_rewards` is not a positive number, because the scaled step
        would be undefined.
        """
        r_pos, r_neg, deltas = rollouts
        deltas = np.asarray(deltas, dtype=float)
        if len(deltas) == 0 or not sigma_rewards > 0:
            return
        reward_gap = np.asarray(r_pos, dtype=float) - np.asarray(r_neg, dtype=float)
        # Average over the number of directions. np.average(..., weights=...)
        # would divide by sum(weights) instead, which flips the sign of the
        # step when the sum is negative and fails when it is zero.
        step = np.tensordot(reward_gap, deltas, axes=1) / len(deltas)
        self.W += self.learning_rate * step / sigma_rewards

In [3]:
class ArsTrainer():
    """Random-search trainer for a linear policy on a Gym environment.

    Each training iteration samples random directions shaped like the model
    weights, plays one episode with the weights nudged in the positive and one
    in the negative direction, keeps the best-scoring directions and asks the
    model to update itself from them.

    Attributes:
        hp: hyperparameter object (an `Hp` unless one is supplied).
        env: the Gym environment, wrapped in a Monitor when `monitor_dir` is given.
        input_size / output_size: observation and action dimensionality.
        normalizer: running input normaliser (`update` / `normalized_x`).
        model: policy exposing `W`, `predict(x)` and `update(rollouts, std)`.
        record_video: flag read by the Monitor's video callback.
    """

    def __init__(self,
                 hp=None,
                 input_size=None,
                 output_size=None,
                 normalizer=None,
                 model=None,
                 monitor_dir=None):
        """Build the environment, normaliser and model.

        Args:
            hp: hyperparameters; defaults to `Hp()`.
            input_size: observation size; defaults to the environment's.
            output_size: action size; defaults to the environment's.
            normalizer: input normaliser; defaults to `Normalizer(input_size)`.
            model: policy; defaults to a `PerceptronModel`.
            monitor_dir: when given, episodes are recorded into this directory.
        """
        self.hp = hp or Hp()
        np.random.seed(self.hp.seed)
        # Defined before the Monitor exists because its callback reads this flag.
        self.record_video = False
        self.env = gym.make(self.hp.env_name)
        if monitor_dir is not None:
            should_record = lambda i: self.record_video
            self.env = wrappers.Monitor(self.env, monitor_dir, video_callable=should_record, force=True)
        # Newer gym releases expose `max_episode_steps`; older ones only had
        # `timestep_limit`. Fall back to the configured length if neither is set.
        spec = self.env.spec
        step_limit = getattr(spec, 'max_episode_steps', None) or getattr(spec, 'timestep_limit', None)
        self.hp.episode_length = step_limit or self.hp.episode_length
        self.input_size = input_size or self.env.observation_space.shape[0]
        self.output_size = output_size or self.env.action_space.shape[0]
        self.normalizer = normalizer or Normalizer(self.input_size)
        # Pass the learning rate itself, not the whole hyperparameter object.
        self.model = model or PerceptronModel(self.input_size, self.output_size, self.hp.learning_rate)

    def _rollout(self, delta=None, direction=0.0, learn_stats=False):
        """Play one episode and return the sum of rewards clipped to [-1, 1].

        Args:
            delta: weight-shaped perturbation, or None for the unperturbed policy.
            direction: +1 or -1 to apply `delta` in that direction, 0 for none.
            learn_stats: when True the normaliser statistics are updated with
                every observed state; otherwise they are only read.
        """
        state = self.env.reset()
        done = False
        steps = 0
        total_reward = 0.0
        while not done and steps < self.hp.episode_length:
            if learn_stats:
                observation = self.normalizer.update(state)
            else:
                observation = self.normalizer.normalized_x(state)
            action = self.model.predict(observation)
            if delta is not None and direction:
                # For a linear policy this equals (W + direction * noise * delta) @ observation,
                # i.e. the perturbation is applied to the weights, not to the action vector.
                action = action + direction * self.hp.noise * np.dot(delta, observation)
            state, reward, done, _ = self.env.step(action)
            total_reward += max(min(reward, 1), -1)
            steps += 1
        return total_reward

    # Evaluate the model
    def evaluate(self):
        """Play one episode with the current weights and return its clipped reward sum.

        The normaliser statistics are used but not updated.
        """
        return self._rollout()

    # Explore the model
    def explore(self, delta=None):
        """Play one episode in each direction of `delta`.

        Args:
            delta: perturbation shaped like the model weights. None plays both
                episodes with the unperturbed policy.

        Returns:
            Tuple `(sum_rewards_pos, sum_rewards_neg)` of clipped reward sums.
        """
        sum_rewards_pos = self._rollout(delta, direction=1.0, learn_stats=True)
        sum_rewards_neg = self._rollout(delta, direction=-1.0, learn_stats=True)
        return sum_rewards_pos, sum_rewards_neg

    def train(self, num_episodes=None):
        """Run the training loop, printing the evaluation reward after each iteration.

        Args:
            num_episodes: number of training iterations. Defaults to
                `hp.episode_length`, which is what the loop has always used.
        """
        if num_episodes is None:
            num_episodes = self.hp.episode_length
        for eposide in range(num_episodes):
            # initialize the random noise deltas and the positive/negative rewards
            deltas = np.random.randn(self.hp.num_deltas, *self.model.W.shape)
            # Builtin float: the np.float alias no longer exists in current NumPy.
            positive_rewards = np.zeros(self.hp.num_deltas, dtype=float)
            negative_rewards = np.zeros(self.hp.num_deltas, dtype=float)

            # play an episode each with positive deltas and negative deltas, collect rewards
            # TODO: parallelize the exploration
            for i in range(self.hp.num_deltas):
                positive_rewards[i], negative_rewards[i] = self.explore(delta=deltas[i])

            # Compute the standard deviation of all rewards
            reward_std = np.hstack([positive_rewards, negative_rewards]).std()

            # Sort the rollouts by the max(r_pos, r_neg) and select the deltas with best rewards
            reward_maximum = np.maximum(positive_rewards, negative_rewards)
            best = np.argsort(reward_maximum)[-self.hp.num_best_deltas:]
            rollouts = positive_rewards[best], negative_rewards[best], deltas[best]
            # Identical rewards everywhere carry no signal and would divide by zero.
            if reward_std > 0:
                self.model.update(rollouts, reward_std)

            # Only record video during evaluation, every n steps
            self.record_video = eposide % self.hp.record_every == 0
            try:
                # Play an episode with the new weights and print the score
                reward_evaluation = self.evaluate()
            finally:
                self.record_video = False
            print('Eposiode: ', eposide, 'Reward: ', reward_evaluation)


def mkdir(base, name):
    """Ensure the directory `base/name` exists and return its path.

    Missing intermediate directories are created as well. An already existing
    directory is left untouched.

    Args:
        base: parent directory.
        name: directory name (may contain sub-directories) relative to `base`.

    Returns:
        The joined path `os.path.join(base, name)`.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    path = os.path.join(base, name)
    # exist_ok avoids the race between checking for the path and creating it.
    os.makedirs(path, exist_ok=True)
    return path

# Main code
if __name__ == '__main__':
    # Recordings go to ./videos/<ENV_NAME>; mkdir creates each level if it is missing.
    videos_dir = mkdir('.', 'videos')
    monitor_dir = mkdir(videos_dir, ENV_NAME)
    # Default hyperparameters except for the seed and the environment id.
    hp = Hp(seed=1946, env_name=ENV_NAME)
    # Passing monitor_dir makes the trainer wrap its environment in a recording Monitor.
    trainer = ArsTrainer(hp=hp, monitor_dir=monitor_dir)
    trainer.train()

array([ 2.74745352e-03, -9.96619696e-06,  7.75269549e-04, -1.59999323e-02,
        9.20130238e-02, -1.02308916e-03,  8.60239461e-01,  2.15908761e-03,
        1.00000000e+00,  3.24187465e-02, -1.02302199e-03,  8.53788137e-01,
        7.23540550e-04,  1.00000000e+00,  4.40814018e-01,  4.45820123e-01,
        4.61422771e-01,  4.89550203e-01,  5.34102798e-01,  6.02461040e-01,
        7.09148884e-01,  8.85931849e-01,  1.00000000e+00,  1.00000000e+00])

In [24]:
class SampleNormalizer():
    """Running per-feature mean/variance tracker used to standardise inputs.

    Statistics are accumulated one sample at a time (Welford-style update), so
    no history of samples is stored.

    Attributes:
        n: number of samples seen so far.
        mu: running mean, one entry per feature.
        S: running sum of squared deviations from the mean, one entry per feature.
    """

    def __init__(self, nb_inputs):
        """Start with zero samples and all-zero statistics for `nb_inputs` features."""
        self.n = 0
        self.mu = np.zeros(nb_inputs)
        self.S = np.zeros(nb_inputs)

    def update(self, x):
        """Fold sample `x` into the running statistics and return it normalised.

        The returned value is normalised with the statistics that already
        include `x`.
        """
        self.n += 1.0
        mu_ = self.mu + (x - self.mu) / self.n
        # Uses both the old and the new mean, which keeps the update numerically stable.
        self.S += (x - self.mu) * (x - mu_)
        self.mu = mu_

        return self.normalized_x(x)

    @property
    def mean(self):
        """Running mean, one entry per feature."""
        return self.mu

    @property
    def var(self):
        """Population variance (S / n), one entry per feature.

        Always an array shaped like `S`; all zeros until at least two samples
        have been seen, so callers never have to special-case a scalar.
        """
        if self.n > 1:
            return self.S / self.n
        return np.zeros_like(self.S)

    @property
    def std(self):
        """Standard deviation, one entry per feature (square root of `var`)."""
        return np.sqrt(self.var)

    def normalized_x(self, x):
        """Return `x` centred on the running mean and scaled by the running std.

        The std is floored at 1e-2 so features with (almost) no spread do not
        cause a division by zero. The statistics are not modified.
        """
        return (x - self.mu) / (self.std.clip(min=1e-2))

In [61]:
# Running-statistics tracker for a 2-feature input.
sampler = SampleNormalizer(2)

In [72]:
# Feed 1000 samples drawn from N(15, 3) and N(-100, 20); `a` collects the normalised value returned for each.
a = [sampler.update([np.random.normal(15,3),np.random.normal(-100,20)]) for _ in range(1000)]

In [73]:
# Inspect the running estimates; they should be close to mean (15, -100) and std (3, 20).
sampler.mean,sampler.std,sampler.var

(array([  15.00099827, -100.0643635 ]),
 array([ 2.93004598, 19.89156019]),
 array([  8.58516946, 395.67416661]))

In [78]:
class PerceptronModel():
    """Linear policy `action = W @ x` trained with random-search style updates.

    Attributes:
        W: weight matrix of shape (action_size, x_size), initialised to zero.
        learning_rate: step size applied in `update`.
    """

    def __init__(self, x_size, action_size, learning_rate = 0.1):
        """Create a zero-initialised model with `x_size` inputs and `action_size` outputs."""
        self.W = np.zeros((action_size, x_size))
        self.learning_rate = learning_rate

    def predict(self,x):
        """Return the model output `W @ x` for input vector `x`."""
        return self.W.dot(x)

    def update(self, rollouts, reward_std):
        """Move `W` along the reward-weighted mean of the explored directions.

        Args:
            rollouts: tuple `(r_pos, r_neg, deltas)`; `r_pos[i]` / `r_neg[i]`
                are the rewards obtained with `+deltas[i]` / `-deltas[i]`, and
                each `deltas[i]` is shaped like `W`.
            reward_std: standard deviation of the rewards, used to scale the step.

        The update is skipped when there are no directions or when
        `reward_std` is not a positive number, because the scaled step would
        be undefined.
        """
        r_pos, r_neg, deltas = rollouts
        deltas = np.asarray(deltas, dtype=float)
        if len(deltas) == 0 or not reward_std > 0:
            return
        reward_gap = np.asarray(r_pos, dtype=float) - np.asarray(r_neg, dtype=float)
        # Average over the number of directions. np.average(..., weights=...)
        # would divide by sum(weights) instead, which flips the sign of the
        # step when the sum is negative and fails when it is zero.
        step = np.tensordot(reward_gap, deltas, axes=1) / len(deltas)
        self.W += self.learning_rate * step / reward_std

In [79]:
# Linear model with 5 inputs and 3 outputs; its weights start at zero.
model = PerceptronModel(5,3)

In [98]:
# Forward pass on a 5-element input; yields one value per row of the weight matrix.
model.predict(np.array([1,2,3,4,5]))

array([-7.14644458, -3.99051403,  1.59424655])

In [96]:
# Fake rollouts: 10 positive rewards, 10 negative rewards and 10 random perturbations of shape (3, 5).
rollouts = np.random.randn(10),np.random.randn(10),np.random.randn(10,3,5)

In [92]:
# Weighted average of the perturbations, weighted by (positive - negative) reward.
# The weights are computed inline so this cell does not depend on a name that is
# only assigned in a later cell, and therefore survives Restart & Run All.
np.average(rollouts[2], axis=0, weights=rollouts[0] - rollouts[1])

array([[-2.02016838, -0.3448374 , -1.20884074],
       [-2.87217114, -1.98339329,  0.36971208],
       [-2.3147062 , -1.77905332, -1.76713383],
       [-1.70359096, -2.21546607,  0.9282644 ],
       [-4.44653005, -6.61979176, -4.68115183]])

In [90]:
# Per-rollout reward difference (positive minus negative), used as averaging weights.
r_delta= rollouts[0] - rollouts[1]

In [97]:
# Apply one weight update from the fake rollouts, passing 0.1 as the reward standard deviation.
model.update(rollouts,0.1)

In [101]:
# Observation dimensionality and the full observation space of the scratch env.
env.observation_space.shape[0],env.observation_space

(24, Box(24,))

In [104]:
# Clamp -3 into [-1, 1] with the builtins (the same clipping ArsTrainer applies to rewards).
max(min(-3,1),-1)

-1

In [105]:
# The same clamp of -3 into [-1, 1], expressed with np.clip.
np.clip(-3,-1,1)

-1