In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns
%matplotlib inline

In [2]:
import gym
# Shared CartPole-v1 environment; evaluate() resets and steps this object
# for every candidate it scores.
env = gym.make('CartPole-v1')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
# Reset once to obtain a sample observation; its length is used as the number
# of policy parameters handed to the solvers (one weight per observation entry).
obs = env.reset()
num_params = len(obs)

In [4]:
def evaluate(W, max_steps=200, environment=None):
    """Run one episode with a linear threshold policy and return its length.

    At every step the action is 0 when ``W @ X`` is negative and 1 otherwise,
    where ``X`` is the most recent observation.

    Parameters
    ----------
    W : array-like
        Policy weights; must support ``W @ X`` with an observation.
    max_steps : int, optional
        Upper bound on the number of steps taken. Defaults to 200, the value
        that used to be hard-coded.
    environment : object, optional
        Object exposing ``reset()`` and ``step(action)``, where ``step``
        returns a 4-tuple ``(observation, reward, done, info)``. When omitted,
        the notebook-level ``env`` is used.

    Returns
    -------
    int
        Number of steps taken before ``done`` was reported, capped at
        ``max_steps`` (0 if ``max_steps`` is not positive).
    """
    if environment is None:
        # Fall back to the notebook's global environment so existing
        # evaluate(W) calls keep working unchanged.
        environment = env

    X = environment.reset()
    steps = 0  # defined up front so an empty horizon returns 0 instead of failing
    for steps in range(1, max_steps + 1):
        action = 0 if W @ X < 0 else 1
        X, reward, done, _ = environment.step(action)
        if done:
            break
    return steps

In [5]:
class simpleES():
    """Greedy Gaussian evolution strategy.

    Each generation samples ``popsize`` candidates from a multivariate normal
    centred on ``mu`` and then re-centres ``mu`` on the single best candidate.
    The covariance stays fixed.

    Parameters
    ----------
    popsize : int, optional
        Number of candidates sampled per generation.
    num_params : int, optional
        Dimensionality of a candidate. Defaults to 4, the value that used to
        be hard-coded.
    variance : float, optional
        Per-parameter sampling variance (diagonal of the covariance).
    """
    def __init__(self, popsize=256, num_params=4, variance=0.5):
        self.popsize = popsize
        self.num_params = num_params
        self.mu = np.random.normal(0, 1, num_params)
        # Isotropic covariance. A matrix filled with a constant has rank 1,
        # which makes every sample differ from mu only along the all-ones
        # direction, so the search could never leave a 1-D line.
        self.cov = np.eye(num_params) * variance

    def ask(self):
        """Sample and return a ``(popsize, num_params)`` array of candidates."""
        self.sols = np.random.multivariate_normal(self.mu, self.cov, self.popsize)
        return self.sols

    def tell(self, fit_list):
        """Record the fitness of the last ``ask()`` batch and re-centre on the best.

        ``fit_list[i]`` is the fitness of ``self.sols[i]``; higher is better.
        """
        self.fit = fit_list
        self.bst_i = np.argmax(self.fit)
        # Update the mean here rather than in result(), so the search
        # progresses even if the caller never asks for the result.
        self.mu = np.copy(self.sols[self.bst_i])

    def result(self):
        """Return ``(best_candidate, best_fitness)`` from the last ``tell()``."""
        return self.mu, self.fit[self.bst_i]
        
        

In [6]:
# Scratch array: 25 rows of 4 zero-valued parameters.
w = np.zeros(shape=(25, 4))

In [7]:
# Pick a random integer in [0, 25), i.e. a row index into the 25-row scratch array `w`.
idx = np.random.choice(25)

In [8]:
# Indices of the positions whose uniform draw exceeds 0.5 -- the same index
# selection that SimpleGA's mate() uses to decide which genes come from parent b.
np.nonzero(np.random.rand(w[idx].size) > 0.5)

(array([1, 2, 3]),)

In [9]:
class SimpleGA:
    """Simple Genetic Algorithm.

    Keeps an elite set of the best candidates seen. Each generation is built
    by uniform crossover between two randomly chosen elites plus Gaussian
    mutation noise.

    Parameters
    ----------
    num_params : int
        Number of parameters in one candidate solution.
    popsize : int, optional
        Number of candidates produced per generation.
    sig_init : float, optional
        Initial standard deviation of the mutation noise.
    sig_decay : float, optional
        Factor the noise standard deviation is multiplied by after each
        ``tell()`` while it is still above ``sig_lim``.
    sig_lim : float, optional
        The noise standard deviation stops decaying once it is no longer
        above this value.
    elite_ratio : float, optional
        Fraction of ``popsize`` kept as elites (at least one elite is kept).
    w_decay : float, optional
        Stored on the instance but not used by this implementation.
    forget_best : bool, optional
        When True, elites from earlier generations are not merged into the
        selection pool; only the current generation competes.
    """
    def __init__(self, num_params,
                 popsize=256,
                 sig_init=0.1,
                 sig_decay=0.999,
                 sig_lim=0.01,
                 elite_ratio=0.1,
                 w_decay=0.1,
                 forget_best=False):

        self.num_params = num_params
        self.popsize = popsize
        self.sig_init = sig_init
        self.sig_decay = sig_decay
        self.sig_lim = sig_lim
        self.elite_ratio = elite_ratio
        self.w_decay = w_decay
        self.first_gen = True
        self.forget_best = forget_best
        self.sig = self.sig_init

        # Keep at least one elite: with zero elites ask() has no parents to
        # draw from and np.random.choice(0) raises.
        self.elite_popsize = max(1, int(self.popsize * self.elite_ratio))
        self.elite_w = np.zeros((self.elite_popsize, self.num_params))
        self.elite_r = np.zeros(self.elite_popsize)
        self.best_params = np.zeros(self.num_params)
        self.bst_r = 0
        # Initialised so result() is usable before the first tell().
        self.bst_r_ = 0

    @staticmethod
    def _mate(a, b):
        """Uniform crossover: each gene comes from ``b`` where a uniform draw
        exceeds 0.5 and from ``a`` otherwise. Returns a new array."""
        take_from_b = np.random.rand(a.size) > 0.5
        return np.where(take_from_b, b, a)

    def ask(self):
        """Create and return a ``(popsize, num_params)`` array of candidates."""
        self.noise = np.random.randn(self.popsize, self.num_params) * self.sig
        solutions = np.empty((self.popsize, self.num_params))

        for i in range(self.popsize):
            # Two parents drawn independently (they may be the same elite).
            idx_a = np.random.choice(self.elite_popsize)
            idx_b = np.random.choice(self.elite_popsize)
            child = self._mate(self.elite_w[idx_a], self.elite_w[idx_b])
            solutions[i] = child + self.noise[i]  # mutate the child

        self.solutions = solutions
        return solutions

    def tell(self, reward_list):
        """Report the reward of every candidate from the last ``ask()``.

        ``reward_list[i]`` is the reward of ``self.solutions[i]``; higher is
        better. Updates the elite set, the best-ever record and the noise
        scale.

        Raises
        ------
        AssertionError
            If ``reward_list`` does not hold exactly ``popsize`` rewards.
        """
        # one reward is required per candidate
        assert (len(reward_list) == self.popsize), "Incosistant reward size"
        # Accept plain sequences as well as arrays; argsort below needs an ndarray.
        r_list = np.asarray(reward_list)

        if self.forget_best or self.first_gen:
            r = r_list
            soln = self.solutions
        else:
            # Let the previous elites compete with the new generation.
            r = np.concatenate([r_list, self.elite_r])
            soln = np.concatenate([self.solutions, self.elite_w])

        # Indices of the highest rewards, best first.
        idx = r.argsort()[::-1][0:self.elite_popsize]
        self.elite_r = r[idx]
        self.elite_w = soln[idx]

        # best reward in the current selection pool
        self.bst_r_ = self.elite_r[0]

        if self.first_gen or (self.bst_r_ > self.bst_r):
            self.first_gen = False
            self.best_params = np.copy(self.elite_w[0])
            self.bst_r = self.elite_r[0]

        if self.sig > self.sig_lim:
            self.sig *= self.sig_decay

    def result(self):
        """Return ``(best_params, best_reward_ever, best_reward_current, sigma)``."""
        return self.best_params, self.bst_r, self.bst_r_, self.sig
        
        

In [10]:
class SimpleNES():
    """Simple/bare-bones Natural Evolution Strategies solver.

    Parameters
    ----------
    num_params : int
        Number of parameters in one candidate solution.
    popsize : int, optional
        Number of candidates per generation.
    sig_init : float, optional
        Initial standard deviation of the perturbation noise.
    sig_decay : float, optional
        Factor the noise standard deviation is multiplied by after each
        ``tell()`` while it is still above ``sig_lim``.
    sig_lim : float, optional
        The noise standard deviation stops decaying once it is no longer
        above this value.
    alpha : float, optional
        Step size of the reward-weighted update.
    """
    def __init__(self, num_params,
                 popsize=256,
                 sig_init=0.1,
                 sig_decay=0.999,
                 sig_lim=0.01,
                 alpha = 0.1):

        self.num_params = num_params
        self.popsize = popsize
        self.sig_init = sig_init
        self.sig_decay = sig_decay
        self.sig_lim = sig_lim
        self.first_gen = True
        self.sig = self.sig_init
        self.alpha = alpha

        self.solutions = np.random.randn(self.popsize, self.num_params)
        self.best_s = np.zeros(self.num_params)
        # Best reward ever / best reward of the latest generation. Both are
        # initialised so result() is usable before the first tell().
        self.best_r = 0
        self.best_r_ = 0

    def ask(self):
        """Perturb the stored population with fresh noise and return it.

        The perturbed ``(popsize, num_params)`` array replaces
        ``self.solutions``; the noise is kept in ``self.noise`` for ``tell()``.
        """
        self.noise = np.random.randn(self.popsize, self.num_params) * self.sig
        solutions = self.solutions + self.noise
        self.solutions = solutions
        return solutions

    def tell(self, reward_list):
        """Report the reward of every candidate from the last ``ask()``.

        ``reward_list[i]`` is the reward of ``self.solutions[i]``; higher is
        better. Updates the best-so-far record, shifts the population along
        the reward-weighted noise direction and decays the noise scale.

        Raises
        ------
        AssertionError
            If ``reward_list`` does not hold exactly ``popsize`` rewards.
        """
        assert (len(reward_list) == self.popsize), "Inconsistant reward size"
        rewards = np.asarray(reward_list)

        idx = np.argmax(rewards)
        self.best_r_ = rewards[idx]

        # Track the best-ever reward together with the solution that earned
        # it, so result() returns a matching pair. The solution is copied
        # before the population is shifted below.
        if self.first_gen or (self.best_r_ > self.best_r):
            self.first_gen = False
            self.best_r = self.best_r_
            self.best_s = np.copy(self.solutions[idx])

        # Standardise the rewards. When every candidate scored the same there
        # is no preference signal; use zeros rather than dividing by a zero
        # standard deviation, which would turn every solution into NaN.
        spread = np.std(rewards)
        if spread > 0:
            self.r = (rewards - np.mean(rewards)) / spread
        else:
            self.r = np.zeros(self.popsize)

        # noise.T @ r has shape (num_params,) and is broadcast onto every row.
        self.solutions = self.solutions + self.alpha / (self.popsize*self.sig) * np.dot(self.noise.T, self.r)

        if self.sig > self.sig_lim:
            self.sig *= self.sig_decay

    def result(self):
        """Return ``(best_solution, best_reward_ever, best_reward_current, sigma)``."""
        return self.best_s, self.best_r, self.best_r_, self.sig
        

In [11]:
# The training loop stops as soon as the best-ever fitness exceeds this value.
# evaluate() returns at most 200 steps, so only a full-length episode qualifies.
MY_REQUIRED_FITNESS = 199

In [12]:
solver = SimpleGA(num_params)

# Run at most 100 generations; `e` is the 1-based generation counter.
for e in range(1, 101):
    # Candidate solutions proposed for this generation.
    solutions = solver.ask()

    # Score every candidate (kept as floats, one entry per candidate).
    fitness_list = np.array(
        [evaluate(candidate) for candidate in solutions], dtype=float
    )

    # Feed the scores back so the solver can update its state.
    solver.tell(fitness_list)

    # Best parameters and fitness statistics after this generation.
    best_solution, best_fitness_ever, best_fitness_current, sigma = solver.result()
    print(e, best_fitness_ever, best_fitness_current)

    # Stop early once the target fitness has been beaten.
    if best_fitness_ever > MY_REQUIRED_FITNESS:
        break

1 200.0 200.0
