In [8]:
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns
%matplotlib inline

In [1]:
# import
import gym
env = gym.make('CartPole-v1')

In [2]:
obs = env.reset()
num_params = len(obs)

In [3]:
# this is how we evaluate the function we're using and returns a fitness score
# once the pole falls to a certain angle, then you'll fail

def evaluate(W):
    X = env.reset()
    for t in range(1,201):
        action = 0 if W@X < 0 else 1
        X, reward, done, _ = env.step(action)
        if done:
            return t
    return t

In [6]:
"""Simple Evolution strategies"""

# let's create a class called SimpleES
class SimpleES():

    # we're going to initialise the class with the following variables for each object
    def __init__(self, popsize=256):
        self.popsize = popsize  
        self.mu = np.random.normal(0,1,4)  # this selects 4 random variables from a normal distribution centred around 0 with a spread of 1 
        self.cov = np.full((4,4), 0.5)  # creates a 4x4 matrix, filled with values of 0.5
        self.best_s = self.mu  # best_s is reassigned to mu
        self.best_r = 0  
        self.first_gen = True
    
    def ask(self):
        # this draws random samples from a multivariate normal distribution (ie a Gaussian distribution in multi-dimensions) 
        # our problem is 4D. So we provide 4 random means. And provide covariance matrix 4x4
        # outputs an array of randomly sampled numbers from a multiD gaussian distribution, length popsize
        # these are our weights?
        self.sols = np.random.multivariate_normal(self.mu, self.cov, self.popsize)
        return self.sols
        pass
        
    def tell(self, fit_list):
        # I think this is what we want to maximise?
        self.fit = fit_list
        self.best_i = np.argmax(self.fit)  # finds the index of the max value in the fit_list array
        self.best_s = self.sols[self.best_i]  # collects the (randomly generated gaussian) value of our sols array in the same index 
        self.best_r_ = self.fit[self.best_i]  # colects the value of our fit_list array in the same index
        
        # on the first go, it sets first_gen to False and best_r to value found from our fit array
        # on the next go, if our newly selected value from the fit array is bigger than the last, then it is updated
        if self.first_gen or (self.best_r < self.best_r_): 
            self.first_gen = False
            self.best_r = self.best_r_
        
        # mu is set to be the the value in the same index from our sols array
        self.mu = self.best_s
        
        pass
    
    def result(self):
        # return the best fitness and solution
        # here we're returning the best weight, and the max value on the array that we got to
        return self.best_s, self.best_r, self.best_r_, None
        pass
        

In [7]:
class SimpleGA:
    """Simple Genetic Algorithm"""
    def __init__(self, num_params, # Number of input features
                 popsize=256,      # Number of sols that we want to generate 
                 sig_init=0.1,     # Std deviation 
                 sig_decay=0.999,  # Rate of decay for std deviation
                 sig_lim=0.01,     # Min limit when to stop the decay
                 elite_ratio=0.1,  # Elite popuation % to keep
                 w_decay=0.1,      
                 forget_best=False):
        
        self.num_params = num_params
        self.popsize = popsize
        self.sig_init = sig_init
        self.sig_decay = sig_decay
        self.sig_lim = sig_lim
        self.elite_ratio = elite_ratio
        self.w_decay = w_decay
        self.first_gen = True
        self.forget_best = forget_best
        self.sig = self.sig_init
        
        # Initiate the size of elite population (total best sols to keep)
        self.elite_popsize = int(self.popsize*self.elite_ratio)
        # Initiate weights for best sols
        self.elite_w = np.zeros((self.elite_popsize, self.num_params))
        # Initiate fitness for best sols
        self.elite_r = np.zeros(self.elite_popsize)
        # Initiate parameters for best solution
        self.best_s = np.zeros(self.num_params)
        # Initiate best reward
        self.best_r = 0
        
    def ask(self):
        # Gaussian noise to be added after random recombination of bst sols (mating)
        self.noise = np.random.randn(self.popsize, self.num_params)*self.sig
        solutions = []
        
        ### --- TO DO --- ###
        # Mating: generate a new solution by
        # randomly combining parameters from two different parent(elite) solutions 
        # add gaussian noise to the newly generated solution
        # Hint1: np.random.rand
        # Hint2: np.random.choice
        # Hint3: Creating a separate 'mate' function might help
        
        ### --- TO DO --- ###
        
        # convert the list to numpy array
        solutions = np.array(solutions)
        self.solutions = solutions
        return solutions
        
    def tell(self, reward_list):
        # assert that we have reward for every solution
        assert (len(reward_list) == self.popsize), "Incosistant reward size"
        r_list = reward_list    
        
        if self.forget_best or self.first_gen:
            r = r_list
            soln = self.solutions
        else: 
            # add new rewards & solns to best from last genenrations.
            r = np.concatenate([r_list,  self.elite_r])
            soln = np.concatenate([self.solutions, self.elite_w])
        
        # get the indices for population with best rewards (elite population)
        idx = r.argsort()[::-1][0:self.elite_popsize]
        self.elite_r = r[idx]
        self.elite_w = soln[idx]
        
        # best reward for this interation
        self.best_r_ = self.elite_r[0]
        
        if self.first_gen or (self.best_r_ > self.best_r):
            self.first_gen = False
            self.best_s = np.copy(self.elite_w[0])
            self.best_r = self.elite_r[0]
        
        if self.sig > self.sig_lim:
            self.sig *= self.sig_decay
    
    def result(self):
        return self.best_s, self.best_r, self.best_r_, self.sig
        
        

In [8]:
class SimpleNES():
    """Simple Natural Evolution Strategies"""
    def __init__(self, num_params, 
                 popsize=256, 
                 sig_init=0.1, 
                 sig_decay=0.999, 
                 sig_lim=0.01, 
                 alpha = 0.1):
        
        self.num_params = num_params
        self.popsize = popsize
        self.sig_init = sig_init
        self.sig_decay = sig_decay
        self.sig_lim = sig_lim
        self.first_gen = True
        self.sig = self.sig_init
        self.alpha = alpha
        
        # Initialise the memory for solutions and best solution
        self.solutions = np.random.randn(self.popsize, self.num_params)
        self.best_s = np.zeros(self.num_params)
        self.best_r = 0
        
    def ask(self):
        ### --- TO DO --- ###
        # Return array of solutions with guasina noise added to it.
        pass
        #return solutions
        ### --- TO DO --- ###
        
    def tell(self, reward_list):
        assert (len(reward_list) == self.popsize), "Inconsistant reward size"
        
        ### --- TO DO --- ###
        # Update best current reward and best solution
        # ---idx = 
        # ---self.best_r_ = 
        # ---self.best_s = 
        
        # Normalise the reward to gaussian distribution
        # ---self.r = (reward - mean(reward)) / std(reward)
        
        # Perform the parameter update (SGD)
        # ---delta = Noise weighted by respective reward (Hint: np.dot/@ and noise.T)
        # ---j_theta = (alpha / num_solutions*sigma) * delta 
        # --- solutions = solution + j_theta
        ### --- TO DO --- ###
        
        if self.first_gen or (self.best_r_ > self.best_r):
            self.first_gen = False
            self.best_r = self.best_r_
        
        if self.sig > self.sig_lim:
            self.sig *= self.sig_decay
            
    def result(self):
        return self.best_s, self.best_r, self.best_r_, self.sig
        

In [10]:
MY_REQUIRED_FITNESS = 199

In [21]:
solver = SimpleES(num_params)
e = 0
while e < 100:

    # ask the ES to give us a set of candidate solutions
    solutions = solver.ask()

    # create an array to hold the fitness results.
    fitness_list = np.zeros(solver.popsize)

    # evaluate the fitness for each given solution.
    for i in range(solver.popsize):
        fitness_list[i] = evaluate(solutions[i])

    # give list of fitness results back to ES
    solver.tell(fitness_list)

    # get best parameter, fitness from ES
    best_solution, best_fitness_ever, best_fitness_current, sigma = solver.result()
    e += 1
    print(e, best_fitness_ever, best_fitness_current)
    #print (best_fitness)
    if best_fitness_ever > MY_REQUIRED_FITNESS:
        break

1 11.0 11.0
2 11.0 10.0
3 11.0 10.0
4 11.0 10.0
5 11.0 10.0
6 11.0 10.0
7 11.0 11.0
8 11.0 10.0
9 11.0 11.0
10 11.0 11.0
11 11.0 10.0
12 11.0 10.0
13 11.0 10.0
14 11.0 10.0
15 11.0 11.0
16 11.0 10.0
17 11.0 10.0
18 11.0 11.0
19 11.0 10.0
20 11.0 11.0
21 11.0 10.0
22 11.0 10.0
23 11.0 10.0
24 11.0 10.0
25 11.0 10.0
26 11.0 10.0
27 11.0 11.0
28 11.0 10.0
29 11.0 10.0
30 11.0 10.0
31 11.0 10.0
32 11.0 10.0
33 11.0 10.0
34 11.0 10.0
35 11.0 10.0
36 11.0 10.0
37 11.0 11.0
38 11.0 10.0
39 11.0 11.0
40 11.0 10.0
41 11.0 10.0
42 11.0 10.0
43 11.0 10.0
44 11.0 10.0
45 11.0 10.0
46 11.0 10.0
47 11.0 10.0
48 11.0 9.0
49 11.0 10.0
50 11.0 10.0
51 11.0 10.0
52 11.0 10.0
53 11.0 10.0
54 11.0 9.0
55 11.0 10.0
56 11.0 11.0
57 11.0 11.0
58 11.0 10.0
59 11.0 10.0
60 11.0 10.0
61 11.0 10.0
62 11.0 11.0
63 11.0 11.0
64 11.0 11.0
65 11.0 11.0
66 11.0 11.0
67 11.0 10.0
68 11.0 10.0
69 11.0 10.0
70 11.0 10.0
71 11.0 11.0
72 11.0 10.0
73 11.0 10.0
74 11.0 10.0
75 11.0 11.0
76 11.0 10.0
77 11.0 11.0
78 11.0 10

In [22]:
import gym
env = gym.make('CartPole-v0')
x = env.reset()
W = [ 0.00543876, -0.0581728 ,  0.08738324,  0.10999395]
for _ in range(1000):
    env.render()
    action = 0 if W@x < 0 else 1
    #action = env.action_space.sample()
    x, r, d, _ = env.step(action)

NotImplementedError: abstract