In [None]:
import numpy as np
import lqg1d
import matplotlib.pyplot as plt
import utils
from tqdm import tqdm

In [None]:
class ConstantStep:
    """Plain gradient step: the gradient is scaled by a fixed learning rate."""

    def __init__(self, learning_rate):
        # The same step size is applied at every update.
        self.learning_rate = learning_rate

    def update(self, gt):
        """Return the parameter increment ``learning_rate * gt`` for gradient ``gt``."""
        step_size = self.learning_rate
        return step_size * gt

In [None]:
class NormalizedConstantStep(object):
    """Fixed-length step along the gradient direction.

    The gradient is rescaled to unit Euclidean norm before being multiplied by
    the learning rate, so every non-zero update has norm ``learning_rate``.
    """

    def __init__(self, learning_rate):
        self.learning_rate = learning_rate

    def update(self, g):
        """Return ``learning_rate * g / ||g||``.

        Parameters
        ----------
        g : scalar or array-like
            Gradient estimate.

        Returns
        -------
        Increment with the same shape as ``g``. A gradient that is exactly
        zero has no direction, so a zero step is returned instead of dividing
        by a zero norm (which previously produced NaN).
        """
        g = np.asarray(g)
        norm = np.linalg.norm(g)
        if norm == 0:
            # Nothing to normalise: stay where we are.
            return g * 0.0
        delta_g = self.learning_rate / norm * g
        return delta_g

In [None]:
class AdamStep(object):
    """Adam update rule: running first/second moment estimates with bias correction."""

    def __init__(self, learning_rate, beta1=0.9, beta2=0.999, epsilon=10**(-8)):
        # Hyper-parameters of the rule.
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # Running estimates of the gradient mean and of its element-wise square.
        self.m = 0
        self.v = 0
        # beta**t for the upcoming update t (t = 1 at construction time);
        # used to undo the bias caused by starting m and v at zero.
        self.beta1_power = beta1
        self.beta2_power = beta2

    def update(self, g):
        """Fold gradient ``g`` into the moment estimates and return the step.

        The returned increment is
        ``learning_rate * m_hat / (sqrt(v_hat) + epsilon)`` where ``m_hat`` and
        ``v_hat`` are the bias-corrected moments.
        """
        b1, b2 = self.beta1, self.beta2

        # Exponential moving averages of g and g*g.
        self.m = b1 * self.m + (1 - b1) * g
        self.v = b2 * self.v + (1 - b2) * np.multiply(g, g)

        # Bias-corrected moments for the current update index.
        first_moment = self.m / (1 - self.beta1_power)
        second_moment = self.v / (1 - self.beta2_power)

        # Advance beta**t so the next call uses the next exponent.
        self.beta1_power *= b1
        self.beta2_power *= b2

        return self.learning_rate * first_moment / (np.sqrt(second_moment) + self.epsilon)

In [None]:
class GaussianPolicy:
    """Gaussian policy whose mean is linear in the state: a ~ N(theta * s, sigma**2)."""

    def __init__(self, theta, sigma):
        self.theta, self.sigma = theta, sigma

    def draw_action(self, s):
        """Sample an action for state ``s`` from N(s * theta, sigma**2)."""
        mean = s*self.theta
        return np.random.normal(mean, self.sigma)

    def gradient_log(self, a, s):
        """Gradient of log pi(a | s) with respect to (theta, sigma).

        Returns ``np.array([d_theta, d_sigma])``.
        """
        # Deviation of the taken action from the policy mean.
        residual = a-s*self.theta
        wrt_theta = residual/(self.sigma**2)*s
        wrt_sigma = (residual**2 - self.sigma**2)/(self.sigma**3)
        return np.array([wrt_theta, wrt_sigma])

    def update(self, theta, sigma):
        """Overwrite both policy parameters."""
        self.theta, self.sigma = theta, sigma

In [None]:
def estimate_gradient(paths, policy, discount):
    """REINFORCE estimate of the policy gradient from sampled trajectories.

    For every trajectory the summed score ``sum_t grad log pi(a_t | s_t)`` is
    multiplied by the trajectory's discounted return ``sum_t discount**t * r_t``;
    the result is averaged over trajectories.

    Parameters
    ----------
    paths : sequence of dict
        Each dict is indexed with the keys ``'states'``, ``'actions'`` and
        ``'rewards'``; all three are read at positions
        ``0 .. len(path['states']) - 1``.
    policy : object
        Must provide ``gradient_log(action, state)``.
    discount : float
        Discount factor applied to the rewards.

    Returns
    -------
    The mean over trajectories (axis 0) of the per-trajectory estimates; it has
    the shape of one ``policy.gradient_log`` result.
    """
    per_path_estimates = []
    for path in paths:
        horizon = len(path['states'])
        # Sum of the score function along the trajectory (component-wise).
        score = np.sum(
            [policy.gradient_log(path['actions'][t], path['states'][t])
             for t in range(horizon)],
            0)
        # Discounted return of the whole trajectory.
        discounted_return = np.sum(
            [path['rewards'][t]*discount**t for t in range(horizon)])
        per_path_estimates.append(score*discounted_return)
    return np.mean(per_path_estimates, 0)

In [None]:
#####################################################
# Define the environment
# (the policy itself is created further down, next to the training loop)
#####################################################
# LQG1D comes from the project-local module lqg1d; 'random' presumably makes
# each episode start from a randomly drawn state -- TODO confirm in lqg1d.
env = lqg1d.LQG1D(initial_state_type='random')

In [None]:
#####################################################
# Experiment configuration
#####################################################
N = 1  # trajectories collected per iteration
T = 100  # upper bound on the number of time steps per trajectory
n_itr = 50000  # number of policy parameter updates
discount = 0.9  # discount factor of the problem
learning_rate = 1e-7  # learning rate used for the gradient update

In [None]:
policy = GaussianPolicy(-0.1, 0.5)

#####################################################
# define the update rule (stepper)
stepper = ConstantStep(learning_rate)
# AdamStep has no default learning rate: it must be passed explicitly
# (calling AdamStep() with no argument raises a TypeError).
# NOTE(review): Adam rescales the gradient, so each update moves theta by
# roughly `learning_rate`; the value tuned for ConstantStep may be far too
# small here -- confirm the intended Adam step size.
adam_stepper = AdamStep(learning_rate)
normalized_stepper = NormalizedConstantStep(learning_rate)


# REINFORCE training loop. At every iteration:
#  - collect N trajectories with the current policy
#  - estimate the gradient (REINFORCE)
#  - update the policy mean parameter with the Adam stepper
#  - record the average discounted return of the collected trajectories
#  - record the mean parameter reached at iteration k
mean_parameters = []
avg_return = []
all_theta = [policy.theta]
for i in tqdm(range(n_itr), desc="Simulating"):
    paths = utils.collect_episodes(env, policy=policy, horizon=T, n_episodes=N)
    d_theta, d_sigma = estimate_gradient(paths, policy, discount)
    # sigma remains constant: only theta is moved along its gradient component
    policy.update(policy.theta + adam_stepper.update(d_theta), policy.sigma)
    all_theta.append(policy.theta)
    # Keep the mean parameter of iteration k so the later plot of
    # `mean_parameters` is not empty.
    # TODO: subtract the optimal mean parameter here once it is available, to
    # obtain the distance to the optimum.
    mean_parameters.append(policy.theta)
    # Average discounted return over the trajectories just collected (these
    # were generated by the policy *before* this iteration's update).
    avg_return.append(np.mean([
        np.sum([path['rewards'][r]*(discount**r) for r in range(len(path['states']))])
        for path in paths
    ]))


In [None]:
# Trajectory of the policy mean parameter theta over training
# (index 0 is the initial value, before any update).
plt.figure(1)
plt.plot(all_theta)
plt.xlabel('iteration')
plt.ylabel('theta')
plt.title('Policy mean parameter during training')
plt.show()

In [None]:
# Trajectory of the policy mean parameter theta over training
# (index 0 is the initial value, before any update).
# NOTE(review): this cell draws the same `all_theta` curve as the cell right
# before it; it is only useful to refresh the figure after re-running training
# and can otherwise be removed.
plt.figure(1)
plt.plot(all_theta)
plt.xlabel('iteration')
plt.ylabel('theta')
plt.title('Policy mean parameter during training')
plt.show()

In [None]:
# Plot the average discounted return obtained by simulating the policy
# at each iteration of the algorithm (this is a rough estimate
# of the performance).
plt.figure()
plt.plot(avg_return)
plt.xlabel('iteration')
plt.ylabel('average discounted return')
plt.title('Estimated performance during training')
plt.show()

In [None]:
# Plot the mean-parameter series recorded at each iteration k.
# NOTE(review): what is drawn depends on what the training loop appends to
# `mean_parameters` (intended: distance between the optimal mean parameter and
# the one at iteration k) -- confirm the list is filled before relying on it.
plt.figure()
plt.plot(mean_parameters)
plt.xlabel('iteration k')
plt.ylabel('mean parameter')
plt.title('Mean parameter per iteration')
plt.show()