In [None]:
import numpy as np
import lqg1d
import matplotlib.pyplot as plt
import utils
from tqdm import tqdm

In [None]:
class ConstantStep(object):
    """Update rule that scales every gradient by one fixed learning rate."""

    def __init__(self, learning_rate):
        # Multiplier applied to each gradient handed to `update`.
        self.learning_rate = learning_rate

    def update(self, gt):
        """Return the parameter increment ``learning_rate * gt``.

        Args:
            gt: gradient estimate (anything that supports multiplication
                by the learning rate).
        """
        increment = self.learning_rate * gt
        return increment

In [None]:
class AdamStep(object):
    """Adam update rule for a scalar or array-valued parameter.

    Keeps exponential moving averages of the gradient (`m`) and of the
    squared gradient (`v`) together with the number of updates performed
    (`t`), which is needed for the bias correction of both averages.
    """

    def __init__(self, learning_rate):
        self.m = 0  # moving average of the gradients (first moment)
        self.v = 0  # moving average of the squared gradients (second moment)
        self.t = 0  # number of calls to `update` so far
        self.learning_rate = learning_rate

    def update(self, g, beta1=0.9, beta2=0.999, epsilon=1e-8, alpha=None):
        """Return the Adam increment for gradient `g` and advance the state.

        Args:
            g: current gradient estimate (scalar or numpy array).
            beta1: decay rate of the first-moment average.
            beta2: decay rate of the second-moment average.
            epsilon: small constant added to the denominator for stability.
            alpha: step size; when None (default) the `learning_rate` given
                at construction is used.

        Returns:
            ``alpha * m_hat / (sqrt(v_hat) + epsilon)`` where `m_hat` and
            `v_hat` are the bias-corrected moment estimates.
        """
        if alpha is None:
            alpha = self.learning_rate

        self.t += 1
        self.m = beta1*self.m + (1-beta1)*g
        self.v = beta2*self.v + (1-beta2)*np.multiply(g, g)

        # Both averages start at 0 and are therefore biased towards 0; the
        # correction factor depends on the step count and on the decay rate
        # of the average being corrected (beta1 for m, beta2 for v).
        m_hat = self.m / (1 - beta1**self.t)
        v_hat = self.v / (1 - beta2**self.t)

        return alpha * m_hat / (np.sqrt(v_hat) + epsilon)

In [None]:
class GaussianPolicy:
    """Gaussian policy: actions are drawn from N(s * theta, sigma**2)."""

    def __init__(self, theta, sigma):
        self.theta, self.sigma = theta, sigma

    def draw_action(self, s):
        """Sample an action for state `s` from the current Gaussian."""
        mean = s*self.theta
        return np.random.normal(loc=mean, scale=self.sigma)

    def gradient_log(self, a, s):
        """Gradient of the log-density of action `a` in state `s`.

        Returns:
            ``np.array([d_theta, d_sigma])``, the partial derivatives of
            ``log N(a; s * theta, sigma**2)`` with respect to theta and sigma.
        """
        residual = a - s*self.theta
        variance = self.sigma**2
        d_theta = residual/variance*s
        d_sigma = (residual**2 - variance)/(self.sigma**3)
        return np.array([d_theta, d_sigma])

    def update(self, theta, sigma):
        """Overwrite both policy parameters."""
        self.theta, self.sigma = theta, sigma

In [None]:
def estimate_gradient(paths, policy, discount):
    """REINFORCE estimate of the policy gradient from sampled trajectories.

    For every trajectory the discounted return
    ``R = sum_i discount**i * rewards[i]`` is multiplied by the sum over time
    steps of ``policy.gradient_log(action, state)``; the result is averaged
    over all trajectories in `paths`.

    Args:
        paths: non-empty sequence of trajectories; each one is a mapping with
            the keys "states", "actions" and "rewards" (sequences indexed by
            time step).
        policy: object exposing ``gradient_log(a, s)`` that returns the pair
            (d_theta, d_sigma) for one action/state pair.
        discount: discount factor applied to the rewards.

    Returns:
        Tuple ``(d_theta, d_sigma)`` with the gradient estimate for each of
        the two policy parameters.

    Raises:
        ValueError: if `paths` is empty (the average is undefined).
    """
    # Use the batch that was actually passed in, not a notebook-level
    # trajectory count.
    if len(paths) == 0:
        raise ValueError("paths must contain at least one trajectory")

    per_path = []
    for path in paths:
        # The return of a trajectory does not depend on the time step, so it
        # is computed once per path instead of once per step.
        discounted_return = np.sum(
            [r * discount**i for i, r in enumerate(path["rewards"])]
        )

        # Sum of the score function over the trajectory, per parameter.
        # np.sum collapses a possible trailing length-1 axis of the
        # per-step values so that the estimates stay scalar.
        score = np.zeros(2)
        for a, s in zip(path["actions"], path["states"]):
            g_theta, g_sigma = policy.gradient_log(a, s)
            score[0] += np.sum(g_theta)
            score[1] += np.sum(g_sigma)

        per_path.append(discounted_return * score)

    d_theta, d_sigma = np.mean(per_path, axis=0)
    return d_theta, d_sigma
    

In [None]:
#####################################################
# Define the environment (the policy is created in a
# later cell, right before the training loop)
#####################################################
# 1-D LQG environment from the local `lqg1d` module; the keyword requests
# a random initial state.
env = lqg1d.LQG1D(initial_state_type='random')

In [None]:
#####################################################
# Experiment parameters
#####################################################
# Number of trajectories collected per iteration
# (passed as `n_episodes` to utils.collect_episodes)
N = 100
# Each trajectory will have at most T time steps
# (passed as `horizon` to utils.collect_episodes)
T = 100
# Number of policy parameter updates (training iterations)
n_itr = 100
# Discount factor of the problem
discount = 0.9
# Learning rate handed to the steppers that turn a gradient into an update
learning_rate = 0.1

In [None]:
# Initial policy: mean parameter theta = -1, standard deviation sigma = 0.5
policy = GaussianPolicy(-1, 0.5)

#####################################################
# Define the update rules (steppers)
# `adam_stepper` drives the training loop below; `stepper` is kept for the
# manual constant-step experiments in later cells.
stepper = ConstantStep(learning_rate)
adam_stepper = AdamStep(learning_rate)

# Per-iteration records:
#  - avg_return: average discounted return of the sampled trajectories
#  - all_theta: mean parameter after each update
#  - mean_parameters: distance between the optimal mean parameter and the
#    one at iteration k.
#    NOTE(review): the optimal parameter is not available in the visible
#    code, so this list is still left empty -- TODO fill it once known.
mean_parameters = []
avg_return = []
all_theta = []
for _ in tqdm(range(n_itr), desc="Simulating"):
    paths = utils.collect_episodes(env, policy=policy, horizon=T, n_episodes=N)

    # Rough performance estimate of the current policy: mean discounted
    # return over the batch that is also used for the gradient.
    avg_return.append(np.mean([
        np.sum([r * discount**i for i, r in enumerate(path["rewards"])])
        for path in paths
    ]))

    # REINFORCE gradient estimate
    d_theta, d_sigma = estimate_gradient(paths, policy, discount)
    print(policy.theta, d_theta, d_sigma)

    # Gradient ascent on theta only; sigma remains constant
    policy.update(policy.theta+adam_stepper.update(d_theta), policy.sigma)
    all_theta.append(policy.theta)


In [None]:
# Evolution of the policy mean parameter recorded after every update
plt.figure(1)
plt.plot(all_theta)
plt.xlabel("iteration")
plt.ylabel("theta")
plt.title("Policy mean parameter per iteration")
plt.show()

In [None]:
# Show the current policy parameters (mean parameter and standard deviation)
print(policy.theta, policy.sigma)

In [None]:
# Manual check: sample a fresh batch with the current policy and compute
# the gradient estimate once, outside the training loop.
paths = utils.collect_episodes(env, policy=policy, horizon=T, n_episodes=N)
d_theta, d_sigma = estimate_gradient(paths, policy, discount)

print(d_theta, d_sigma)

In [None]:
# Apply one constant-step update to theta by hand using the current
# d_theta; sigma is passed through unchanged.
policy.update(policy.theta+stepper.update(d_theta), policy.sigma)
print(policy.theta, policy.sigma)

In [None]:
# Display the increment the constant stepper produces for the current d_theta
stepper.update(d_theta)

In [None]:
# Display the most recently computed gradient estimate for theta
d_theta

In [None]:
# Gradient estimate for a policy with theta = -0.69 and sigma = 0.5.
# NOTE(review): the name suggests -0.69 is taken as the (near-)optimal mean
# parameter -- TODO confirm where this value comes from.
policy_best = GaussianPolicy(-0.69, 0.5)
paths = utils.collect_episodes(env, policy=policy_best, horizon=T, n_episodes=N)
d_theta, d_sigma = estimate_gradient(paths, policy_best, discount)
print(d_theta, d_sigma)

In [None]:
# Two batches for comparison: `paths` from the current policy and
# `paths_n` from a policy with theta = -0.69, sigma = 0.5.
paths = utils.collect_episodes(env, policy=policy, horizon=T, n_episodes=N)
paths_n = utils.collect_episodes(env, policy=GaussianPolicy(-0.69,0.5), horizon=T, n_episodes=N)

In [None]:
# Average discounted return over the trajectories in `paths_n`
np.mean([sum([path["rewards"][i]*discount**i for i in range(len(path["rewards"]))]) for path in paths_n])

In [None]:
# Plot the average return obtained by simulating the policy
# at each iteration of the algorithm (this is a rough estimate
# of the performance).
# `avg_return` has to be filled by the training loop; otherwise the
# figure is empty.
plt.figure()
plt.plot(avg_return)
plt.xlabel("iteration")
plt.ylabel("average discounted return")
plt.title("Average return per iteration")
plt.show()

In [None]:
# Plot the distance between the optimal mean parameter and the mean
# parameter at iteration k.
# `mean_parameters` has to be filled by the training loop; otherwise the
# figure is empty.
plt.figure()
plt.plot(mean_parameters)
plt.xlabel("iteration")
plt.ylabel("distance to optimal mean parameter")
plt.title("Mean parameter distance per iteration")
plt.show()

In [None]:
# Rebind `policy` to a new Gaussian policy with theta = 1, sigma = 0.5
# (the previously bound policy object is no longer reachable via this name)
policy = GaussianPolicy(1, 0.5)


In [None]:
# Evaluate the gradient estimator on the existing `paths` batch using a
# policy with theta = 1, sigma = 0.5
estimate_gradient(paths, GaussianPolicy(1, 0.5), discount)

In [None]:
# Sample one action from the current policy for a hard-coded state value
policy.draw_action(-498.98726007)

In [None]:
# Display the current mean parameter
policy.theta

In [None]:
# Display the current standard deviation
policy.sigma

In [None]:
# Collect a new batch of trajectories with the current policy
paths = utils.collect_episodes(env, policy=policy, horizon=T, n_episodes=N)


In [None]:
# Inspect the state stored at index 30 of the first collected trajectory
paths[0]["states"][30]

In [None]:
# Hand evaluation of s * (a - s*theta) / sigma**2, the d_theta formula used
# by GaussianPolicy.gradient_log, for s = -500, a = -499.91451651,
# theta = 1, sigma = 0.5
-500*(-499.91451651 + 500)/(0.5**2)

In [None]:
# Log-density gradient of the current policy for action -499.91 in state -500
policy.gradient_log(-499.91, -500)

In [None]:
# Log-density gradient of the current policy for action -500.90692724 in
# state -500.0
policy.gradient_log(-500.90692724,-500.)