In [2]:
import os
path = os.environ['MyNN']
os.sys.path.append(path)
import MyNN
import gym
import numpy as np

In [155]:
class ACAgent:
    """Actor-critic agent with a Gaussian policy and a learned value function.

    The actor network outputs the mean of each action dimension; the
    per-dimension variances are kept outside the network (``self.variances``)
    and are adapted by :meth:`update_variances`. The critic is a separate
    network trained with an MSE loss by :meth:`value_function_fit`.
    """

    def __init__(self, state_size, action_size, variances, gamma=0.95, lam=0.98,
                 actor_lr=0.001, critic_lr=0.001, method='PPO_CEM', w=1):
        """Build the actor/critic networks and initialise the policy variances.

        Args:
            state_size: Number of observation features (network input size).
            action_size: Number of action dimensions (actor output size).
            variances: Initial per-action variances; must be positive because
                their logarithm is taken.
            gamma: Reward discount rate.
            lam: GAE discount rate.
            actor_lr: Learning rate assigned to the actor network; also used as
                the step size in :meth:`update_variances`.
            critic_lr: Learning rate assigned to the critic network.
            method: Loss/method name passed to the actor's ``compile`` call.
            w: Second argument forwarded to ``MyNN.Scaler`` (meaning is defined
                by that class, which is not visible here).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # discount rate
        self.lam = lam  # GAE disc rate
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.method = method
        self.actor = self._build_model()
        self.critic = self._build_vf()
        self.actor.lr = actor_lr
        self.critic.lr = critic_lr
        self.scaler = MyNN.Scaler(state_size, w)
        self.variances = variances
        self.logvars = np.log(self.variances)
        # Diagonal covariance matrix used when sampling actions.
        self.var_cov = np.diagflat(variances)
        # Previous batch handed to value_function_fit (None until first call).
        self.replay_buff_x = None
        self.replay_buff_y = None

    def _build_model(self):
        """Create the actor: three Tanh hidden layers and a linear output of
        ``action_size`` units, compiled with ``self.method`` and Adam."""
        model = MyNN.MyNN(self.state_size)
        model.add(self.state_size*10, 'Tanh')
        model.add(int(np.sqrt(self.action_size*self.state_size)*10), 'Tanh')
        model.add(self.action_size*10, 'Tanh')
        model.add(self.action_size, 'Linear')
        model.compile(self.method, 'Adam')
        return model

    def _build_vf(self):
        """Create the critic: three Tanh hidden layers and a single linear
        output unit, compiled with an MSE loss and Adam."""
        model = MyNN.MyNN(self.state_size)
        model.add(self.state_size*10, 'Tanh')
        model.add(int(np.sqrt(self.action_size*50)), 'Tanh')
        model.add(5, 'Tanh')
        model.add(1, 'Linear')
        model.compile('MSE', 'Adam')
        return model

    def value_function_fit(self, x, y):
        """Train the critic on the current batch plus the previous one.

        Args:
            x: Critic inputs, one sample per column.
            y: Regression targets, one sample per column.

        The batch from the previous call (if any) is appended column-wise to
        the current one for training; afterwards only the current batch is
        remembered for the next call.
        """
        if self.replay_buff_x is None:
            x_train, y_train = x, y
        else:
            x_train = np.hstack([x, self.replay_buff_x])
            y_train = np.hstack([y, self.replay_buff_y])
        self.replay_buff_x = x
        self.replay_buff_y = y
        self.critic.optimize(x_train, y_train, num_epochs=10,
                             report_cost=False, batch_size=128,
                             lr=self.critic_lr)

    def get_means(self, state):
        """Return the actor's forward-pass output for ``state`` (the action
        means), with caching disabled."""
        return self.actor.forward(state, caching='no')

    def update_variances(self, actions, means, adv):
        """Take one advantage-weighted step on the log-variances.

        Args:
            actions: Sampled actions, shape ``(action_size, n)``.
            means: Policy means the actions were sampled around, same shape.
            adv: Advantage per sample, broadcastable against ``actions``.

        Returns:
            The ``(action_size, 1)`` gradient term that was applied.

        Side effects:
            Updates ``self.logvars``, ``self.variances`` and ``self.var_cov``.
        """
        var = self.variances.reshape(self.action_size, 1)
        n = means.shape[1]
        # ((a - mu)^2 / var - 1) / (2 var) is the derivative of the Gaussian
        # log-density with respect to the variance.
        grads = np.square(actions-means)/var
        grads = (grads-1)/(2*var)
        # Average over samples, weighted by advantage.
        grads = (1/n)*grads*adv
        grads = np.sum(grads, axis=1, keepdims=True)*2
        # Use this instance's hyper-parameters rather than a module-level
        # `agent`, so the method works for any ACAgent instance.
        step = self.actor_lr*2*self.action_size
        logvars = self.logvars.reshape(self.action_size, 1) + step*grads
        self.logvars = logvars.reshape(1, -1)
        self.variances = np.exp(self.logvars)
        self.var_cov = np.diagflat(self.variances)
        return grads

In [4]:
def running_reward(rewards, gamma):
    """Compute the discounted reward-to-go for every step of one episode.

    Args:
        rewards: Array of shape ``(1, T)``; only row 0 is read.
        gamma: Discount factor applied per step.

    Returns:
        Array of shape ``(1, T)`` where entry ``t`` equals
        ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.
    """
    accumulated = 0
    discounted = []
    # Walk the episode backwards so each step reuses the following step's sum.
    for step_reward in reversed(rewards[0, :]):
        accumulated = accumulated*gamma + step_reward
        discounted.append(accumulated)
    discounted.reverse()
    return np.array(discounted).reshape(1, -1)

def add_gae(traj, gamma, lam):
    """Compute generalized advantage estimates for one trajectory.

    Args:
        traj: Dict with ``'rewards'`` and ``'values'`` arrays of shape
            ``(1, T)``. A ``'tds'`` entry (the temporal-difference residuals)
            is written into it as a side effect.
        gamma: Reward discount factor.
        lam: GAE discount factor.

    Returns:
        Array of shape ``(1, T)``: the TD residuals discounted with
        ``gamma * lam`` via ``running_reward``.
    """
    rewards = traj['rewards']
    values = traj['values']
    # Value of the next state, discounted with the `gamma` argument (not a
    # module-level agent); the step after the last one is given value 0.
    next_values = np.append(values[0, 1:] * gamma, 0).reshape(1, -1)
    traj['tds'] = rewards - values + next_values
    return running_reward(traj['tds'], gamma*lam)

In [60]:
def play_game(agent, render=False):
    """Play one episode in the module-level ``env`` using ``agent``'s policy.

    Each observation is normalised with the pair returned by
    ``agent.scaler.get()`` as ``(obs - first) / second``, fed to the actor to
    get the action means, and an action is sampled from a multivariate normal
    with covariance ``agent.var_cov``.

    Args:
        agent: Object providing ``state_size``, ``action_size``, ``scaler``,
            ``get_means`` and ``var_cov``.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        Dict with column-stacked arrays ``'unscaled'``, ``'states'``,
        ``'means'``, ``'actions'`` and a ``(1, T)`` array ``'rewards'``.
    """
    column_shape = (agent.state_size, 1)
    observation = env.reset().reshape(column_shape)
    history = {'unscaled': [], 'states': [], 'means': [], 'actions': [], 'rewards': []}
    offset, scale = agent.scaler.get()
    while True:
        if render:
            env.render()
        history['unscaled'].append(observation)
        normalized = (observation-offset)/scale
        history['states'].append(normalized)
        policy_mean = agent.get_means(normalized)
        history['means'].append(policy_mean)
        sampled = np.random.multivariate_normal(policy_mean[:, 0], agent.var_cov)
        history['actions'].append(sampled.reshape(agent.action_size, 1))
        observation, reward, finished, _ = env.step(sampled)
        observation = observation.reshape(column_shape)
        history['rewards'].append(reward)
        if finished:
            break
    episode = {key: np.hstack(history[key])
               for key in ('unscaled', 'states', 'means', 'actions')}
    episode['rewards'] = np.array(history['rewards']).reshape(1, -1)
    return episode

In [6]:
def play_n_games(agent, n=20):
    """Play ``n`` episodes with ``play_game`` and return their trajectory
    dicts as a list, in the order they were played."""
    return [play_game(agent) for _ in range(n)]

In [7]:
def build_train_set(agent, trajectories):
    """Turn a list of trajectories into column-stacked training arrays.

    Every trajectory dict is augmented in place with ``'disc_sum_rew'``
    (discounted reward-to-go), ``'values'`` (critic predictions for its
    states) and ``'GAE'`` (advantage estimates).

    Returns:
        Tuple ``(X, Y, actions, adv, disc_sum_rew)`` where ``X`` are the
        states, ``Y`` the recorded action means, and ``adv`` the advantages
        standardised over the whole batch (zero mean, unit std up to a 1e-6
        stabiliser).
    """
    for episode in trajectories:
        episode['disc_sum_rew'] = running_reward(episode['rewards'], agent.gamma)
        episode['values'] = agent.critic.forward(episode['states'])
        episode['GAE'] = add_gae(episode, agent.gamma, agent.lam)

    def stack(key):
        # Concatenate one field of every trajectory along the sample axis.
        return np.hstack([episode[key] for episode in trajectories])

    X = stack('states')
    Y = stack('means')
    actions = stack('actions')
    disc_sum_rew = stack('disc_sum_rew')
    raw_adv = stack('GAE')
    adv = (raw_adv - raw_adv.mean())/(raw_adv.std() + 1e-6)
    return X, Y, actions, adv, disc_sum_rew

In [156]:
# Create the Gym environment; `env` is read as a global by play_game.
env = gym.make('InvertedDoublePendulum-v1')
# Network sizes come from the environment's observation/action vector lengths.
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
# Despite the name, this is the width of the action range (high - low);
# one tenth of it is passed below as the initial policy variance.
variances = env.action_space.high - env.action_space.low
# `agent` is also read as a global by other cells in this notebook.
agent = ACAgent(state_size, action_size, actor_lr=1e-3,
                critic_lr=1e-2, gamma=0.95, lam=0.98, method='Cont',
               variances= variances/10, w=1)

In [157]:
# Training loop: 1000 iterations of collect -> fit critic -> update actor ->
# update policy variances.
# NOTE(review): the loop variable `time` would shadow the stdlib module of the
# same name if it were imported later in this notebook.
for time in range(1, 1001):
    # Collect a fresh batch of 10 episodes with the current policy.
    trajectories = play_n_games(agent, n=10)
    # Feed the raw (unscaled) observations to the scaler; play_game reads the
    # scaler's statistics at the start of each episode.
    agent.scaler.update(np.hstack([t['unscaled'] for t in trajectories]))
    # Batch mean of (episode return, per-step reward variance); only the
    # first component is printed below.
    score = np.mean([(np.sum(t['rewards']), np.var(t['rewards'])) for t in trajectories], axis=0)
    X_batch, Y_batch, actions, adv, disc_sum_rew = build_train_set(agent, trajectories)
    # The critic regresses on the discounted reward-to-go.
    agent.value_function_fit(X_batch, disc_sum_rew)
    # Forward pass before backward; `A` itself is not used again in this cell
    # (presumably the call is needed to populate the actor's cache — confirm).
    A = agent.actor.forward(X_batch)
    agent.actor.cache['A0'] = X_batch
    # NOTE(review): `_` is not assigned anywhere in this cell. In IPython it
    # refers to the most recent cell output, so the value passed here depends
    # on kernel history. Confirm what `backward` expects as its first
    # argument (possibly `A`) and pass it explicitly.
    agent.actor.backward(_, Y_batch, adv, actions=actions,
                         variances=agent.variances.reshape(agent.action_size, 1))
    agent.actor.number_of_updates +=1
    agent.actor.update_parameters()
    # Adapt the policy variances; the returned gradient is not used here.
    grads = agent.update_variances(actions, Y_batch, adv)
    # Progress line: iteration, mean episode return, current variances.
    print(time, score[0], agent.variances)

1 63.72125110688908 [[0.1997985]]
2 69.49132974980752 [[0.1996264]]
3 63.74233279498636 [[0.19929372]]
4 59.990079677838516 [[0.19920917]]
5 62.75106428879417 [[0.19949202]]
6 61.95746313742403 [[0.19936592]]
7 70.27353861951676 [[0.19956836]]
8 78.65183929628287 [[0.1993013]]
9 78.67595516701147 [[0.19907172]]
10 63.74143387857962 [[0.1986097]]
11 94.60521398160874 [[0.19843281]]
12 81.43049733617207 [[0.19769285]]
13 85.19635038167766 [[0.19781804]]
14 82.5468732871815 [[0.19801928]]
15 83.51462989844723 [[0.19807567]]
16 90.81700506178359 [[0.19750229]]
17 93.63477675955025 [[0.19712038]]
18 117.7141403781508 [[0.19686179]]
19 96.2993390782856 [[0.19668522]]
20 99.02504679638841 [[0.19656202]]
21 118.48942782215867 [[0.19633152]]
22 99.05443018829274 [[0.19642373]]
23 116.76921143804232 [[0.19636074]]
24 110.42352820234191 [[0.19576544]]
25 112.97440554903758 [[0.19596221]]
26 130.55841753228472 [[0.19604225]]
27 99.99339158424247 [[0.19596043]]
28 136.89008477388893 [[0.19589796]]


In [165]:
# Play a single episode with rendering enabled and keep its trajectory dict.
traj = play_game(agent, render=True)