In [1]:
import os
path = os.environ['MyNN']
os.sys.path.append(path)
import MyNN
import gym
import numpy as np
import collections

In [14]:
class ACAgent:
    """Actor-critic agent: a softmax policy network plus a scalar value network.

    Both networks are built with the project-local ``MyNN`` package. The agent
    also owns a ``MyNN.Scaler`` for the observations and a one-batch replay
    buffer used when fitting the critic.
    """

    def __init__(self, state_size, action_size, gamma=0.95, lam=0.98, actor_lr=0.001, critic_lr=0.001):
        """Store the hyper-parameters and build the actor and critic networks.

        Args:
            state_size: input width of both networks.
            action_size: number of discrete actions (width of the actor output).
            gamma: discount rate.
            lam: GAE discount rate.
            actor_lr: learning rate assigned to ``self.actor.lr``.
            critic_lr: learning rate assigned to ``self.critic.lr`` and passed
                to ``critic.optimize`` in ``value_function_fit``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # discount rate
        self.lam = lam # GAE disc rate
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor = self._build_model()
        self.critic = self._build_vf()
        self.actor.lr = actor_lr
        self.critic.lr = critic_lr
        self.scaler = MyNN.Scaler(state_size)
        # Previous critic training batch; None until the first fit.
        self.replay_buff_x = None
        self.replay_buff_y = None

    def _build_model(self):
        """Build the actor: two 24-unit Tanh layers and a softmax output layer."""
        model = MyNN.MyNN(self.state_size)
        model.add(24, 'Tanh')
        model.add(24, 'Tanh')
        # One output per action. This used to be hard-coded to 2, which only
        # matched environments with exactly two actions.
        model.add(self.action_size, 'Softmax')
        model.compile('TRPO', 'Adam')
        return model

    def _build_vf(self):
        """Build the critic: two 24-unit Tanh layers and a single linear output."""
        model = MyNN.MyNN(self.state_size)
        model.add(24, 'Tanh')
        model.add(24, 'Tanh')
        model.add(1, 'Linear')
        model.compile('MSE', 'Adam')
        return model

    def value_function_fit(self, x, y):
        """Fit the critic on the current batch plus the previously seen batch.

        The new batch is stacked (``np.hstack``) in front of the batch stored by
        the previous call, if any. The buffer is then overwritten with the new
        batch only, so it never holds more than one batch.

        Args:
            x: critic inputs for the current batch.
            y: critic regression targets for the current batch.
        """
        if self.replay_buff_x is None:
            x_train, y_train = x, y
        else:
            x_train = np.hstack([x, self.replay_buff_x])
            y_train = np.hstack([y, self.replay_buff_y])
        self.replay_buff_x = x
        self.replay_buff_y = y
        self.critic.optimize(x_train, y_train, num_epochs=10,
                             report_cost=False, batch_size=128,
                             lr=self.critic_lr)

    def act(self, state):
        """Return the actor's forward-pass output for ``state`` (called with ``caching='no'``)."""
        result = self.actor.forward(state, caching='no')
        return result

In [35]:
def running_reward(rewards, gamma):
    """Discounted reward-to-go for a single trajectory.

    Reads the first row of ``rewards`` and, walking from the last step to the
    first, computes ``out[t] = rewards[0, t] + gamma * out[t + 1]``.

    Args:
        rewards: 2-D array; only row 0 is read.
        gamma: discount factor applied per step.

    Returns:
        Array of shape ``(1, T)`` holding the discounted sums in time order.
    """
    row = rewards[0]
    n_steps = len(row)
    discounted = [None] * n_steps
    accumulated = 0
    # Fill from the back so each entry already contains its discounted future.
    for idx in range(n_steps - 1, -1, -1):
        accumulated = accumulated * gamma + row[idx]
        discounted[idx] = accumulated
    return np.array(discounted).reshape(1, -1)

def encode(actions, action_size):
    """One-hot encode a row of action indices, one column per time step.

    Args:
        actions: integer array of shape ``(1, T)`` with the chosen action per step.
        action_size: number of rows in the output.

    Returns:
        Float array of shape ``(action_size, T)`` with a 1 at
        ``[actions[0, t], t]`` and 0 elsewhere.
    """
    n_steps = actions.shape[1]
    one_hot = np.zeros((action_size, n_steps))
    step_index = np.arange(n_steps)
    one_hot[actions, step_index] = 1
    return one_hot

def add_gae(traj, gamma, lam):
    """Compute generalized advantage estimates for one trajectory.

    Rewards are scaled by ``(1 - gamma)`` and the TD residuals are
    ``delta[t] = r[t] - V[t] + gamma * V[t + 1]``, with the value after the last
    step taken as 0. The residuals are stored in ``traj['tds']`` (the dict is
    mutated) and then accumulated backwards with factor ``gamma * lam``.

    Args:
        traj: dict with ``'rewards'`` and ``'values'`` arrays of shape ``(1, T)``.
        gamma: discount rate.
        lam: GAE discount rate.

    Returns:
        Array of shape ``(1, T)`` with the advantage estimates.
    """
    rewards = traj['rewards'] * (1 - gamma)
    values = traj['values']
    # Bootstrap with the *argument* gamma. The previous code read the global
    # ``agent.gamma`` here, which ignored the parameter and raised NameError
    # when no global ``agent`` existed.
    next_values = np.append(values[0, 1:] * gamma, 0).reshape(1, -1)
    traj['tds'] = rewards - values + next_values
    gae = running_reward(traj['tds'], gamma * lam)
    return gae

In [4]:
def play_game(agent, render=False):
    """Play one episode (at most 499 steps) in the global ``env`` with ``agent``.

    Each observation is reshaped to a column, shifted and divided by the pair
    returned by ``agent.scaler.get()``, and fed to ``agent.act``; the action is
    sampled from the first column of the returned probabilities. The reward is
    replaced by -10 on a step where the environment reports ``done`` and by 20
    on step index 498.

    Args:
        agent: object exposing ``state_size``, ``action_size``, ``scaler`` and ``act``.
        render: when True, ``env.render()`` is called before every step.

    Returns:
        dict with column-stacked ``'unscaled'``, ``'states'`` and ``'probs'``,
        ``(1, T)`` arrays ``'actions'`` and ``'rewards'``, and ``'time'``, the
        index of the last step played.
    """
    column_shape = (agent.state_size, 1)
    state = env.reset().reshape(column_shape)
    mean, var = agent.scaler.get()
    raw_obs, scaled_obs, policy_out, chosen, collected = [], [], [], [], []
    for t in range(499):
        if render:
            env.render()
        raw_obs.append(state)
        net_input = (state - mean) / var
        scaled_obs.append(net_input)
        prob = agent.act(net_input)
        policy_out.append(prob)
        action = np.random.choice(range(agent.action_size), p=prob[:, 0])
        chosen.append(action)
        next_state, reward, done, _ = env.step(action)
        state = next_state.reshape(column_shape)
        # Reward shaping: the step-498 bonus takes precedence over the penalty.
        if t == 498:
            reward = 20
        elif done:
            reward = -10
        collected.append(reward)
        if done:
            break
    return {'unscaled': np.hstack(raw_obs), 'states': np.hstack(scaled_obs),
            'probs': np.hstack(policy_out), 'actions': np.array(chosen).reshape(1, -1),
            'rewards': np.array(collected).reshape(1, -1), 'time': t}

In [5]:
def play_n_games(agent, n=20):
    """Play ``n`` episodes with ``agent`` and return their trajectory dicts as a list."""
    return [play_game(agent) for _ in range(n)]

In [38]:
def build_train_set(agent, trajectories):
    """Turn a list of trajectories into actor and critic training arrays.

    Each trajectory dict is augmented in place with ``'disc_sum_rew'``
    (discounted sums of the ``(1 - gamma)``-scaled rewards), ``'values'``
    (critic outputs for its states) and ``'GAE'`` (advantage estimates).

    Returns:
        ``(X, Y, adv, disc_sum_rew)``, all column-stacked over trajectories:
        the scaled states, the action probabilities recorded while playing, the
        batch-normalised advantages multiplied into the one-hot encoding of the
        actions taken, and the discounted reward sums.
    """
    for episode in trajectories:
        scaled_rewards = episode['rewards'] * (1 - agent.gamma)
        episode['disc_sum_rew'] = running_reward(scaled_rewards, agent.gamma)
        episode['values'] = agent.critic.forward(episode['states'])
        episode['GAE'] = add_gae(episode, agent.gamma, agent.lam)

    def stack(key):
        # Concatenate one per-trajectory field along the time axis.
        return np.hstack([episode[key] for episode in trajectories])

    X = stack('states')
    Y = stack('probs')
    disc_sum_rew = stack('disc_sum_rew')
    raw_adv = stack('GAE')
    # Normalise over the whole batch; the epsilon guards a zero std.
    normalised = (raw_adv - raw_adv.mean()) / (raw_adv.std() + 1e-6)
    one_hot = np.hstack([encode(episode['actions'], agent.action_size)
                         for episode in trajectories])
    return X, Y, one_hot * normalised, disc_sum_rew

In [79]:
# Environment, agent and training-state set-up. `env` is read as a global by
# play_game; `l`, `DKL_targ` and `beta` are read and updated by the training loop.
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = ACAgent(state_size, action_size, actor_lr=0.001, critic_lr=0.002)
# Count of consecutive training iterations whose mean episode 'time' equals 498.
l=0
# KL target; the training loop compares the measured KL against multiples of it.
DKL_targ = 0.001
# Coefficient passed to actor.backward; the training loop adapts it within [1/35, 35].
beta = 1

In [80]:
%%time
# Training loop: up to 250 iterations of 5 episodes each. Stops early once the
# mean episode 'time' has been 498 (the last step index play_game can reach)
# for 10 consecutive iterations.
for time in range(1, 251):
    trajectories = play_n_games(agent, n=5)
    agent.scaler.update(np.hstack([t['unscaled'] for t in trajectories]))
    # (mean, variance) of the last step index over this iteration's episodes.
    reward = (np.mean([t['time'] for t in trajectories]), np.var([t['time'] for t in trajectories]))
    X_batch, Y_batch, adv, disc_sum_rew = build_train_set(agent, trajectories)
    agent.value_function_fit(X_batch, disc_sum_rew)
    # Reset every iteration so the log line below never hits an undefined name
    # or reports a stale KL when the policy update is skipped.
    DKL = float('nan')
    if reward[0] != 498:
        for i in range(10):
            Z = agent.actor.forward(X_batch)
            # Mean KL divergence between the sampling policy (Y_batch) and the
            # current policy (Z) over the batch.
            DKL = np.sum(Y_batch*np.log(np.divide(Y_batch,Z)))/Y_batch.shape[1]
            if DKL > DKL_targ*4:
                # The policy moved too far from the sampling policy; stop updating.
                print('Beta is too small')
                break
            # NOTE(review): presumably backward() reads the network input from
            # cache['A0'] - confirm against MyNN.
            agent.actor.cache['A0'] = X_batch
            agent.actor.backward(Z, Y_batch, adv, beta)
            agent.actor.number_of_updates +=1
            agent.actor.update_parameters()
        # Adapt beta from the last measured KL, keeping it within [1/35, 35].
        if DKL > DKL_targ*2:
            beta = np.minimum(35, beta*1.5)
            print('Increasing beta')
        elif DKL < DKL_targ*0.5:
            beta = np.maximum(1/35, beta/1.5)
            print('Decreasing beta')
        else:
            print('Beta remains the same')
    print(time, reward, DKL, beta)
    if reward[0] == 498:
        l+=1
    else:
        l=0
    if l==10:
        break

Beta remains the same
1 (41.6, 231.04000000000002) 0.0018769556959350646 1
Beta remains the same
2 (35.6, 130.64) 0.0005322705802646676 1
Beta remains the same
3 (44.6, 497.03999999999996) 0.0016935434311108235 1
Beta remains the same
4 (58.0, 591.6) 0.0018767329093691899 1
Beta is too small
Increasing beta
5 (52.6, 373.04) 0.004523015159457567 1.5
Beta is too small
Increasing beta
6 (64.2, 537.76) 0.004658483287029868 2.25
Beta remains the same
7 (85.4, 1304.2400000000002) 0.0015762443096416121 2.25
Beta remains the same
8 (86.0, 450.4) 0.0010380896505946485 2.25
Increasing beta
9 (121.4, 1170.6399999999999) 0.0022752077497171434 3.375
Decreasing beta
10 (242.0, 5538.8) 0.00019924207944218102 2.25
Decreasing beta
11 (198.4, 3267.4399999999996) 0.00024000737192406856 1.5
Decreasing beta
12 (211.8, 3066.16) 0.00028888648804013703 1.0
Beta remains the same
13 (240.0, 4854.8) 0.001295696189360415 1.0
Beta is too small
Increasing beta
14 (237.8, 7688.160000000002) 0.0046102644400073944 1.5

In [81]:
%%time
# Evaluation: play 100 episodes with the trained agent and print the average
# of their 'time' entries (index of the last step played, at most 498).
count = 0
for i in range(100):
    traj = play_game(agent)
    count += traj['time']
print(count/100)

490.18
CPU times: user 15 s, sys: 45.7 ms, total: 15 s
Wall time: 15 s


In [82]:
# Play one more episode with rendering enabled (env.render() is called before every step).
traj = play_game(agent, render=True)