In [1]:
import os
path = os.environ['MyNN']
os.sys.path.append(path)
import MyNN
import gym
import numpy as np
import collections

In [2]:
class ACAgent:
    def __init__(self, state_size, action_size, gamma=0.95,learning_rate=0.01):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # discount rate
        self.learning_rate = learning_rate
        self.actor = self._build_model()
        self.critic = self._build_vf()
        self.scaler = MyNN.Scaler(state_size)
        self.replay_buff_x = None
        self.replay_buff_y = None


    def _build_model(self):
        model = MyNN.MyNN(self.state_size)
        model.add(24, 'Tanh')
        model.add(24, 'Tanh')
        model.add(1, 'Sigmoid')
        model.compile('Cross entropy', 'Adam')
        return model
    
    def _build_vf(self):
        model = MyNN.MyNN(self.state_size)
        model.add(24, 'Tanh')
        model.add(24, 'Tanh')
        model.add(1, 'ReLU')
        model.compile('MSE', 'Adam')
        return model
    
    def value_function_fit(self, x, y):
        if self.replay_buff_x is None:
            x_train, y_train = x, y
        else:
            x_train = np.hstack([x, self.replay_buff_x])
            y_train = np.hstack([y, self.replay_buff_y])
        self.replay_buff_x = x
        self.replay_buff_y = y
        self.critic.optimize(x_train, y_train, num_epochs=10, report_cost=False, batch_size=128)

    def act(self, state):
        result = self.actor.forward(state)[0][0]
        return 1 if result > np.random.random() else 0

In [3]:
def running_reward(rewards, gamma):
    result = []
    run_rew = 0
    for reward in  rewards[0,:][::-1]:
        run_rew = run_rew*gamma + reward
        result.append(run_rew)
    return np.array(result[::-1]).reshape(1,-1)

In [4]:
def play_game(agent, render=False):
    state = env.reset().reshape((agent.state_size,1))
    unscaled_states = []
    states = []
    actions = []
    rewards = []
    done = False
    mean, var = agent.scaler.get()
    for t in range(499):
        if render:
            env.render()
        unscaled_states.append(state)
        scaled_state = (state-mean)/var
        states.append(scaled_state)
        action = agent.act(scaled_state)
        actions.append(action)
        state, reward, done, _ = env.step(action)
        if done:
            reward = -10
        if t==498:
            reward = 20
        state = state.reshape((agent.state_size,1))
        rewards.append(reward)
        if done:
            break
    unscaled_states = np.hstack(unscaled_states)
    states = np.hstack(states)
    return {'unscaled': unscaled_states, 'states' : states,
            'actions': np.array(actions).reshape(1,-1),'rewards': np.array(rewards).reshape(1,-1),
           'time': t}

In [5]:
def play_n_games(agent, n=20):
    trajectories = []
    for i in range(n):
        trajectory = play_game(agent)
        trajectories.append(trajectory)
    return trajectories

In [6]:
def build_train_set(agent, trajectories):
    for traj in trajectories:
        traj['disc_sum_rew'] = running_reward(traj['rewards'], agent.gamma)
        traj['values'] = agent.actor.forward(traj['states'])
    X = np.hstack([t['states'] for t in trajectories])
    Y = np.hstack([t['actions'] for t in trajectories])
    disc_sum_rew = np.hstack([t['disc_sum_rew'] for t in trajectories])
    values = np.hstack([t['values'] for t in trajectories])
    weights = disc_sum_rew - values
    return X, Y, weights, disc_sum_rew

In [9]:
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
agent = ACAgent(state_size, 1)

In [10]:
%%time
for time in range(1, 10001):
    trajectories = play_n_games(agent, n=5)
    agent.scaler.update(np.hstack([t['unscaled'] for t in trajectories]))
    reward = np.mean([t['time'] for t in trajectories])
    print(time, reward)
    if reward == 498:
        break
    X_batch, Y_batch, weights, disc_sum_rew = build_train_set(agent, trajectories)
    agent.actor.optimize(X_batch, Y_batch, weights=weights,
                         lr=agent.learning_rate, num_epochs=1, report_cost=False)
    agent.value_function_fit(X_batch, disc_sum_rew)

1 62.8
2 45.6
3 63.2
4 106.0
5 121.4
6 214.0
7 382.6
8 228.8
9 304.6
10 323.4
11 182.0
12 236.4
13 256.2
14 235.2
15 253.6
16 178.0
17 232.6
18 176.4
19 173.8
20 188.0
21 184.4
22 248.4
23 216.4
24 264.6
25 229.8
26 168.4
27 196.2
28 199.2
29 173.4
30 222.8
31 199.6
32 214.0
33 278.0
34 386.2
35 279.4
36 280.4
37 302.0
38 358.0
39 321.2
40 349.6
41 289.0
42 389.4
43 357.6
44 331.0
45 341.4
46 398.4
47 398.6
48 383.2
49 397.2
50 411.8
51 374.2
52 434.2
53 451.0
54 417.0
55 481.0
56 469.8
57 485.4
58 488.4
59 496.8
60 498.0
CPU times: user 51.9 s, sys: 12.4 s, total: 1min 4s
Wall time: 49.6 s
