In [1]:
import os
path = os.environ['MyNN']
os.sys.path.append(path)
import MyNN
import gym
import numpy as np
import collections

In [2]:
class ACAgent:
    def __init__(self, state_size, action_size, gamma=0.95,learning_rate=0.01):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # discount rate
        self.learning_rate = learning_rate
        self.actor = self._build_model()
        self.critic = self._build_vf()
        self.scaler = MyNN.Scaler(state_size)
        self.replay_buff_x = None
        self.replay_buff_y = None


    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = MyNN.MyNN(128)
        model.add(50, 'Tanh')
        model.add(20, 'Tanh')
        model.add(1, 'Sigmoid')
        model.compile('Cross entropy', 'Adam')
        return model

    
    def _build_vf(self):
        model = MyNN.MyNN(128)
        model.add(50, 'Tanh')
        model.add(20, 'Tanh')
        model.add(1, 'ReLU')
        model.compile('MSE', 'Adam')
        return model
    
    def value_function_fit(self, x, y):
        if self.replay_buff_x is None:
            x_train, y_train = x, y
        else:
            x_train = np.hstack([x, self.replay_buff_x])
            y_train = np.hstack([y, self.replay_buff_y])
        self.replay_buff_x = x
        self.replay_buff_y = y
        self.critic.optimize(x_train, y_train, num_epochs=10, report_cost=False, batch_size=128)

    def act(self, state):
        result = self.actor.forward(state)[0][0]
        return 1 if result > np.random.random() else 0

In [3]:
def running_reward(rewards, gamma):
    result = []
    run_rew = 0
    for reward in  rewards[0,:][::-1]:
        if reward != 0:
            run_rew = 0
        run_rew = run_rew*gamma + reward
        result.append(run_rew)
    return np.array(result[::-1]).reshape(1,-1)

In [11]:
def play_game(agent, render=False):
    state = env.reset().reshape((agent.state_size,1))
    unscaled_states = []
    states = []
    actions = []
    rewards = []
    done = False
    score = [0,0]
    mean, var = agent.scaler.get()
    while not done:
        if render:
            env.render()
        unscaled_states.append(state)
        scaled_state = (state-mean)/var
        states.append(scaled_state)
        action = agent.act(scaled_state)
        actions.append(action)
        state, reward, done, _ = env.step(action+2)
        state = state.reshape((agent.state_size,1))
        rewards.append(reward)
        if reward == 1:
            score[1] +=1
        elif reward == -1:
            score[0] +=1
    unscaled_states = np.hstack(unscaled_states)
    states = np.hstack(states)
    return {'unscaled': unscaled_states, 'states' : states,
            'actions': np.array(actions).reshape(1,-1),'rewards': np.array(rewards).reshape(1,-1),
           'scores' : score}

In [22]:
def play_n_games(agent, n=5):
    trajectories = []
    for i in range(n):
        trajectory = play_game(agent)
        trajectories.append(trajectory)
    return trajectories

In [13]:
def build_train_set(agent, trajectories):
    for traj in trajectories:
        traj['disc_sum_rew'] = running_reward(traj['rewards'], agent.gamma)
        traj['values'] = agent.actor.forward(traj['states'])
    X = np.hstack([t['states'] for t in trajectories])
    Y = np.hstack([t['actions'] for t in trajectories])
    disc_sum_rew = np.hstack([t['disc_sum_rew'] for t in trajectories])
    values = np.hstack([t['values'] for t in trajectories])
    adv = disc_sum_rew - values
    adv = (adv - adv.mean())/(adv.std() + 1e-6)
    return X, Y, adv, disc_sum_rew

In [23]:
env = gym.make('Pong-ram-v0')
state_size = env.observation_space.shape[0]
agent = ACAgent(state_size, 1, learning_rate=0.001)

In [24]:
%%time
for time in range(1, 10001):
    trajectories = play_n_games(agent, n=5)
    agent.scaler.update(np.hstack([t['unscaled'] for t in trajectories]))
    score = np.mean([t['scores'] for t in trajectories], axis=0)
    print(time, score)
    X_batch, Y_batch, weights, disc_sum_rew = build_train_set(agent, trajectories)
    agent.actor.optimize(X_batch, Y_batch, weights=weights,
                         lr=agent.learning_rate, num_epochs=1, report_cost=False)
    agent.value_function_fit(X_batch, disc_sum_rew)

1 [21.   0.4]
2 [21.   0.4]
3 [21.   0.8]
4 [21.   0.6]
5 [21.  0.]
6 [21.   0.8]
7 [21.   0.2]
8 [21.  0.]
9 [21.   0.8]
10 [21.  1.]
11 [21.   1.6]
12 [21.   0.2]
13 [21.  0.]
14 [21.  0.]
15 [21.   1.4]
16 [21.   0.8]
17 [21.   0.4]
18 [21.   0.2]
19 [21.   0.2]
20 [21.   0.6]
21 [21.   0.4]
22 [21.   0.6]
23 [21.   0.6]
24 [21.   0.6]
25 [21.   0.8]
26 [21.  0.]
27 [21.   0.6]
28 [21.   0.8]
29 [21.   1.2]
30 [21.   0.8]
31 [21.   0.4]
32 [21.   0.4]
33 [21.   0.4]
34 [21.   0.6]
35 [21.   0.6]
36 [21.  0.]
37 [21.   0.4]
38 [21.   0.6]
39 [21.   0.4]
40 [21.   0.6]
41 [21.   0.2]
42 [21.   0.6]
43 [21.  0.]
44 [21.   0.2]
45 [21.   0.6]
46 [21.   0.8]
47 [21.   0.2]
48 [21.   0.6]
49 [21.   0.2]
50 [21.   0.4]
51 [21.  1.]
52 [21.   0.6]
53 [21.   0.2]
54 [21.   0.4]
55 [21.   0.4]
56 [21.   0.4]
57 [21.   0.6]
58 [21.   0.4]
59 [21.   0.4]
60 [21.   0.4]
61 [21.   0.8]
62 [21.   0.6]
63 [21.   0.4]
64 [21.   0.8]
65 [21.   0.6]
66 [21.   0.8]
67 [21.   0.6]
68 [21.   0.4]
69 [21.

KeyboardInterrupt: 

In [25]:
trajectory = play_game(agent, render=True)

In [26]:
%%time
for time in range(2867, 10001):
    trajectories = play_n_games(agent, n=5)
    agent.scaler.update(np.hstack([t['unscaled'] for t in trajectories]))
    score = np.mean([t['scores'] for t in trajectories], axis=0)
    print(time, score)
    X_batch, Y_batch, weights, disc_sum_rew = build_train_set(agent, trajectories)
    agent.actor.optimize(X_batch, Y_batch, weights=weights,
                         lr=agent.learning_rate, num_epochs=1, report_cost=False)
    agent.value_function_fit(X_batch, disc_sum_rew)

2867 [21.  7.]
2868 [21.   6.2]
2869 [21.   6.6]
2870 [21.   7.6]
2871 [21.   9.2]
2872 [21.   6.8]
2873 [21.   6.2]
2874 [21.   6.2]
2875 [21.   5.4]
2876 [21.   6.4]
2877 [21.   8.2]
2878 [21.   5.4]
2879 [21.   6.4]
2880 [21.   5.2]
2881 [21.  5.]
2882 [21.   7.2]
2883 [21.   7.2]
2884 [21.   6.6]
2885 [21.   6.4]
2886 [21.  6.]
2887 [21.   6.4]
2888 [21.   5.2]
2889 [21.   6.8]
2890 [21.   7.2]
2891 [21.  7.]
2892 [21.  6.]
2893 [21.   5.4]
2894 [21.   7.4]
2895 [21.   6.6]
2896 [21.   5.4]
2897 [21.  6.]
2898 [21.   4.8]
2899 [21.   8.8]
2900 [21.   5.4]
2901 [21.  7.]
2902 [21.  6.]
2903 [21.   6.6]
2904 [21.   6.6]
2905 [21.   6.6]
2906 [21.   7.4]
2907 [21.   5.6]
2908 [21.   7.6]
2909 [21.   7.4]
2910 [21.   6.6]
2911 [21.   7.2]
2912 [21.   6.6]
2913 [21.   8.4]
2914 [21.   8.4]
2915 [21.   7.8]
2916 [21.   7.6]
2917 [21.  8.]
2918 [21.  7.]
2919 [21.   7.4]
2920 [21.  9.]
2921 [21.   6.4]
2922 [21.   7.8]
2923 [21.   6.4]
2924 [21.   7.4]
2925 [21.   5.8]
2926 [21.  7.]
2927

KeyboardInterrupt: 

In [27]:
trajectory = play_game(agent, render=True)

In [30]:
X_batch.shape

(128, 26833)