In [29]:
from lib import wrappers_ben as wb
from lib import dqn_model_ben as dmb

import time
import numpy as np
import collections
import gym
from collections import namedtuple

# Hyperparameters
# Environment id handed to wb.make_env.
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
# Discount factor multiplying the next-state value when the target is built.
GAMMA = 0.99
# Not referenced in the visible cells.
BATCH_SIZE = 32
# Not referenced in the visible cells.
LEARNING_RATE = 1e-4
# Exploration schedule used by the play loop:
#   epsilon = max(EPSILON_FINAL, EPSILON_START - step / EPSILON_DECAY_LAST_FRAME)
# With EPSILON_START equal to EPSILON_FINAL the schedule is constant at 0.02.
# The commented-out 1.0 is the alternative start value that was left in place.
# EPSILON_START = 1.0
EPSILON_START = 0.02
EPSILON_DECAY_LAST_FRAME = 10**5
EPSILON_FINAL = 0.02
# Upper bound of the play loop; each iteration is one environment step.
EPISODES = 1000000
# The play loop breaks once its counter exceeds this value.
START_STEPS = 3000
# Not referenced in the visible cells.
COPY_STEP = 1000
# Not referenced in the visible cells.
REPORT_EVERY_STEP = 50000
# Used as the maxlen of the Agent's experience deque.
REPLAY_START_SIZE = 3000

# One stored transition, in the order the Agent records it.
Experience = namedtuple("Experience", ("obs", "action", "reward", "done", "next_obs"))
class Agent:
    """Plays an environment with an epsilon-greedy policy and records transitions.

    Every step taken through `play_step` is stored as an `Experience` in a
    bounded deque (`exp_buffer`); once the deque is full the oldest entries
    are discarded automatically.
    """

    def __init__(self, env):
        """Store `env`, create the empty experience buffer and reset the episode."""
        self.env = env
        # Capacity of the buffer is REPLAY_START_SIZE transitions.
        self.exp_buffer = collections.deque(maxlen=REPLAY_START_SIZE)
        self._reset()

    def _reset(self):
        """Start a new episode: reset the environment and the reward accumulator."""
        self.current_obs = self.env.reset()
        self.total_reward = 0.0

    def get_Qas(self, model, obs):
        """Return `model.predict` for a batch containing only `obs`.

        NOTE(review): the second positional argument (1) is presumably the
        batch size expected by the model's predict() - confirm against the model class.
        """
        return model.predict(np.array([obs]), 1)

    def select_epision_greedy_action(self, model, obs, epsilon):
        """Pick an action epsilon-greedily.

        With probability `epsilon` a random action is sampled from the
        environment's action space; otherwise the index of the largest value
        returned by `get_Qas` is used.
        """
        if np.random.random() < epsilon:
            # Exploring: the model output is not needed, so skip the forward pass.
            return self.env.action_space.sample()
        return np.argmax(np.squeeze(self.get_Qas(model, obs)))

    def play_step(self, model, epsilon):
        """Take one environment step and append the transition to the buffer.

        When the episode ends, the accumulated reward is printed and the
        environment is reset. Returns None.
        """
        action = self.select_epision_greedy_action(model, self.current_obs, epsilon)
        new_obs, reward, done, _ = self.env.step(action)
        self.total_reward += reward
        self.exp_buffer.append(Experience(self.current_obs, action, reward, done, new_obs))
        self.current_obs = new_obs
        if done:
            print("self.total_reward:", self.total_reward)
            self._reset()

    def test(self, DEFAULT_ENV_NAME, model, i_epoch):
        """Play one greedy episode (epsilon = 0) in a fresh, monitored environment.

        The environment is built with `wb.make_env(DEFAULT_ENV_NAME)` and
        wrapped in `gym.wrappers.Monitor`, writing to the directory
        "recording<i_epoch>" (existing contents are overwritten, force=True).

        Returns:
            The sum of rewards collected until the episode reports done.
        """
        env = gym.wrappers.Monitor(
            wb.make_env(DEFAULT_ENV_NAME), "recording" + str(i_epoch), force=True
        )
        total_reward = 0
        try:
            current_obs = env.reset()
            while True:
                action = self.select_epision_greedy_action(model, current_obs, 0)
                current_obs, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    break
        finally:
            # Close through the Monitor wrapper (not env.env) so the wrapper can
            # finalize its own output, and do it even if the episode raised.
            env.close()
        return total_reward

    def get_exp_buffer(self):
        """Return the agent's experience deque itself (not a copy)."""
        return self.exp_buffer
        
        
def sample_memories2(buffer, batch_size):
    """Draw `batch_size` distinct transitions from `buffer` and return them column-wise.

    Args:
        buffer: indexable sequence of 5-tuples
            (state, action, reward, done, next_state).
        batch_size: number of entries to draw without replacement.

    Returns:
        Tuple of five arrays: states, actions, rewards (float32),
        dones (uint8) and next_states, all in the same sampled order.
    """
    picked = np.random.choice(len(buffer), batch_size, replace=False)
    batch = [buffer[position] for position in picked]
    # Transpose the list of transitions into one sequence per field.
    state_col, action_col, reward_col, done_col, next_state_col = zip(*batch)
    return (
        np.array(state_col),
        np.array(action_col),
        np.array(reward_col, dtype=np.float32),
        np.array(done_col, dtype=np.uint8),
        np.array(next_state_col),
    )
    
def one_hot(target, shape):
    """Return a zero array of `shape` with a 1 at (row, target[row]) for each row.

    Args:
        target: one column index per row (length shape[0]).
        shape: shape of the array to create; shape[0] is the number of rows.
    """
    encoded = np.zeros(shape)
    row_ids = np.arange(shape[0])
    encoded[row_ids, target] = 1
    return encoded

In [27]:

# Warm-up (no weight updates happen in this cell): load saved weights and let
# the agent play so the replay buffer gets populated.
env = wb.make_env(DEFAULT_ENV_NAME)
myModel = dmb.MyModel(env.observation_space.shape, env.action_space.n)
myModel.load_weights("Weights_store/PongModelWeights999999-new1.h5")
agent = Agent(env)
# Model object exported from myModel; a later cell calls its predict() on the
# next observations of the sampled batch.
his_myModel = myModel.export_model()
# get_exp_buffer() returns the agent's own deque, so one reference taken up
# front sees every transition appended by the loop below.
exp_buffer = agent.get_exp_buffer()
for i in range(EPISODES):
    epsilon = max(EPSILON_FINAL, EPSILON_START - i / EPSILON_DECAY_LAST_FRAME)
    agent.play_step(myModel, epsilon)
    # Stop once the step counter has passed START_STEPS.
    if i > START_STEPS:
        break

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 84, 84, 4)]       0         
_________________________________________________________________
layer1 (Conv2D)              (None, 21, 21, 32)        8224      
_________________________________________________________________
activation_4 (Activation)    (None, 21, 21, 32)        0         
_________________________________________________________________
layer2 (Conv2D)              (None, 11, 11, 64)        32832     
_________________________________________________________________
activation_5 (Activation)    (None, 11, 11, 64)        0         
_________________________________________________________________
layer3 (Conv2D)              (None, 11, 11, 64)        36928     
_________________________________________________________________
activation_6 (Activation)    (None, 11, 11, 64)        0   

In [36]:
# Draw 100 random transitions from the buffer and name each returned column.
memories = sample_memories2(exp_buffer, 100)
current_obs_v, action_v, reward_v, done_v, next_obs_v = memories

[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [49]:
# Show the done flags, rewards and actions of the sampled batch, one array after another.
print(done_v, reward_v, action_v, sep="\n")

[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[ 0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[4 4 1 5 1 1 5 1 5 1 3 5 3 3 3 4 1 5 1 5 2 4 4 3 1 2 5 1 1 5 4 3 4 5 4 3 4
 3 4 5 4 3 1 4 4 4 3 4 3 1 5 5 3 4 5 3 5 4 3 4 5 5 5 3 3 5 4 5 5 5 4 3 4 4
 4 1 4 1 1 1 1 3 3 3 5 0 4 5 4 5 4 4 5 1 4 1 4 3 3 5]


In [40]:
# Outputs of myModel for the first 12 sampled observations (one row per
# observation, one value per action).
# NOTE(review): the second positional argument (32) is presumably a batch
# size - confirm against dmb.MyModel.predict.
Qas = myModel.predict(current_obs_v[:12], 32)
print(Qas)

[[6.4777226 6.5217366 6.4755254 6.5926943 6.439637  6.680376 ]
 [6.274074  6.468874  6.4192467 6.4074264 6.489381  6.592574 ]
 [6.4777226 6.5217366 6.4755254 6.5926943 6.439637  6.680376 ]
 [5.897474  5.9980726 6.0468388 6.0660343 5.9586926 6.066531 ]
 [6.4777226 6.5217366 6.4755254 6.5926943 6.439637  6.680376 ]
 [6.708174  7.0253286 6.8722754 7.005249  7.2433567 7.231888 ]
 [6.274074  6.468874  6.4192467 6.4074264 6.489381  6.592574 ]
 [5.897474  5.9980726 6.0468388 6.0660343 5.9586926 6.066531 ]
 [6.4777226 6.5217366 6.4755254 6.5926943 6.439637  6.680376 ]
 [5.897474  5.9980726 6.0468388 6.0660343 5.9586926 6.066531 ]
 [5.897474  5.9980726 6.0468388 6.0660343 5.9586926 6.066531 ]
 [6.4777226 6.5217366 6.4755254 6.5926943 6.439637  6.680376 ]]


In [41]:
# Evaluate the exported model (his_myModel) on the next observations of the
# same 12 transitions and show that result.
Qas_next = his_myModel.predict(next_obs_v[:12])
print(Qas_next)

[[6.4777226 6.5217366 6.4755254 6.5926943 6.439637  6.680376 ]
 [6.274074  6.468874  6.4192467 6.4074264 6.489381  6.592574 ]
 [6.4777226 6.5217366 6.4755254 6.5926943 6.439637  6.680376 ]
 [5.897474  5.9980726 6.0468388 6.0660343 5.9586926 6.066531 ]
 [6.4777226 6.5217366 6.4755254 6.5926943 6.439637  6.680376 ]
 [6.708174  7.0253286 6.8722754 7.005249  7.2433567 7.231888 ]
 [6.274074  6.468874  6.4192467 6.4074264 6.489381  6.592574 ]
 [5.897474  5.9980726 6.0468388 6.0660343 5.9586926 6.066531 ]
 [6.4777226 6.5217366 6.4755254 6.5926943 6.439637  6.680376 ]
 [5.897474  5.9980726 6.0468388 6.0660343 5.9586926 6.066531 ]
 [5.897474  5.9980726 6.0468388 6.0660343 5.9586926 6.066531 ]
 [6.4777226 6.5217366 6.4755254 6.5926943 6.439637  6.680376 ]]


In [42]:
# Largest entry of each row of Qas_next (reduction over the last axis).
# Despite the name, these are model outputs for the next observations, not rewards.
max_reward_v = np.max(Qas_next, axis=-1)
print(max_reward_v)

[6.680376  6.592574  6.680376  6.066531  6.680376  7.2433567 6.592574
 6.066531  6.680376  6.066531  6.066531  6.680376 ]


In [48]:
# Continuation mask for the first 12 transitions: 1 where done is 0, 0 where done is 1.
1-done_v[:12]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], dtype=uint8)

In [50]:
# Target per transition: reward + GAMMA * max next value, with the bootstrap
# term zeroed out for transitions whose done flag is set.
continue_mask = 1 - done_v[:12]
discounted_next_v = GAMMA * max_reward_v
expect = reward_v[:12] + discounted_next_v * continue_mask
print(expect)

[ 6.613572   6.5266485  6.613572   6.005866   6.613572   7.170923
  5.5266485  6.005866   6.613572   6.005866   6.005866  -1.       ]


In [53]:
# Overwrite, in place, the entry of each Qas row that belongs to the action
# actually taken with the target value from `expect`; other entries stay as predicted.
indicis = one_hot(action_v[:12], Qas.shape)
taken_rows, taken_cols = indicis.nonzero()
Qas[taken_rows, taken_cols] = expect
print(Qas)

[[ 6.4777226  6.5217366  6.4755254  6.5926943  6.613572   6.680376 ]
 [ 6.274074   6.468874   6.4192467  6.4074264  6.5266485  6.592574 ]
 [ 6.4777226  6.613572   6.4755254  6.5926943  6.439637   6.680376 ]
 [ 5.897474   5.9980726  6.0468388  6.0660343  5.9586926  6.005866 ]
 [ 6.4777226  6.613572   6.4755254  6.5926943  6.439637   6.680376 ]
 [ 6.708174   7.170923   6.8722754  7.005249   7.2433567  7.231888 ]
 [ 6.274074   6.468874   6.4192467  6.4074264  6.489381   5.5266485]
 [ 5.897474   6.005866   6.0468388  6.0660343  5.9586926  6.066531 ]
 [ 6.4777226  6.5217366  6.4755254  6.5926943  6.439637   6.613572 ]
 [ 5.897474   6.005866   6.0468388  6.0660343  5.9586926  6.066531 ]
 [ 5.897474   5.9980726  6.0468388  6.005866   5.9586926  6.066531 ]
 [ 6.4777226  6.5217366  6.4755254  6.5926943  6.439637  -1.       ]]
