In [1]:
from keras.layers import TimeDistributed, Dense, Flatten ,Dropout
from keras.layers import Conv2D,LSTM
from keras import optimizers
from keras import initializers
from keras.models import Sequential
from keras import backend as K
import tensorflow as tf
import numpy as np
import random
import cv2
import gym
import gym_ple
from collections import deque


Using TensorFlow backend.


couldn't import doomish
Couldn't import doom


In [2]:
# Height and width (in pixels) of the single-channel frames fed to the network.
img_rows, img_cols = 47, 47


class DQNAgent:
    """Recurrent DQN agent with an online network and a separate target network.

    Each state is a stack of ``SEQUENCE_LENGTH`` binarised frames of shape
    ``(img_rows, img_cols, 1)``.  Every frame goes through a small
    time-distributed convolutional stack, an LSTM summarises the sequence,
    and a dense head outputs one Q-value per action.
    """

    # Number of consecutive frames that make up one state (time axis of the input).
    SEQUENCE_LENGTH = 5

    def __init__(self):
        """Set the hyper-parameters, create the replay memory and build both networks."""
        self.action_size = 2

        # Hyper-parameters for the DQN.
        self.discount_factor = 0.99
        self.learning_rate = 0.00025
        # `epsilon` is the softmax temperature used by get_action; it is
        # annealed from `epsilon_max` towards `epsilon_min` by the caller.
        self.epsilon = 1
        self.epsilon_min = 0.1
        self.epsilon_max = 1
        self.batch_size = 32
        # Minimum number of stored transitions before the caller starts training.
        self.train_start = 10000
        self.memory = deque(maxlen=300000)
        # Online (trained) network and target network share one architecture.
        self.model = self.build_model()
        self.target_model = self.build_model()

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def build_model(self):
        """Build and compile the TimeDistributed-CNN + LSTM Q-network.

        Returns:
            A compiled Keras ``Sequential`` model mapping an input of shape
            ``(batch, SEQUENCE_LENGTH, img_rows, img_cols, 1)`` to
            ``(batch, action_size)`` Q-values, trained with MSE loss and Adam.
        """
        input_shape = (self.SEQUENCE_LENGTH, img_rows, img_cols, 1)

        model = Sequential()
        # Three identical strided conv layers applied to every frame; only the
        # first one needs to declare the input shape.
        for layer_index in range(3):
            conv = Conv2D(32, kernel_size=(3, 3), strides=(2, 2), activation='relu',
                          kernel_initializer=initializers.random_normal(stddev=0.01))
            if layer_index == 0:
                model.add(TimeDistributed(conv, input_shape=input_shape))
            else:
                model.add(TimeDistributed(conv))
        model.add(TimeDistributed(Flatten()))
        model.add(LSTM(256, activation='relu',
                       kernel_initializer=initializers.random_normal(stddev=0.01)))
        model.add(Dense(self.action_size,
                        kernel_initializer=initializers.random_normal(stddev=0.01)))
        model.summary()
        model.compile(loss='mse', optimizer=optimizers.Adam(lr=self.learning_rate))
        return model

    def get_action(self, state):
        """Sample an action from a Boltzmann (softmax) policy over the Q-values.

        ``self.epsilon`` acts as the temperature: high values give near-uniform
        sampling, low values approach the greedy action.

        Args:
            state: array-like holding SEQUENCE_LENGTH frames; it is reshaped to
                ``(1, SEQUENCE_LENGTH, img_rows, img_cols, 1)``.

        Returns:
            The index of the sampled action.
        """
        state = np.asarray(state).reshape(1, self.SEQUENCE_LENGTH, img_rows, img_cols, 1)
        q_value = np.asarray(self.model.predict(state)[0], dtype=np.float64)

        # A non-positive temperature has no softmax; fall back to the greedy action.
        if self.epsilon <= 0:
            return np.argmax(q_value)

        # Subtract the maximum before exponentiating: the softmax is unchanged
        # mathematically, but exp() can no longer overflow to inf and turn the
        # probabilities into NaN (which np.random.choice rejects).
        logits = q_value / self.epsilon
        logits -= np.max(logits)
        weights = np.exp(logits)
        prob = weights / np.sum(weights)
        return np.random.choice(self.action_size, 1, p=prob)[0]

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition <s, a, r, s', done> in the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def train_model(self, mini_batch):
        """Run one gradient step on a batch of transitions.

        The bootstrap target follows the Double-DQN scheme: the online network
        picks the best next action and the target network supplies its value.

        Args:
            mini_batch: sequence of ``(state, action, reward, next_state, done)``
                tuples as stored by ``append_sample``.
        """
        batch_size = len(mini_batch)
        if batch_size == 0:
            # Nothing to learn from.
            return

        batch_shape = (batch_size, self.SEQUENCE_LENGTH, img_rows, img_cols, 1)
        states = np.zeros(batch_shape)
        next_states = np.zeros(batch_shape)
        action, reward, done = [], [], []

        for i, (state, act, rew, next_state, terminal) in enumerate(mini_batch):
            states[i] = state
            action.append(act)
            reward.append(rew)
            next_states[i] = next_state
            done.append(terminal)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        target = self.model.predict(states)
        target_next = self.model.predict(next_states)
        target_val = self.target_model.predict(next_states)

        for i in range(batch_size):
            if done[i]:
                # Terminal transition: no bootstrapped future value.
                target[i][action[i]] = reward[i]
            else:
                best_next = np.argmax(target_next[i])
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    target_val[i][best_next])

        self.model.fit(states, target, batch_size=self.batch_size, epochs=1, verbose=0)


In [3]:
def get_initial_state(img):
    """Build the initial frame queue for an episode from its first observation.

    The image is resized to 47x47, converted to grayscale, binarised with an
    Otsu threshold and reshaped to ``(47, 47, 1)``.

    Args:
        img: the first frame of the episode, as accepted by ``cv2.resize``.

    Returns:
        A ``deque`` with ``maxlen=5`` that holds the same processed frame five
        times, so appending a new frame drops the oldest one.
    """
    resized = cv2.resize(img, (47, 47))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    frame = binary.reshape(47, 47, 1)
    # Fill the whole window with the first frame (five references to one array).
    return deque([frame] * 5, maxlen=5)

In [None]:
# Seed both random generators this script draws from (np.random.choice in
# DQNAgent.get_action, random.sample below) BEFORE anything is constructed, so
# that whatever consumes them during model/env creation is covered as well.
SEED = 123
np.random.seed(SEED)
random.seed(SEED)

agent = DQNAgent()
scores, episodes = [], []
action_count = 0
ACTIONS = 2
EXPLORE = 200000    # number of training steps over which epsilon is annealed
env = gym.make('FlappyBird-v0')
EPISODES = 50000
num_frame = 0       # total environment steps taken across all episodes

# Per-training-step decrement that moves epsilon from epsilon_max to epsilon_min.
epsilon_step = (agent.epsilon_max - agent.epsilon_min) / EXPLORE

for e in range(EPISODES):
    done = False
    score = 0
    img = env.reset()
    image_queue = get_initial_state(img)
    state = np.array(image_queue)

    while not done:
        action = agent.get_action(state)
        next_img, reward, done, _ = env.step(action)

        # Process the next frame the same way as get_initial_state does:
        # resize, grayscale, Otsu binarisation.
        next_img = cv2.cvtColor(cv2.resize(next_img, (47, 47)), cv2.COLOR_BGR2GRAY)
        ret, next_img = cv2.threshold(next_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        next_img = next_img.reshape(img_rows, img_cols, 1)  # 47x47x1
        # The deque has a fixed length, so appending drops the oldest frame.
        image_queue.append(next_img)
        next_state = np.array(image_queue)

        agent.append_sample(state, action, reward, next_state, done)

        # Start learning only once enough transitions have been collected.
        if len(agent.memory) > agent.train_start:
            # Anneal epsilon, clamped so float error cannot push it below epsilon_min.
            if agent.epsilon > agent.epsilon_min:
                agent.epsilon = max(agent.epsilon_min, agent.epsilon - epsilon_step)
            mini_batch = random.sample(agent.memory, agent.batch_size)
            agent.train_model(mini_batch)

        score += reward
        state = next_state
        num_frame += 1

        # Sync the target network with the online network every 1000 frames.
        if num_frame % 1000 == 0:
            agent.update_target_model()

        if done:
            scores.append(score)
            episodes.append(e)
            print("episode:", e, " score:", score, " epsilon:", agent.epsilon," num_frame:", num_frame)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_1 (TimeDist (None, 5, 23, 23, 32)     320       
_________________________________________________________________
time_distributed_2 (TimeDist (None, 5, 11, 11, 32)     9248      
_________________________________________________________________
time_distributed_3 (TimeDist (None, 5, 5, 5, 32)       9248      
_________________________________________________________________
time_distributed_4 (TimeDist (None, 5, 800)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               1082368   
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 514       
Total params: 1,101,698
Trainable params: 1,101,698
Non-trainable params: 0
_________________________________________________________________


[2017-11-09 13:24:17,819] Making new env: FlappyBird-v0


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_5 (TimeDist (None, 5, 23, 23, 32)     320       
_________________________________________________________________
time_distributed_6 (TimeDist (None, 5, 11, 11, 32)     9248      
_________________________________________________________________
time_distributed_7 (TimeDist (None, 5, 5, 5, 32)       9248      
_________________________________________________________________
time_distributed_8 (TimeDist (None, 5, 800)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               1082368   
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 514       
Total params: 1,101,698
Trainable params: 1,101,698
Non-trainable params: 0
_________________________________________________________________


episode: 131  score: -5.0  epsilon: 1  num_frame: 6881
episode: 132  score: -5.0  epsilon: 1  num_frame: 6943
episode: 133  score: -5.0  epsilon: 1  num_frame: 6995
episode: 134  score: -5.0  epsilon: 1  num_frame: 7035
episode: 135  score: -5.0  epsilon: 1  num_frame: 7082
episode: 136  score: -5.0  epsilon: 1  num_frame: 7132
episode: 137  score: -5.0  epsilon: 1  num_frame: 7194
episode: 138  score: -5.0  epsilon: 1  num_frame: 7256
episode: 139  score: -5.0  epsilon: 1  num_frame: 7303
episode: 140  score: -5.0  epsilon: 1  num_frame: 7354
episode: 141  score: -5.0  epsilon: 1  num_frame: 7418
episode: 142  score: -5.0  epsilon: 1  num_frame: 7470
episode: 143  score: -5.0  epsilon: 1  num_frame: 7528
episode: 144  score: -5.0  epsilon: 1  num_frame: 7590
episode: 145  score: -5.0  epsilon: 1  num_frame: 7652
episode: 146  score: -5.0  epsilon: 1  num_frame: 7707
episode: 147  score: -5.0  epsilon: 1  num_frame: 7747
episode: 148  score: -5.0  epsilon: 1  num_frame: 7797
episode: 1

episode: 259  score: -5.0  epsilon: 0.9838585000001332  num_frame: 13587
episode: 260  score: -5.0  epsilon: 0.9836290000001351  num_frame: 13638
episode: 261  score: -5.0  epsilon: 0.9833500000001374  num_frame: 13700
episode: 262  score: -5.0  epsilon: 0.9830845000001396  num_frame: 13759
episode: 263  score: -5.0  epsilon: 0.9828640000001414  num_frame: 13808
episode: 264  score: -5.0  epsilon: 0.9825850000001437  num_frame: 13870
episode: 265  score: -5.0  epsilon: 0.9823330000001458  num_frame: 13926
episode: 266  score: -5.0  epsilon: 0.9820945000001478  num_frame: 13979
episode: 267  score: -5.0  epsilon: 0.9818875000001495  num_frame: 14025
episode: 268  score: -5.0  epsilon: 0.9816490000001514  num_frame: 14078
episode: 269  score: -5.0  epsilon: 0.9813700000001537  num_frame: 14140
episode: 270  score: -5.0  epsilon: 0.981091000000156  num_frame: 14202
episode: 271  score: -5.0  epsilon: 0.9808120000001583  num_frame: 14264
episode: 272  score: -5.0  epsilon: 0.98053300000016

episode: 372  score: -5.0  epsilon: 0.9554320000003678  num_frame: 19904
episode: 373  score: -5.0  epsilon: 0.9551980000003697  num_frame: 19956
episode: 374  score: -5.0  epsilon: 0.9549550000003717  num_frame: 20010
episode: 375  score: -5.0  epsilon: 0.9547030000003738  num_frame: 20066
episode: 376  score: -5.0  epsilon: 0.9544645000003757  num_frame: 20119
episode: 377  score: -5.0  epsilon: 0.954185500000378  num_frame: 20181
episode: 378  score: -5.0  epsilon: 0.9539245000003802  num_frame: 20239
episode: 379  score: -5.0  epsilon: 0.9537265000003818  num_frame: 20283
episode: 380  score: -5.0  epsilon: 0.9534520000003841  num_frame: 20344
episode: 381  score: -5.0  epsilon: 0.9532045000003861  num_frame: 20399
episode: 382  score: -5.0  epsilon: 0.9529615000003882  num_frame: 20453
episode: 383  score: -5.0  epsilon: 0.9527050000003903  num_frame: 20510
episode: 384  score: -5.0  epsilon: 0.9524260000003926  num_frame: 20572
episode: 385  score: -5.0  epsilon: 0.95219200000039

episode: 485  score: -5.0  epsilon: 0.9273295000005997  num_frame: 26149
episode: 486  score: -5.0  epsilon: 0.9271135000006014  num_frame: 26197
episode: 487  score: -5.0  epsilon: 0.9268525000006036  num_frame: 26255
episode: 488  score: -5.0  epsilon: 0.9266680000006051  num_frame: 26296
episode: 489  score: -5.0  epsilon: 0.9263890000006074  num_frame: 26358
episode: 490  score: -5.0  epsilon: 0.9261100000006097  num_frame: 26420
episode: 491  score: -5.0  epsilon: 0.9258850000006116  num_frame: 26470
episode: 492  score: -5.0  epsilon: 0.9256060000006139  num_frame: 26532
episode: 493  score: -5.0  epsilon: 0.9253720000006158  num_frame: 26584
episode: 494  score: -5.0  epsilon: 0.9251380000006177  num_frame: 26636
episode: 495  score: -4.0  epsilon: 0.924746500000621  num_frame: 26723
episode: 496  score: -5.0  epsilon: 0.9245620000006225  num_frame: 26764
episode: 497  score: -5.0  epsilon: 0.9243100000006246  num_frame: 26820
episode: 498  score: -5.0  epsilon: 0.92405800000062