In [None]:
from keras.layers import Input, Dense, Flatten, Lambda, merge
from keras.layers import Conv2D
from keras import optimizers
from keras import initializers
from keras.models import Model
from keras import backend as K
import tensorflow as tf
import numpy as np
import random
import cv2
import gym
import gym_ple
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [None]:
class SumTree(object):
    """Binary sum tree kept in a flat array, with one data slot per leaf.

    ``tree`` has ``2 * capacity - 1`` entries: indices ``0 .. capacity - 2``
    are internal nodes and indices ``capacity - 1 .. 2 * capacity - 2`` are
    leaves holding priorities.  Every internal node stores the sum of its two
    children, so the root is the total of all priorities.  ``data`` holds the
    object attached to each leaf and is written as a ring buffer.
    """
    # Next data slot to write; becomes a per-instance value on first add().
    data_pointer = 0

    def __init__(self, capacity):
        """Allocate an all-zero tree and an object array of ``capacity`` slots."""
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)

    def add(self, p, data):
        """Store ``data`` with priority ``p`` in the next slot, wrapping at capacity."""
        slot = self.data_pointer
        self.data[slot] = data
        # Leaves start right after the capacity - 1 internal nodes.
        self.update(slot + self.capacity - 1, p)

        following = slot + 1
        # Overwrite the oldest entry once every slot has been used.
        self.data_pointer = 0 if following >= self.capacity else following

    def update(self, tree_idx, p):
        """Set the priority at ``tree_idx`` to ``p`` and fix up all ancestor sums."""
        delta = p - self.tree[tree_idx]
        self.tree[tree_idx] = p
        node = tree_idx
        # Walk up to the root, adding the difference to each ancestor.
        while node:
            node = (node - 1) // 2
            self.tree[node] += delta

    def get_leaf(self, v):
        """Descend from the root to the leaf whose cumulative range contains ``v``.

        Returns a tuple ``(leaf index in tree, leaf priority, stored data)``.
        """
        node_count = len(self.tree)
        node = 0
        left = 1
        # A node whose left child index falls outside the array is a leaf.
        while left < node_count:
            left_sum = self.tree[left]
            if v <= left_sum:
                node = left
            else:
                # Skip the left subtree's mass and continue on the right.
                v -= left_sum
                node = left + 1
            left = 2 * node + 1

        return node, self.tree[node], self.data[node - self.capacity + 1]

    @property
    def total_p(self):
        """Sum of all priorities (the value stored at the root)."""
        return self.tree[0]

In [None]:
class Memory(object):
    """Prioritized replay buffer that stores transitions in a SumTree.

    Each stored transition is attached to a leaf priority.  ``sample`` draws
    one transition from each of ``n`` equal slices of the total priority mass
    and ``batch_update`` rewrites priorities from new absolute errors.
    """
    epsilon = 0.01  # small amount added to every error to avoid zero priority
    alpha = 0.6  # [0~1] exponent turning a clipped error into a priority
    beta = 0.4  # importance-sampling exponent, annealed towards 1 by sample()
    beta_increment_per_sampling = 0.001
    abs_err_upper = 1.  # errors are clipped to this value before exponentiation

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.tree = SumTree(capacity)
        self.size = 0  # number of stored transitions, saturating at capacity

    def store(self, transition):
        """Add ``transition`` with the current maximum leaf priority.

        When every leaf priority is zero, ``abs_err_upper`` is used instead.
        """
        max_p = np.max(self.tree.tree[-self.tree.capacity:])
        if max_p == 0:
            max_p = self.abs_err_upper
        self.tree.add(max_p, transition)   # set the max p for new p
        if self.size < self.tree.capacity:
            self.size += 1

    def sample(self, n):
        """Draw ``n`` transitions, one from each equal slice of the priority mass.

        Returns:
            ``(b_idx, b_memory, ISWeights)``: int32 tree indices of the chosen
            leaves, the list of stored transitions, and one importance weight
            per sample.

        Raises:
            ValueError: if the total priority is not positive (nothing stored),
                since the probabilities would otherwise be 0/0.
        """
        total_p = self.tree.total_p
        if total_p <= 0:
            raise ValueError("cannot sample from an empty Memory")

        b_idx = np.empty((n,), dtype=np.int32)
        b_memory = [None] * n
        ISWeights = np.empty(n)
        pri_seg = total_p / n       # width of each priority segment
        self.beta = min(1., self.beta + self.beta_increment_per_sampling)  # max = 1

        # NOTE(review): weights are normalized by the largest leaf probability,
        # so every weight is >= 1.  The prioritized-replay paper normalizes by
        # the largest weight instead (weights <= 1) -- confirm this is intended.
        max_prob = np.max(self.tree.tree[-self.tree.capacity:]) / total_p
        for i in range(n):
            a, b = pri_seg * i, pri_seg * (i + 1)
            v = np.random.uniform(a, b)
            idx, p, data = self.tree.get_leaf(v)
            prob = p / total_p
            ISWeights[i] = np.power(prob / max_prob, -self.beta)
            b_idx[i], b_memory[i] = idx, data
        return b_idx, b_memory, ISWeights

    def batch_update(self, tree_idx, abs_errors):
        """Set the priority of each leaf in ``tree_idx`` from ``abs_errors``.

        Each priority is ``min(|error| + epsilon, abs_err_upper) ** alpha``.
        The caller's ``abs_errors`` is left unmodified.
        """
        # Build a new array rather than using ``+=`` so the argument is not
        # mutated, and take the absolute value so a negative error cannot
        # produce a NaN priority.
        shifted = np.abs(np.asarray(abs_errors)) + self.epsilon
        clipped_errors = np.minimum(shifted, self.abs_err_upper)
        ps = np.power(clipped_errors, self.alpha)
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)

In [None]:
# Shape of one network input: img_rows x img_cols pixels, img_channels deep.
img_rows = 80
img_cols = 80
img_channels = 4  # four frames are stacked to form one state
from collections import deque

class DQNAgent:
    """Dueling Double-DQN agent that learns from a prioritized replay Memory.

    An online network selects actions and is trained; a second network with
    the same architecture evaluates the bootstrapped targets and is synced
    from the online one through ``update_target_model``.
    """

    def __init__(self):
        """Set the hyper-parameters, the replay memory and both networks."""
        self.action_size = 2

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.00025
        self.epsilon = 0.15
        self.epsilon_min = 0.001
        self.batch_size = 32
        self.train_start = 20000
        self.memory = Memory(capacity=500000)
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

    def build_model(self):
        """Build and compile the convolutional Q-network.

        The last dense layer emits ``action_size + 1`` units: unit 0 is used
        as the state value and the remaining units as per-action advantages.
        They are combined as ``V + A - max(A)`` into ``action_size`` Q-values.
        """
        def small_normal():
            return initializers.random_normal(stddev=0.01)

        def dueling_q_values(a):
            value = K.expand_dims(a[:, 0], -1)
            advantages = a[:, 1:]
            # Reduce over the action axis only.  Without ``axis`` the max is
            # taken over the whole batch, which makes one sample's Q-values
            # depend on the other samples in the same batch.
            return value + advantages - K.max(advantages, axis=1, keepdims=True)

        x = Input(shape=(img_rows, img_cols, img_channels))
        shared = Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                        kernel_initializer=small_normal())(x)
        shared = Conv2D(64, (4, 4), strides=(2, 2), activation='relu',
                        kernel_initializer=small_normal())(shared)
        shared = Conv2D(64, (3, 3), strides=(1, 1), activation='relu',
                        kernel_initializer=small_normal())(shared)
        flatten = Flatten()(shared)
        h = Dense(512, activation='relu', kernel_initializer=small_normal())(flatten)
        y = Dense(self.action_size + 1)(h)
        z = Lambda(dueling_q_values, output_shape=(self.action_size,))(y)

        # Keras 2 keyword names, matching the Keras 2 layer API used above.
        model = Model(inputs=x, outputs=z)
        model.summary()
        model.compile(loss='mse', optimizer=optimizers.Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        """Return a random action with probability ``epsilon``, else the greedy one.

        ``state`` is a single 3-D observation; a batch axis is added before
        it is passed to the online network.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = state.reshape(1, state.shape[0], state.shape[1], state.shape[2])
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])

    # save sample <s,a,r,s'> to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        """Store one ``(state, action, reward, next_state, done)`` transition."""
        self.memory.store((state, action, reward, next_state, done))

    # train on a minibatch drawn from the prioritized replay memory
    def train_model(self, tree_idx, minibatch, ISWeights):
        """Run one training step on ``minibatch`` and refresh its priorities.

        Args:
            tree_idx: tree indices of the sampled transitions, as returned by
                ``Memory.sample``.
            minibatch: sequence of ``(state, action, reward, next_state, done)``.
            ISWeights: per-sample weights passed to ``fit`` as ``sample_weight``.
        """
        batch_size = len(minibatch)
        states = np.zeros((batch_size, img_rows, img_cols, img_channels))
        next_states = np.zeros((batch_size, img_rows, img_cols, img_channels))
        actions = np.empty(batch_size, dtype=np.int64)
        rewards = np.empty(batch_size)
        dones = np.empty(batch_size, dtype=bool)

        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            states[i] = state
            actions[i] = action
            rewards[i] = reward
            next_states[i] = next_state
            dones[i] = done

        q_eval = self.model.predict(states)
        q_next = self.model.predict(next_states)
        q_next_prime = self.target_model.predict(next_states)

        # Double DQN: the online network picks the next action and the target
        # network supplies its value.
        rows = np.arange(batch_size)
        best_next = np.argmax(q_next, axis=1)
        bootstrapped = rewards + self.discount_factor * q_next_prime[rows, best_next]

        # Only the taken action's target differs from the current prediction,
        # so the other outputs contribute zero loss.  Terminal transitions use
        # the reward alone.
        q_target = q_eval.copy()
        q_target[rows, actions] = np.where(dones, rewards, bootstrapped)

        abs_errors = np.sum(np.abs(q_target - q_eval), axis=1)
        self.memory.batch_update(tree_idx, abs_errors)
        self.model.fit(x=states,
                       y=q_target,
                       batch_size=batch_size,
                       epochs=1,
                       verbose=0,
                       sample_weight=ISWeights)

In [None]:
def get_initial_state(img):
    """Build the first stacked state of an episode from a single frame.

    The frame is resized to 80x80, converted with ``cv2.COLOR_BGR2GRAY``,
    binarized with an Otsu threshold, and then repeated four times along the
    last axis.
    """
    resized = cv2.resize(img, (80, 80))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    # The threshold argument 0 is a placeholder; THRESH_OTSU picks the level.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # No history exists yet, so the same frame fills all four channels.
    return np.stack([binary] * 4, axis=2)

In [None]:
# Training driver: play FlappyBird episodes, store every transition in the
# agent's replay memory, and train once per frame after the warm-up period.
agent = DQNAgent()
scores, episodes = [], []
action_count = 0
ACTIONS = 2
EXPLORE = 250000  # divisor controlling how fast epsilon decays
env = gym.make('FlappyBird-v0')
EPISODES = 500000
np.random.seed(123)
num_frames = 0

for e in range(EPISODES):
    done = False
    score = 0
    img = env.reset()
    state = get_initial_state(img)

    while not done:
#         env.render()
        action = agent.get_action(state)
        next_img, reward, done, _ = env.step(action)

        # Preprocess the new frame the same way as the initial one.
        next_img = cv2.cvtColor(cv2.resize(next_img, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, next_img = cv2.threshold(next_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        next_img = next_img.reshape(img_rows, img_cols, 1)  # 80x80x1
        # Newest frame goes first; the oldest of the previous four is dropped.
        next_state = np.append(next_img, state[:, :, :3], axis=2)

        agent.append_sample(state, action, reward, next_state, done)

        # Train only once the replay memory holds more than train_start samples.
        if agent.memory.size > agent.train_start:
            if agent.epsilon > agent.epsilon_min:
                agent.epsilon -= (agent.epsilon - agent.epsilon_min) / EXPLORE
            tree_idx, minibatch, ISWeights = agent.memory.sample(agent.batch_size)
            agent.train_model(tree_idx, minibatch, ISWeights)

        num_frames += 1
        score += reward
        state = next_state

        # Sync the target network every 2000 frames.
        if num_frames % 2000 == 0:
            agent.update_target_model()

        if done:
            scores.append(score)
            episodes.append(e)
            print("episode:", e, " score:", score, " epsilon:", agent.epsilon, " num_frame: ",num_frames)

    # Checkpoint every 1000 episodes.  The bare ``e % 1000`` test was inverted:
    # it is truthy for every episode that is NOT a multiple of 1000.
    if e % 1000 == 0:
        agent.model.save('flappy_bird_model.h5')