In [1]:
import gym
import math
import random
import keras
import os
import tensorflow as tf
import numpy as np
import scipy as sp
import sklearn as sk
import matplotlib as mpl
import matplotlib.pyplot as plt

from tqdm import tqdm
from skimage.color import rgb2gray
from skimage.transform import rescale
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *
from keras.callbacks import *
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Global hyper-parameters consumed by DQN_Agent.
params = dict(
    gamma=0.999,      # discount factor
    eps_min=0.10,     # floor of the exploration rate
    eps_max=0.50,     # initial exploration rate
    decay=1e-4,       # exponential decay rate of the exploration rate per observed step
    batch_size=32,    # maximum number of transitions sampled per replay step
    lr=2e-4,          # optimizer learning rate
    capacity=10000,   # maximum number of transitions kept in replay memory
)


class DQN_Agent():

    """
    Deep Q-Network agent with an experience-replay memory and an
    epsilon-greedy exploration policy.

    Attributes:
    num_state:  shape of a single network input (used as Conv2D input_shape)
    num_action: number of actions under the action space (output layer size)
    gamma:  discount factor
    eps:  current exploration rate; decays exponentially from eps_max
          towards eps_min as observe() is called
    eps_min, eps_max, decay: parameters of the exploration schedule
    learning_rate: step size handed to the Adam optimizer
    batch_size: maximum number of transitions sampled per replay() call
    capacity: maximum number of transitions kept in memory
    memory: list of (state, action, next_state, reward, done) tuples,
            oldest first
    steps: number of transitions observed so far
    model: the compiled Keras Q-network
    params: parameters for Q-network (the dict the values above came from)

    """

    def __init__(self, num_state, num_action, params):
        """Store the hyper-parameters, build the Q-network and an empty memory.

        params must provide the keys 'gamma', 'eps_min', 'eps_max', 'decay',
        'lr', 'batch_size' and 'capacity'.
        """
        self.params = params
        self.num_state = num_state
        self.num_action = num_action
        self.steps = 0

        self.gamma = params['gamma']
        self.eps_min = params['eps_min']
        self.eps_max = params['eps_max']
        self.eps = params['eps_max']
        self.decay = params['decay']
        self.learning_rate = params['lr']
        self.batch_size = params['batch_size']
        self.capacity = params['capacity']
        self.model = self._create_model()
        self.memory = []

    def _create_model(self):
        """Build and compile the convolutional Q-network (one output per action)."""
        model = Sequential()

        model.add(Conv2D(input_shape=self.num_state, filters=48, kernel_size=8,
                         strides=2, activation='relu'))
        model.add(Conv2D(filters=96, kernel_size=4, strides=2,
                         activation='relu'))

        model.add(Flatten())
        # The hidden layer needs a non-linearity: two stacked linear Dense
        # layers are equivalent to a single linear layer.
        model.add(Dense(units=256, activation='relu',
                        kernel_initializer='glorot_normal'))
        model.add(Dense(self.num_action))

        # Use the configured learning rate instead of the optimizer default.
        # Passed positionally because the keyword is spelled `lr` in older
        # Keras releases and `learning_rate` in newer ones.
        optimizer = Adam(self.learning_rate)
        model.compile(optimizer=optimizer, loss='mse')

        print("Model constructed...", end ="\r", flush=True)

        return model

    def predict(self, state):
        """Return the Q-values for a batch of states."""
        return self.model.predict(state)

    def predict_one(self, state):
        """Return the Q-values for a single state as an array of shape (1, num_action)."""
        state = np.expand_dims(state, axis=0)
        return self.model.predict(state)

    def observe(self, state, action, next_state, reward, done):
        """Store one transition, evict the oldest if over capacity, and decay eps."""
        self.memory.append((state, action, next_state, reward, done))

        if len(self.memory) > self.capacity:
            self.memory.pop(0)

        self.eps = self.eps_min + (self.eps_max - self.eps_min) * math.exp(-self.steps * self.decay)
        self.steps += 1

    def act(self, state):
        """Pick an action epsilon-greedily: random with probability eps, else greedy."""
        if np.random.rand() < self.eps:
            action = np.random.choice(self.num_action)
        else:
            action = np.argmax(self.predict_one(state))
        return action

    def replay(self):
        """Fit the network on a random mini-batch drawn from memory.

        Returns:
            (history, mean_action_value) where history is whatever
            model.fit returns and mean_action_value is the mean of the
            Q-values predicted for the sampled states *before* the update.
            Returns (None, None) when the memory is empty.
        """
        sample_size = min(self.batch_size, len(self.memory))
        if sample_size == 0:
            return None, None

        # Keep the batch as a plain list of tuples: wrapping ragged
        # (array, int, array, float, bool) tuples in np.array() produces an
        # object array or raises, depending on the NumPy version.
        batch = random.sample(self.memory, sample_size)

        states = np.array([transition[0] for transition in batch])
        next_states = np.array([np.zeros(self.num_state) if transition[2] is None
                                else transition[2] for transition in batch])
        actions = np.array([transition[1] for transition in batch], dtype=int)
        rewards = np.array([transition[3] for transition in batch], dtype=float)
        done = np.array([transition[4] for transition in batch], dtype=bool)

        q = self.predict(states)
        q_next = self.predict(next_states)

        # Computed before the targets are written so the statistic reflects
        # the network's predictions only.
        mean_action_value = np.mean(q)

        # Start from a copy of the predictions so that only the taken
        # action's entry contributes to the loss.
        y = np.array(q, dtype=float)
        # No bootstrapping from the next state on terminal transitions.
        bootstrap = np.where(done, 0.0, self.gamma * np.max(q_next, axis=1))
        # Index each row with its own action (not the whole action array).
        y[np.arange(sample_size), actions] = rewards + bootstrap

        history = self.model.fit(states, y, verbose=0, epochs=1)

        return history, mean_action_value

    def train(self, state, action, next_state, reward, done):
        """Fit the network on a single transition and return the fit history."""
        if done:
            target_value = reward
        else:
            target_value = reward + self.gamma * np.max(self.predict_one(next_state))

        # Regress onto the full Q-vector with only the taken action replaced;
        # the fit target must have shape (1, num_action), not be a scalar.
        target = self.predict_one(state)
        target[0][action] = target_value

        state = np.expand_dims(state, axis=0)
        history = self.model.fit(state, target, verbose=0, epochs=1)
        return history
    

class input_pipeline():
    """Maintains a stack of the most recent preprocessed frames.

    Attributes:
    history_length: number of frames kept in the stack
    input_x: list of preprocessed 2-D frames, oldest first
    x: the frames stacked along the last axis,
       shape (height, width, history_length)
    """

    def __init__(self, state, history_length=4):
        """Fill the whole stack with copies of the first preprocessed frame.

        Args:
            state: raw frame, indexed as (row, column, channel).
            history_length: number of frames to stack (default 4).

        Raises:
            ValueError: if history_length is smaller than 1.
        """
        if history_length < 1:
            raise ValueError("history_length must be at least 1")

        self.history_length = history_length
        # Preprocess once and copy, instead of repeating identical work.
        first_frame = self._preprocess(state)
        self.input_x = [first_frame.copy() for _ in range(self.history_length)]
        self.x = np.moveaxis(np.array(self.input_x), 0, -1)

    def _preprocess(self, state):
        """Crop, average the channels, downsample by 2 and scale by 1/255.

        The input frame is not modified. For a frame with at least 195 rows
        and 154 columns the result has shape (83, 74).
        """
        state = state[30:195, 7:154, :]    # keep rows 30..194 and columns 7..153
        state = np.mean(state, axis=2)     # average over the channel axis
        state = state[::2, ::2]            # keep every second row and column
        # Remap four specific intensity values; the comparison is exact, so
        # only pixels whose channel mean equals these values are affected.
        state[state == 162] = 80
        state[state == 180] = 90
        state[state == 198] = 100
        state[state == 200] = 110
        return state / 255

    def update_state(self, state):
        """Drop the oldest frame, append the new preprocessed one, rebuild x."""
        self.input_x.pop(0)
        self.input_x.append(self._preprocess(state))
        self.x = np.moveaxis(np.array(self.input_x), 0, -1)
    


In [None]:
def main(n_episodes=None):
    """Train a DQN agent on Breakout-v0.

    Args:
        n_episodes: number of episodes to run. When None, the module-level
            N_EPISODE is used.

    The model is saved to 'dqn.h5' every 10th episode, and the environment
    is closed when training ends or is interrupted.
    """
    if n_episodes is None:
        n_episodes = N_EPISODE

    env = gym.make('Breakout-v0')

    try:
        num_action = env.action_space.n
        # (83, 74, 4): a cropped, downsampled frame stacked 4 deep, i.e. the
        # shape of input_pipeline.x for a 210x160 frame.
        agent = DQN_Agent((83, 74, 4), num_action, params)

        for episode in range(n_episodes):

            state, step, done = env.reset(), 0, False
            pipeline = input_pipeline(state)

            # The first episode runs without rendering; later ones are rendered.
            render = episode > 0

            while not done:
                if render:
                    env.render()

                action = agent.act(pipeline.x)
                next_state, reward, done, info = env.step(action)

                # update_state rebinds pipeline.x to a new array, so `x`
                # keeps referring to the stack from before the step.
                x = pipeline.x
                pipeline.update_state(next_state)
                x_new = pipeline.x

                agent.observe(x, action, x_new, reward, done)
                agent.replay()

                step += 1

            if episode % 10 == 0:
                agent.model.save('dqn.h5')

            print('Episode {}/{}, Step:{}'.format(episode+1, n_episodes, step), end = '\r', flush = True)
    finally:
        # Release the emulator and any render window, even on interruption.
        env.close()
            
       

In [None]:
# Number of training episodes; main() reads this module-level name.
N_EPISODE = 10000
# Run the training loop.
main()

[2017-11-20 09:44:33,554] Making new env: Breakout-v0


Episode 146/10000, Step:386