In [None]:
import os

# TensorFlow's native runtime reads TF_CPP_MIN_LOG_LEVEL when it is first
# loaded, so the variable must be set before tensorflow (or keras, which pulls
# in tensorflow) is imported; setting it afterwards does not silence the logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import random
import time
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Dense, Dropout
from keras.models import Sequential

class DQN():
    """Deep Q-Network agent with experience replay and a target network.

    The agent keeps two identically shaped Keras models: ``evaluate_model`` is
    trained on replayed transitions, while ``target_model`` supplies the
    bootstrap value ``max_a Q(next_state, a)`` and is overwritten with the
    evaluate model's weights every ``replace_target_iter`` steps.

    Hyperparameters are read from the module-level names (``OBSERVE``,
    ``gamma``, ``batch_size``, ...) once, in ``__init__``; every other method
    uses the copies stored on the instance.
    """

    # A gradient update is attempted once every TRAIN_INTERVAL perceived steps.
    TRAIN_INTERVAL = 4

    def __init__(self, env):
        """Build the agent for *env*.

        Args:
            env: environment exposing ``action_space.n``,
                ``action_space.sample()`` and ``observation_space.shape``.
        """
        self.env = env

        self.OBSERVE = OBSERVE  # observation phase: steps before learning/decay start
        self.EXPLORE = EXPLORE  # exploration phase
        self.TRAIN = TRAIN  # training phase
        self.EPISODE = EPISODE
        self.STEP = STEP

        self.gamma = gamma
        self.lr = learning_rate
        self.epsilon = initial_epsilon
        self.epsilon_min = final_epsilon
        self.epsilon_decay = epsilon_decay  # amount subtracted from epsilon per action

        self.batch_size = batch_size  # transitions sampled per update
        self.memory_size = memory_size  # replay buffer capacity
        self.learn_steps = learn_steps  # perceived-step counter that gates learning
        self.replace_target_iter = replace_target_iter  # steps between target syncs

        # Bounded replay buffer: once full, appending evicts the oldest entry.
        self.replayMemory = deque(maxlen=self.memory_size)

        self.action_dim = self.env.action_space.n  # number of discrete actions
        self.state_dim = self.env.observation_space.shape[0]  # length of a state vector

        self.evaluate_model = self.create_network()  # network being trained
        self.target_model = self.create_network()  # network providing Q targets
        # Start both networks from the same weights; otherwise every update
        # before the first periodic sync bootstraps from an unrelated network.
        self.target_model.set_weights(self.evaluate_model.get_weights())

    def create_network(self):
        """Return a compiled MLP mapping a state vector to one Q-value per action."""
        model = Sequential()

        model.add(Dense(units = 128,
                        input_dim = self.state_dim,
                        kernel_initializer = 'random_normal',
                        activation = "relu"))
        model.add(Dense(units = 128,
                        kernel_initializer = 'random_normal',
                        activation = "relu"))
        model.add(Dense(units = 128,
                        kernel_initializer = 'random_normal',
                        activation = "relu"))
        model.add(Dense(units = self.action_dim,
                        kernel_initializer = 'uniform',
                        activation = "linear"))

        # NOTE(review): tf.train.AdamOptimizer is the TensorFlow 1.x API.
        model.compile(loss = "mean_squared_error",
                      optimizer = tf.train.AdamOptimizer(self.lr))
        return model

    def get_action(self, state):
        """Choose an action for *state* with an epsilon-greedy policy.

        Once ``learn_steps`` has passed the observation phase, epsilon is
        reduced by ``epsilon_decay`` on every call and never drops below
        ``epsilon_min``.

        Args:
            state: array accepted by ``evaluate_model.predict`` (a single-row
                batch; only the first row of the prediction is used).

        Returns:
            A random action from the environment with probability epsilon,
            otherwise the index of the highest predicted Q-value.
        """
        if self.learn_steps > self.OBSERVE and self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.evaluate_model.predict(state)[0])

    def train_Q_network(self):
        """Run one gradient update of ``evaluate_model`` on a sampled minibatch.

        Does nothing when the replay buffer holds fewer than ``batch_size``
        transitions. Stored states are assumed to be ``(1, state_dim)`` or
        ``(state_dim,)`` arrays and stored actions one-hot vectors, which is
        what ``percieve`` puts in the buffer.
        """
        if len(self.replayMemory) < self.batch_size:
            return

        # Refresh the target network every replace_target_iter steps.
        if self.learn_steps % self.replace_target_iter == 0:
            self.target_model.set_weights(self.evaluate_model.get_weights())

        # Step 1: draw a random minibatch from the replay buffer.
        minibatch = random.sample(self.replayMemory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = np.vstack(states).astype(float)
        next_states = np.vstack(next_states).astype(float)
        action_idx = np.argmax(np.vstack(actions), axis=1)  # one-hot -> index
        rewards = np.asarray(rewards, dtype=float)
        not_done = ~np.asarray(dones, dtype=bool)

        # Step 2: build the regression targets with two batched forward passes
        # instead of two predict() calls per sampled transition.
        targets = np.array(self.evaluate_model.predict(states), dtype=float)
        next_q_max = np.max(self.target_model.predict(next_states), axis=1)

        # Only the taken action's Q-value moves: y = r for terminal
        # transitions, y = r + gamma * max_a' Q_target(s', a') otherwise.
        rows = np.arange(len(minibatch))
        targets[rows, action_idx] = rewards + self.gamma * next_q_max * not_done

        # Step 3: fit the evaluate model on the whole minibatch.
        self.evaluate_model.fit(states, targets, batch_size = self.batch_size, epochs = 1, verbose = 0)

    def percieve(self, state, action, reward, next_state, done, episode):
        """Store one transition and train when the schedule allows it.

        Training runs only when the buffer holds more than ``batch_size``
        transitions, ``learn_steps`` is a multiple of ``TRAIN_INTERVAL`` and
        the observation phase is over. ``learn_steps`` is incremented on
        every call.

        Args:
            state: state before the action.
            action: integer index of the action taken.
            reward: reward received for the action.
            next_state: state after the action.
            done: whether the episode ended with this transition.
            episode: current episode number (accepted for API compatibility;
                not used).
        """
        one_hot_action = np.zeros(self.action_dim)
        one_hot_action[action] = 1

        # The deque's maxlen evicts the oldest transition when the buffer is full.
        self.replayMemory.append((state, one_hot_action, reward, next_state, done))

        if (len(self.replayMemory) > self.batch_size
                and self.learn_steps % self.TRAIN_INTERVAL == 0
                and self.learn_steps > self.OBSERVE):
            self.train_Q_network()

        self.learn_steps += 1

    # Correctly spelled alias; the original name is kept for existing callers.
    perceive = percieve

    def plot_reward(self, Episodes, Reward):
        """Plot *Reward* against *Episodes* and show the figure."""
        plt.plot(Episodes, Reward, marker = 'o')
        plt.title('Avg Reward of last 100 episodes')
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.show()
        
# Hyperparameters (read by DQN.__init__ and main()).
OBSERVE = 45_000        # steps before epsilon decay and training begin
EXPLORE = 900_000       # steps over which epsilon falls from initial to final
TRAIN = 24_000_000
EPISODE = 5_000         # number of episodes main() runs
STEP = (OBSERVE + EXPLORE + TRAIN) // EPISODE  # per-episode step cap

gamma = 0.99            # discount factor
learning_rate = 0.00025

initial_epsilon = 1.0
final_epsilon = 0.1
epsilon_decay = (initial_epsilon - final_epsilon) / EXPLORE  # per-action decrement

batch_size = 32         # transitions sampled per update
memory_size = 200_000   # replay buffer capacity
learn_steps = 0         # initial value of the agent's step counter
replace_target_iter = 2_500  # steps between target-network syncs

def main():
    """Train a DQN agent on Breakout (RAM observations) and report progress.

    Runs ``EPISODE`` episodes of at most ``STEP`` steps each. Every 100
    episodes it prints the mean reward of the last 100 episodes and the
    cumulative wall-clock time. After training it plots the recorded averages
    and prints the mean time per episode. The environment is closed even if
    training is interrupted by an exception.
    """
    env = gym.make('Breakout-ram-v0')
    agent = DQN(env)
    Reward = []
    Episodes = []
    cost_time = []
    Total_reward = []

    try:
        for episode in range(1, EPISODE + 1):
            tStart = time.time()  # start timing this episode

            total_reward = 0

            # Reshape to a single-row batch, which is what the networks expect.
            state = env.reset().reshape(1, agent.state_dim)
            for _ in range(STEP):
                action = agent.get_action(state)
                next_state, reward, done, _info = env.step(action)
                next_state = next_state.reshape(1, agent.state_dim)
                total_reward += reward
                agent.percieve(state, action, reward, next_state, done, episode)
                state = next_state

                # Stop as soon as the game is over.
                if done:
                    break

            Total_reward.append(total_reward)
            cost_time.append(time.time() - tStart)  # stop timing

            if episode % 100 == 0:
                avg_reward = sum(Total_reward[-100:]) / 100
                Episodes.append(episode)
                Reward.append(avg_reward)

                print('100 Episodes:', int(episode/100), 'Avg Reward of last 100 episodes:', avg_reward)
                print('cost time:', sum(cost_time))
                print('---------------------------------------------------------')
    finally:
        # Release the emulator/window resources held by the environment.
        env.close()

    agent.plot_reward(Episodes, Reward)
    print(f'The average training time per episode is {np.mean(cost_time):.2f} seconds.')
    
# Start training only when executed as a script, not when imported.
if "__main__" == __name__:
    main()

100 Episodes: 0.0 Total Reward is: 0.0
cost time: 0.06499671936035156
-----------------------------------------
100 Episodes: 1.0 Total Reward is: 1.14
cost time: 9.177297592163086
-----------------------------------------
100 Episodes: 2.0 Total Reward is: 2.32
cost time: 18.70869541168213
-----------------------------------------
100 Episodes: 3.0 Total Reward is: 3.76
cost time: 92.50523018836975
-----------------------------------------
100 Episodes: 4.0 Total Reward is: 5.5
cost time: 160.23234939575195
-----------------------------------------
100 Episodes: 5.0 Total Reward is: 7.77
cost time: 237.16179656982422
-----------------------------------------
100 Episodes: 6.0 Total Reward is: 10.59
cost time: 318.02202463150024
-----------------------------------------
