# DQN (Frozen Lake)
---
### 환경

In [17]:
from keras.layers import Dense
from keras.models import Sequential
import numpy as np
import gym
from gym.envs.registration import register
from keras import optimizers
from keras import initializers
import random
from collections import deque

In [3]:
'''
환경셋팅 한 후에 환경을 추가등록한다.
'''

register(
    id='FrozenLake-v1',
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs={'map_name':'4x4','is_slippery':False})

In [4]:
'''
환경 생성
'''
env = gym.make('FrozenLake-v1')

In [48]:
class DQN_Agent:
    def __init__(self, n_state, n_action):
        self.n_state = n_state
        self.n_action = n_action
        
        self.gamma = .95
        self.lr = .01
        self.epsilon = 1
        self.batch_size = 32
        self.train_start = 1000
        
        self.memory = deque(maxlen=2000)
        
        self.model = self.build_model()
        self.target_model = self.build_model()
        
        self.update_target_model()
        
    def build_model(self):
        model = Sequential()
        model.add(Dense(8, input_dim=self.n_state))
        model.add(Dense(16))
        model.add(Dense(6))
        model.add(Dense(self.n_action, activation='sigmoid'))
        
        #model.summary()
        model.compile(loss='mse', optimizer=optimizers.Adam(self.lr))
    
        return model
        
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
        
    def append_sample(self, state, actoin, reward, state_next, done):
        self.memory.append((state, action, reward, state_next, done))
    
    def train_model(self):
        #if(self.epsilon > self.epsi)
        mini_batch = random.sample(self.memory, self.batch_size)
        states = np.zeros((self.batch_size, self.n_state))
        states_next = np.zeros((self.batch_size, self.n_state))
        actions, rewards, dones = [], [], []
        
        for i in range(self.batch_size):
            states[i] = mini_batch[i][0]
            actions.append(mini_batch[i][1])
            rewards.append(mini_batch[i][2])
            states_next[i] = mini_batch[i][3]
            dones.append(mini_batch[i][4])
            
        target = self.model.predict(states)
        target_val = self.target_model.predict(states_next)
        
        for i in range(self.batch_size):
            if dones[i]:
                target[i][actions[i]] = rewards[i]
            else:
                target[i][actions[i]] = rewards[i] + self.gamma * (np.amax(target_val[i]))
                
        self.model.fit(states, target, batch_size=self.batch_size, epochs=1, verbose=0)

In [55]:
episode = 0
max_episode = 300

state = env.reset()
action = env.action_space.sample()
step = 0

agent = DQN_Agent(env.observation_space.n, env.action_space.n)

for ep in range(max_episode):
    
    state = env.reset()
    state = np.eye(16)[state]
    state = np.reshape(state, [1, 16])
    done = False
    agent.epsilon -= .0033
    
    while not done:
        
        # action 선택
        if(random.random() > agent.epsilon):
            action = agent.model.predict(state)
            action = np.argmax(action)
            
        else:
            action = env.action_space.sample()
        
        # step 진행
        state_next, reward, done, _ = env.step(action)
        state_next = np.eye(16)[state_next]
        state_next = np.reshape(state_next, [1, 16])
        # 실패 경우 보상 처리
        if(done and reward < 1):
            reward = -1
        
        # sample 저장
        agent.append_sample(state, action, reward, state_next, done)
        
        # 학습
        if(agent.train_start < len(agent.memory)):
            agent.train_model()
    
env.close()

In [56]:
s = env.reset()
while(True):
    s = np.eye(16)[s]
    s = np.reshape(s, [1, 16])
    a = agent.model.predict(s)
    a = np.argmax(a)
    print(s)
    s, r, d, _ = env.step(a)
    if(d):
        env.render()
        break

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[1. 0. 0. 0

KeyboardInterrupt: 