# DQN (Frozen Lake)
---
### 환경
---
* DQN step 마다 학습하지 않고 데이터만 쌓는다.
* 나중에 학습과 타겟 신경망 업데이트 하는것이 최적

In [1]:
from keras.layers import Dense
from keras.models import Sequential
import numpy as np
import gym
from gym.envs.registration import register
from keras import optimizers
from keras import initializers
import random
from collections import deque

Using TensorFlow backend.


In [2]:
'''
환경 생성
'''
env = gym.make('CartPole-v1')

In [16]:
class DQN_Agent:
    def __init__(self, n_state, n_action):
        self.n_state = n_state
        self.n_action = n_action
        
        self.gamma = .95
        self.lr = .01
        self.epsilon = .2
        self.batch_size = 16
        self.train_start = 1000
        
        self.memory = deque()
        
        self.model = self.build_model()
        self.target_model = self.build_model()
        
        self.update_target_model()
        
    def build_model(self):
        model = Sequential()
        model.add(Dense(12, input_dim=env.observation_space.shape[0], activation='tanh'))
        model.add(Dense(8, activation='tanh'))
        #model.add(Dense(8))
        model.add(Dense(env.action_space.n, activation='tanh'))
        
        #model.summary()
        model.compile(loss='mse', optimizer='adam')
    
        return model
        
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
        
    def append_sample(self, state, actoin, reward, state_next, done):
        self.memory.append((state, action, reward, state_next, done))
    
    def train_model(self):
        mini_batch = random.sample(self.memory, self.batch_size)
        states = np.zeros((self.batch_size, self.n_state))
        states_next = np.zeros((self.batch_size, self.n_state))
        actions, rewards, dones = [], [], []
        
        for i in range(self.batch_size):
            states[i] = mini_batch[i][0]
            actions.append(mini_batch[i][1])
            rewards.append(mini_batch[i][2])
            states_next[i] = mini_batch[i][3]
            dones.append(mini_batch[i][4])
            
        target = self.model.predict(states)
        target_val = self.target_model.predict(states_next)
        
        for i in range(self.batch_size):
                target[i][actions[i]] = rewards[i] + self.gamma * (np.argmax(target_val[i]))
                
        self.model.fit(states, target, batch_size=self.batch_size, epochs=1, verbose=0)

In [17]:
episode = 0
max_episode = 1000

state = env.reset()
action = env.action_space.sample()
step = 0

agent = DQN_Agent(env.observation_space.shape[0], env.action_space.n)

for ep in range(max_episode):
    
    state = env.reset()
    state = np.reshape(state, [1, 4])
    done = False
    #agent.epsilon -= .001
    agent.epsilon = 1. / ((episode / 10) + 1)
    
    while not done:
        
        # action 선택
        if(random.random() > agent.epsilon):
            action = agent.model.predict(state)
            action = np.argmax(action)
            
        else:
            action = env.action_space.sample()
        
        # step 진행
        state_next, reward, done, _ = env.step(action)
        state_next = np.reshape(state_next, [1, 4])
        # 실패 경우 보상 처리
        if(done):
            reward = -100
        
        # sample 저장
        agent.append_sample(state, action, reward, state_next, done)
        if(len(agent.memory) > 50000) :
                agent.memory.popleft()
    
    # 10 에피소드마다 학습 및 업데이트
    if(ep % 5 == 0):
        agent.train_model()
        agent.update_target_model()
    
env.close()

ValueError: Sample larger than population or is negative

In [None]:
for ep in range(1000):
    done = False
    s = env.reset()
    reward = 0
    
    while(True):
        #env.render()
        s = np.reshape(s, [1, 4])
        a = agent.model.predict(s)
        #print(a)
        a = np.argmax(a)
        s, r, d, _ = env.step(a)
        reward += r
        if(d):
            print(ep, reward)
            reward = 0
            break
env.close()