# Policy Gradient (Frozen Lake)
---
## 환경
* state : discrete[16]
* action : discrete[4]
* 목표에 도달 시 +1 점, 구멍에 빠질 때 -1 점, 나머지 0 점.
---
* input : state (array[16])
* output : Q (prob)
---
* 상태를 바꿔넣는데 신경망 출력값이 동일하게 나오는 현상? - 마이너스 보상을 피하려고 맵밖으로 가는 행동때문
* 환경이 좀 별로다. 긍정보상을 받아야 학습이 되는데 긍정보상 받을 확률조차 낮다.
* 너무 안된다. 사이즈가 작으면 오히려 Q - Table 학습이 맞는건가

In [1]:
import numpy as np
import tensorflow as tf
import gym
from gym.envs.registration import register
from keras.layers import Dense
from keras.models import Sequential
from keras import backend as K
from keras.optimizers import Adam
import random

Using TensorFlow backend.


In [2]:
'''
환경셋팅 한 후에 환경을 추가등록한다.
'''

register(
    id='FrozenLake-v1',
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs={'map_name':'4x4','is_slippery':False})

In [3]:
'''
환경 생성
'''
env = gym.make('FrozenLake-v1')

In [46]:
size_in = env.observation_space.n
size_out = env.action_space.n
size_w1 = 8
size_w2 = 12
size_w3 = 6
save_file = 'C:\\Users\\김민수\\Documents\\GitHub\\RL\\vars\\cartpole_pg'

lr = .05 ## learning rate
total_episode = 1000
epsilon = 1
gamma = .95  ##Discounted factor
penalty_step = .01

reward = tf.placeholder(tf.float32)
STATE_IN = tf.placeholder(tf.float32, [None, size_in])
W_1 = tf.Variable(tf.random_normal([size_in, size_w1],stddev=.01), name='W_1')
W_2 = tf.Variable(tf.random_normal([size_w1, size_w2],stddev=.01), name='W_2')
W_3 = tf.Variable(tf.random_normal([size_w2, size_w3],stddev=.01), name='W_3')
out = tf.Variable(tf.random_normal([size_w3, size_out],stddev=.01), name='out')

In [47]:
L_1 = tf.matmul(STATE_IN, W_1)
L_2 = tf.matmul(L_1, W_2)
L_3 = tf.matmul(L_2, W_3)
prob = tf.sigmoid(tf.matmul(L_3, out))

loss = -tf.reduce_mean(tf.log(prob) * reward) ##pg는 신경망을 거치면 정책(pi)이 바로 나온다.
train = tf.train.AdamOptimizer(lr).minimize(loss)

In [48]:
def discounted_reward(r):
    dr = np.zeros_like(r)
    sum_r = 0
    
    for i in reversed(range(0, r.size)):
        sum_r = gamma*sum_r + r[i]
        dr[i] = sum_r  
    
    return dr

In [52]:
saver = tf.train.Saver()

with tf.Session() as sess:
    tf.global_variables_initializer().run()
    state_buffer = []
    reward_buffer = []
    step = 0
    ep = 0
    state = env.reset()

    while(total_episode > ep):
        #env.render()
        step += 1
        state = np.eye(16)[state]
        state = np.reshape(state, [1, size_in])

        #상태 정보 stack
        state_buffer.append(state)

        # action 선택
        if(random.random() < epsilon):
            action = env.action_space.sample()
        else:
            pi = sess.run(prob, feed_dict={STATE_IN:state})
            action = np.argmax(pi)

        # step 진행
        new_state, r, d, _ = env.step(action)
        state = new_state
        r -= step * penalty_step

        # 보상 정보 stack
        reward_buffer.append(r) ##보상 정보 stack

        if d:
            # 목적지 도착 시 epsilon 조절
            if(r > 0):
                print(r)
                epsilon = 1 / (ep / (total_episode / 10) + 1)
            else:
                reward_buffer[-1] = -1

            ep += 1

            eps = np.vstack(state_buffer)
            epr = np.vstack(reward_buffer)
            epr = discounted_reward(epr)##데이터 정리
            state_buffer, reward_buffer = [], []

            # 학습
            sess.run(train, feed_dict={STATE_IN:eps, reward:epr})
            step = 0
            state = env.reset()
    
    # 검증
    while(True):
        state = np.eye(16)[state]
        state = np.reshape(state, [1, size_in])
        #print(state)
        pi = sess.run(prob, feed_dict={STATE_IN:state})
        a = np.argmax(pi)
        
        state, r, d, _ = env.step(a)
        
        if(d):
            env.render()
            break
env.close()

0.37


KeyboardInterrupt: 