# REINFORCE : Monte-Carlo Policy Gradient

![좋은거](https://leimao.github.io/images/article/2017-05-04-REINFORCE-Policy-Gradient/Sutton_REINFORCE.png)

In [1]:
%%time
## Takes roughly 25-30 seconds
# Virtual display wrapper used to render the environment without a real screen.
!pip install pyvirtualdisplay 
# System packages: X virtual framebuffer, OpenGL bindings, and ffmpeg
# (presumably needed to encode the recorded videos - confirm).
!apt-get install -y xvfb python-opengl ffmpeg
!pip install gym
# Presumably the physics backend required by LunarLander - confirm against gym docs.
!pip install box2d-py
#!pip install pyglet==1.3.2
!pip install pyglet

Collecting pyvirtualdisplay
  Downloading https://files.pythonhosted.org/packages/ad/05/6568620fed440941b704664b9cfe5f836ad699ac7694745e7787fbdc8063/PyVirtualDisplay-2.0-py2.py3-none-any.whl
Collecting EasyProcess
  Downloading https://files.pythonhosted.org/packages/48/3c/75573613641c90c6d094059ac28adb748560d99bd27ee6f80cce398f404e/EasyProcess-0.3-py2.py3-none-any.whl
Installing collected packages: EasyProcess, pyvirtualdisplay
Successfully installed EasyProcess-0.3 pyvirtualdisplay-2.0
Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
Suggested packages:
  libgle3
The following NEW packages will be installed:
  python-opengl xvfb
0 upgraded, 2 newly installed, 0 to remove and 13 not upgraded.
Need to get 1,280 kB of archives.
After this operation, 7,686 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 python-opengl all 3.1.0+df

In [2]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

In [3]:
# Start an invisible (visible=0) 1400x900 virtual display so rendering works
# in a notebook session without a physical screen.
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f6431647e50>

In [4]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

def show_video():
  """Embed a recorded episode from the ``video`` directory in the notebook.

  Looks for ``video/*.mp4`` relative to the current working directory,
  base64-encodes the last file in sorted filename order and displays it as
  an inline HTML ``<video>`` element. Prints "Could not find video" when no
  recording exists.
  """
  # glob.glob() yields paths in arbitrary order; sorting makes the choice of
  # the "last" recording deterministic.
  mp4list = sorted(glob.glob('video/*.mp4'))
  if not mp4list:
    print("Could not find video")
    return

  # The file is only read, so open it read-only and let the context manager
  # close the handle as soon as the bytes are in memory.
  with open(mp4list[-1], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Wrap ``env`` in a gym ``Monitor`` that writes to ``./video`` and return it.

  ``force=True`` is forwarded to ``Monitor``; presumably this lets it reuse
  the directory when earlier recordings are present - confirm in gym docs.
  """
  return Monitor(env, './video', force=True)

# Lunar Lander

In [5]:
# Smoke test: play one episode with uniformly random actions and record it.
env = wrap_env(gym.make("LunarLander-v2"))
print('observation space:', env.observation_space)
print('action space:', env.action_space)

try:
    state = env.reset()
    for t in range(1000):
        action = env.action_space.sample()  # your agent here (this takes random actions)
        env.render()
        observation, reward, done, info = env.step(action)
        if done:
            break

    # t is the zero-based index of the last step that was taken.
    print('steps: ', t)
finally:
    # Close the environment even if stepping or rendering raised, so the
    # monitor wrapper is not left open.
    env.close()
show_video()

observation space: Box(-inf, inf, (8,), float32)
action space: Discrete(4)
steps:  69


In [6]:
# Report the observation shape and the number of discrete actions; the policy
# network's input and output layers have to match these.
print("state 수는? : ", env.observation_space.shape)
print("action 수는? : ", env.action_space.n)

state 수는? :  (8,)
action 수는? :  4


In [7]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Input, Dense, Add, Subtract, Average, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.utils import plot_model

# Policy Network

1. 히든레이어는 자유롭게 구성하시오.
    * 너무 과하거나 모자라면 엄청 오래걸림
2. policy(a|s)를 구현하면 됨
    * 인풋이 state
    * 아웃풋은 액션별 확률 ('softmax')

In [8]:
keras.backend.clear_session()

# Policy network pi(a|s): takes an observation and outputs one probability
# per action (softmax). Layer sizes are read from the environment rather than
# hard-coded, so the input/output dimensions always match the task.
state_layer = Input(shape=env.observation_space.shape)
hidden = Dense(128, activation='relu')(state_layer)
hidden = Dense(80, activation='relu')(hidden)
policy_layer = Dense(env.action_space.n, activation='softmax')(hidden)

policy_network = Model(state_layer, policy_layer)

# The training cells feed targets of the form one_hot(action) * return, so
# categorical cross-entropy evaluates to -return * log pi(action | state).
policy_network.compile(loss='categorical_crossentropy',
                       optimizer=Adam(learning_rate=0.01))

policy_network.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
dense (Dense)                (None, 128)               1152      
_________________________________________________________________
dense_1 (Dense)              (None, 80)                10320     
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 324       
Total params: 11,796
Trainable params: 11,796
Non-trainable params: 0
_________________________________________________________________


# Policy를 1회 학습시켜보자.

### 1. policy network를 이용하여 한번의 episode를 기록한다.

In [9]:
# Per-step records of the episode, filled in chronological order.
state_history = []
action_history = []
reward_history = []

# Play exactly one episode, sampling every action from the current policy.
env = wrap_env(gym.make("LunarLander-v2"))
try:
    s0 = env.reset()
    done = False
    time_step = 0
    while not done:
        time_step += 1

        s0 = s0.reshape([1, -1])  # add a leading batch axis for predict()
        policy = policy_network.predict(s0).reshape(-1)
        a0 = np.random.choice(env.action_space.n, p=policy)
        s1, r1, done, _ = env.step(a0)

        state_history.append(s0)
        # Store the action as a (1, n_actions) one-hot row.
        action_history.append(np.eye(env.action_space.n)[a0].reshape([1, -1]))
        reward_history.append(r1)

        if time_step % 50 == 0:
            print(" TimeStep : {} 진행중".format(time_step))

        s0 = s1
finally:
    # Close the environment even when the rollout is interrupted or fails.
    env.close()
show_video()

 TimeStep : 50 진행중


# 학습을 진행!

# Return Normalization!

In [10]:
gamma = 0.99  # Discount factor applied to future rewards when computing returns
num_inputs = env.observation_space.shape[0]  # length of the observation vector
num_actions = env.action_space.n  # number of discrete actions
eps = np.finfo(np.float32).eps.item()  # float32 machine epsilon: a tiny constant added to the std below so the division cannot hit zero

In [11]:
# Discounted return of every time step: G_t = r_t + gamma * G_{t+1}.
# The rewards are walked backwards so each G_t reuses the return after it.
returns = []
Gt = 0
for r in reversed(reward_history):
    Gt = r + gamma * Gt
    returns.append(Gt)  # O(1); insert(0, ...) would shift the whole list on every step
returns.reverse()  # back to chronological order

In [12]:
# Normalize
returns = np.array(returns)
returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
returns = returns.reshape([-1,1])
returns.shape

(97, 1)

In [13]:
# Stack the recorded (1, obs_dim) rows into one array with a row per time step.
state = np.vstack(state_history)
state.shape

(97, 8)

In [14]:
# Stack the recorded one-hot action rows into one array with a row per time step.
action_dummy = np.vstack(action_history)
action_dummy.shape

(97, 4)

In [15]:
# Each row keeps only the slot of the action that was taken, scaled by that
# step's normalized return; all other slots stay zero.
G_target = action_dummy*returns

In [16]:
# One training pass over the episode; batch_size is far larger than the number
# of recorded steps, so the whole episode is used as a single batch.
policy_network.fit(state, G_target, verbose=0, epochs=1, batch_size=100000)

<tensorflow.python.keras.callbacks.History at 0x7f63e6d376d0>

# Episode 2000회 학습을 시켜보자!

In [17]:
from collections import deque
# Bounded buffer: once 5 items are stored, appending another drops the oldest.
memory = deque(maxlen=5)

In [None]:
n_episode = 2000
gamma = 0.99
rewards_cum = []  # undiscounted reward total of every finished episode

for episode in range(n_episode):
    print('--------{}---------'.format(episode))

    #### Generate one episode with the current policy
    state_history = []
    action_history = []
    reward_history = []

    env = wrap_env(gym.make("LunarLander-v2"))
    try:
        s0 = env.reset()
        done = False
        while not done:
            s0 = s0.reshape([1, -1])  # add a leading batch axis for predict()
            policy = policy_network.predict(s0).reshape(-1)
            a0 = np.random.choice(env.action_space.n, p=policy)
            s1, r1, done, _ = env.step(a0)

            state_history.append(s0)
            # Store the action as a (1, n_actions) one-hot row.
            action_history.append(np.eye(env.action_space.n)[a0].reshape([1, -1]))
            reward_history.append(r1)

            s0 = s1
    finally:
        # Close the environment even if the rollout fails or the long-running
        # cell is interrupted, so the monitor wrapper is not left open.
        env.close()
    rewards_cum.append(sum(reward_history))

    #### Train
    # `memory` is a bounded deque, so this replays the newest episode together
    # with the few episodes stored before it.
    # NOTE(review): replaying older episodes means some updates use data from
    # an earlier policy - confirm this is intended for REINFORCE.
    memory.append((state_history, action_history, reward_history))
    for states, actions, rewards in memory:
        # Discounted returns G_t = r_t + gamma * G_{t+1}, accumulated backwards
        # with O(1) appends and flipped back to chronological order.
        returns = []
        Gt = 0
        for r in reversed(rewards):
            Gt = r + gamma * Gt
            returns.append(Gt)
        returns.reverse()

        # Normalize; eps keeps the denominator non-zero.
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.reshape([-1, 1])  # column vector for broadcasting

        state = np.vstack(states)
        action_dummy = np.vstack(actions)
        # One-hot action rows scaled by the step's normalized return.
        G_target = action_dummy * returns
        # batch_size exceeds any episode length, so each fit is one full batch.
        policy_network.fit(state, G_target, verbose=0, epochs=1, batch_size=100000)

    # Progress report every 10 episodes: last action distribution, latest
    # video and the reward curve so far.
    if (episode + 1) % 10 == 0:
        print("{}번째 episode 학습 완료".format(episode+1))
        print(policy)
        show_video()
        plt.plot(rewards_cum)
        plt.show()
