# Deep Q Network 여행 연습
# Part I. Function Approximation : Q-Network

![좋은 그림](https://media.springernature.com/full/springer-static/image/art%3A10.1038%2Fnature14236/MediaObjects/41586_2015_Article_BFnature14236_Fig1_HTML.jpg)

[읽어보면 좋은 것 1](https://arxiv.org/pdf/1312.5602.pdf)<br>
[읽어보면 좋은 것 2](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf)



# 라이브러리 설치 / 불러오기

In [1]:
%%time
## Takes roughly 25-30 seconds
# Python package that is imported further down as `pyvirtualdisplay`.
!pip install pyvirtualdisplay 
# System packages: xvfb, python-opengl and ffmpeg.
!apt-get install -y xvfb python-opengl ffmpeg
!pip install gym
!pip install box2d-py
#!pip install pyglet==1.3.2
!pip install pyglet

Collecting pyvirtualdisplay
  Downloading https://files.pythonhosted.org/packages/ad/05/6568620fed440941b704664b9cfe5f836ad699ac7694745e7787fbdc8063/PyVirtualDisplay-2.0-py2.py3-none-any.whl
Collecting EasyProcess
  Downloading https://files.pythonhosted.org/packages/48/3c/75573613641c90c6d094059ac28adb748560d99bd27ee6f80cce398f404e/EasyProcess-0.3-py2.py3-none-any.whl
Installing collected packages: EasyProcess, pyvirtualdisplay
Successfully installed EasyProcess-0.3 pyvirtualdisplay-2.0
Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
Suggested packages:
  libgle3
The following NEW packages will be installed:
  python-opengl xvfb
0 upgraded, 2 newly installed, 0 to remove and 10 not upgraded.
Need to get 1,280 kB of archives.
After this operation, 7,686 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 python-opengl all 3.1.0+df

In [2]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

In [3]:
# Create and start a pyvirtualdisplay Display (visible=0, 1400x900).
# NOTE(review): presumably this gives env.render() a screen to draw on when
# the notebook runs on a headless host - confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f846ff023c8>

비디오 녹화용 함수

In [4]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)".
"""

def show_video():
  """Display the newest recording found in ``video/`` inline in the notebook.

  Searches for ``video/*.mp4``. If any file matches, the one with the latest
  modification time is read, base64-encoded, and rendered as an HTML5
  ``<video>`` element through IPython's display machinery. If nothing
  matches, ``"Could not find video"`` is printed instead.
  """
  import os  # local import: the notebook's import cell does not bring in os

  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # glob.glob() makes no ordering guarantee, so taking the last list element
  # does not reliably give the latest recording; select by mtime instead.
  mp4 = max(mp4list, key=os.path.getmtime)

  # Read-only binary mode is enough, and the context manager closes the
  # handle (the previous io.open(...).read() left it to the garbage collector).
  with open(mp4, 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return *env* wrapped in a gym ``Monitor`` that targets ``./video``.

  The monitor is created with ``force=True``.
  """
  return Monitor(env, './video', force=True)

# CartPole

[CartPole 링크](https://gym.openai.com/envs/CartPole-v1/)
<br>
<img src='https://inspaceai.github.io/images/lhh/cartpole_rl_compare/cartpole.gif'>

In [5]:
# Build a monitored CartPole-v1 environment and print its spaces.
env = wrap_env(gym.make("CartPole-v1"))
print('observation space:', env.observation_space)
print('action space:', env.action_space)

# Play one episode with randomly sampled actions, capped at 1000 iterations.
state = env.reset()
for t in range(1000):
    action = env.action_space.sample() # your agent here (this takes random actions)
    env.render()
    observation, reward, done, info = env.step(action)
    # Stop as soon as the environment reports the episode as finished.
    if done: 
      break;
            
# t is the zero-based index of the last loop iteration that ran.
print('steps: ', t)
env.close()
show_video()

observation space: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
action space: Discrete(2)
steps:  29


# Deep Neural Network for Q-function

**Q-function 기능을 할 뉴럴넷을 구성할 것이다.**
1. input은 state다. (노드 수는?)
2. output은 그 state에서 취할 수 있는 action에 대한 Q값이다. (노드 수는?)

![좋은그림](https://d2908q01vomqb2.cloudfront.net/f1f836cb4ea6efb2a0b1b99f41ad8b103eff4b59/2019/11/20/Fig2-DeepRL-SageMaker.gif)

### State와 Action 정의
* Observation:
        Type: Box(4)
        Num     Observation               Min                     Max
        0       Cart Position             -4.8                    4.8
        1       Cart Velocity             -Inf                    Inf
        2       Pole Angle                -0.418 rad (-24 deg)    0.418 rad (24 deg)
        3       Pole Angular Velocity     -Inf                    Inf
* Actions:
        Type: Discrete(2)
        Num   Action
        0     Push cart to the left
        1     Push cart to the right

In [7]:
# Print the observation-space shape and the number of discrete actions.
print("state 수는? : ", env.observation_space.shape)
print("action 수는? : ", env.action_space.n)

state 수는? :  (4,)
action 수는? :  2


**다음과 같이 구성하시오.**
1. Q값에 대한 회귀 문제이다! (아웃풋레이어의 activation은?)
2. 히든레이어는 2개를 구성한다. (각각 노드 32, 24개씩)
3. 컴파일까지!
4. **Functional**하게!  (너무 힘들면 Sequential하게)

In [8]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [9]:
keras.backend.clear_session()
######################
### Your Code Here ###
######################

# Q-network: a state vector goes in, one Q-value estimate per discrete action
# comes out. Both widths are read from the environment
# (CartPole-v1: 4 state inputs, 2 actions).
state_layer = Input(shape=[env.observation_space.shape[0]])
Hidden_layer = Dense(64, activation='swish')(state_layer)
Hidden_layer = Dense(64, activation='swish')(Hidden_layer)
# Q-values are unbounded regression targets (r + gamma * max Q), so the output
# layer has to be linear. A softmax would squash the outputs into (0, 1) with
# a sum of 1, which can never fit targets >= 1 under the MSE loss below.
q_layer = Dense(env.action_space.n, activation='linear')(Hidden_layer)

Q_network = Model(state_layer, q_layer)

# Regression setup: mean-squared error between predicted and target Q-values.
Q_network.compile(loss='mse',
                  optimizer=Adam())

Q_network.summary()


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
dense (Dense)                (None, 64)                320       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 4,610
Trainable params: 4,610
Non-trainable params: 0
_________________________________________________________________


# Q - Learning

**00_2_SARSA & Qlearning 파일 참고하며 볼 것!**

1. 흐름을 정리하며 봐야 가장 좋다!

In [10]:
# Online Q-learning with the neural Q-function: one fit() call per environment
# step, using the transition that was just observed.

# Close the environment still bound to `env`; a new one is created per episode.
env.close()

# NOTE(review): alpha is not referenced anywhere else in this cell.
alpha = 0.1
gamma = 0.999  # discount factor applied to the bootstrapped next-state value
n_episod = 2000  # number of episodes to run
epsilon = 0.1  # probability of taking a random action (epsilon-greedy)

cum_rewards = []  # total reward of each finished episode, in order

for i in range(n_episod) :
    print("episode {} --진행 중".format(i+1))
    env = wrap_env(gym.make("CartPole-v1"))    
    s0 = env.reset()
    s0 = s0.reshape([1, -1]) # reshape into a 2-D array with a single row
    done = False
    cum_r = 0
    while True :
        Q_s0 = Q_network.predict(s0) # Q-values of all actions in s0
        # Action selection with the epsilon-greedy rule:
        # choose a0.
        if np.random.uniform() < epsilon :
          a0 = env.action_space.sample()
        else :
          a0 = np.argmax(Q_s0)

        # Interact with the environment:
        # apply a0 and receive s1, r1, done, _.
        s1, r1, done, _ = env.step(a0)
        s1 = s1.reshape([1,-1]) # reshape into a 2-D array with a single row

        # Build the training target - the crucial step.
        # Only the entry for a0 may change, so the target starts as a copy of Q_s0.
        Q_s1 = Q_network.predict(s1)
        update_Q = Q_s0.copy() # identical to Q_s0 everywhere ...
        # ... except at [s0, a0], which becomes r1 + gamma * max(Q(s1)).
        # The (1-done) factor removes the bootstrap term once the episode has ended.
        # NOTE(review): this target is unbounded; check that Q_network's output
        # activation can represent it.
        update_Q[0][a0] = r1 + gamma*np.max(Q_s1)*(1-done)
        ###### How is the update done with a Q-table? ###############################
        ## Q[s0, a0] = Q[s0, a0] + alpha * (r1 + gamma*np.max(Q[s1,:]) - Q[s0, a0]) #
        ## Reading w <-- w + lr * gradient(mean(square(y - y^))) by analogy:
        ## r1 + gamma*np.max(Q[s1,:]) : Y
        ## Q[s0, a0] : Y^ 
        #############################################################################

        # Fit the network on this single transition for one epoch.
        Q_network.fit(s0, update_Q, epochs=1, verbose=0)
        
        cum_r = cum_r + r1

        if done == True : # the episode has finished
            cum_rewards.append(cum_r)
            env.close() # close the environment
            break # and leave the step loop

        s0 = s1 # s1 is the "previous" state of the next iteration

    # Every 5th episode: print the episode return and the last action/Q-values,
    # plot the return history, and show the recorded video.
    if (i+1) % 5 == 0 :
            print('===========  에피소드 : {}  ============'.format(i+1))
            print('최종 누적 보상 :',cum_r)
            print(a0, Q_s0)
            plt.plot(cum_rewards)
            plt.show()
            show_video()
        

Output hidden; open in https://colab.research.google.com to view.