## 환경설정

In [0]:
### Environment setup ###
# Remove the sample_data directory from the current working directory
# (presumably the Colab default folder — verify).
!rm -rf sample_data
# Install the Xvfb virtual framebuffer and the OpenGL bindings; output is discarded.
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
# Install gym and pyvirtualdisplay; output is discarded.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!pip install pyglet==1.3.2
!pip install pygame

### Additional packages for animation ###
!pip install box2d-py mako==1.0.7 JSAnimation imageio

### Fetch the code ###
!git clone https://github.com/secury/DS-KAIST-AI-Expert-RL.git

# Switch the notebook's working directory into the cloned repository.
%cd DS-KAIST-AI-Expert-RL/

## Q-Learning

In [0]:
import time

import gym
import envs
import numpy as np

import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

# Library related to Java Script Animation
from matplotlib import animation
from JSAnimation import IPython_display

from pyvirtualdisplay import Display
# Start a non-visible virtual display (visible=0) of 400x300 so that rendering
# calls later in the script have a display to draw to.
display = Display(visible=0, size=(400, 300))
display.start()


# Print arrays with 3 decimals, no scientific notation, and wide lines so that
# Q-table rows stay readable when printed.
np.set_printoptions(precision=3, suppress=True, threshold=10000, linewidth=250)

""" Load environment """
# Exactly one env_name assignment should be left uncommented; it selects the
# environment created below.
# env_name = 'MazeSample3x3-v0'
# env_name = 'MazeSample5x5-v0'
# env_name = 'MazeSample10x10-v0'
# env_name = 'MazeRandom10x10-v0'
# env_name = 'MazeRandom10x10-plus-v0'
# env_name = 'MazeRandom20x20-v0'
# env_name = 'MazeRandom20x20-plus-v0'
env_name = 'MyCartPole-v0'
# env_name = 'MyMountainCar-v0'

# NOTE(review): these ids are presumably registered by the project-local
# `envs` import — confirm.
env = gym.make(env_name)
# Work with the underlying environment object rather than gym's wrappers.
env = env.unwrapped
# Clear the T and R attributes on the environment.
# NOTE(review): presumably the transition/reward model, hidden so the agent
# must learn from sampled experience only — confirm against the envs package.
env.T = env.R = None

# Reference notes on the quantities used by the tabular algorithm below.
"""
env.S: the number of states (integer)
env.A: the number of actions (integer)
gamma: discount factor (0 ~ 1)
"""

def plot_movie_js(image_array):
    """Show a sequence of image frames as an inline JavaScript animation.

    Args:
        image_array: Sequence of frames. Each frame must expose ``shape`` with
            rows at index 0 and columns at index 1. The figure is sized from
            the first frame; all frames are assumed to share that size —
            TODO confirm against the renderer.

    Returns:
        None. The animation is emitted through IPython's display machinery.
        An empty ``image_array`` is a no-op.
    """
    # Nothing to animate; avoids an IndexError on image_array[0] below.
    if len(image_array) == 0:
        return

    dpi = 10.0
    first_frame = image_array[0]
    height_px, width_px = first_frame.shape[0], first_frame.shape[1]
    # figsize is (width, height) in inches; inches * dpi reproduces the
    # frame's pixel dimensions.
    fig = plt.figure(figsize=(width_px / dpi, height_px / dpi), dpi=dpi)
    try:
        im = plt.figimage(first_frame)

        def animate(i):
            im.set_array(image_array[i])
            return (im,)

        anim = animation.FuncAnimation(fig, animate, frames=len(image_array))
        ipythondisplay.display(IPython_display.display_animation(anim))
    finally:
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


def epsilon_greedy(Q, s, epsilon=0.1):
    """Choose an action for state ``s`` with an epsilon-greedy rule over ``Q``.

    Args:
        Q: 2-D NumPy array indexed as ``Q[state, action]``.
        s: Row index of the current state in ``Q``.
        epsilon: Probability of picking a uniformly random action instead of
            the greedy one.

    Returns:
        The chosen action index, in ``[0, Q.shape[1])``. Ties in the greedy
        branch resolve to the lowest index (``np.argmax`` behaviour).
    """
    # Take the action count from Q itself rather than a global environment,
    # so the function only depends on its arguments.
    num_actions = Q.shape[1]
    if np.random.rand() < epsilon:
        return np.random.randint(0, num_actions)
    return np.argmax(Q[s, :])


# Tabular Q-learning training script.
step_size = 0.2  # learning rate applied to each TD error
gamma = 0.99  # discount factor for the bootstrapped next-state value

Q = np.zeros((env.S, env.A))
epsilon = 1.0
epsilon_min = 0.1
num_episodes = 1000

for episode in range(num_episodes):
    state = env.reset()
    episode_reward = 0.
    render_list = []
    # Frames are recorded and progress is printed every 100 episodes and on
    # the final episode.
    record_episode = episode % 100 == 0 or episode == num_episodes - 1

    if record_episode:
        print('Episode ' + str(episode) + ':')

    for t in range(10000):
        action = epsilon_greedy(Q, state, epsilon)
        next_state, reward, done, info = env.step(action)

        # Q-learning update: move Q[s, a] toward the one-step TD target.
        # On a terminal transition there is no future return to bootstrap.
        if done:
            target_Q = reward
        else:
            target_Q = reward + gamma * np.max(Q[next_state, :])
        Q[state, action] += step_size * (target_Q - Q[state, action])

        episode_reward += reward

        env.draw_policy_evaluation(Q)

        if record_episode:
            # print("[epi=%4d,t=%4d] state=%4s / action=%d / reward=%7.4f / next_state=%4s / info=%s / Q[s]=%s" % (episode, t, state, action, reward, next_state, info, Q[state, :]))
            screen = env.render(mode='rgb_array')
            render_list.append(screen)

        if done:
            break
        state = next_state

    # Decay exploration multiplicatively, never dropping below epsilon_min.
    epsilon = max(epsilon * 0.99, epsilon_min)

    # Every 100 episodes, view the trajectory produced by the Q-table learned
    # so far as a movie clip.
    if record_episode:
        # When running MountainCar, uncomment the two lines below and comment
        # out the third line.
        # if episode != 0:
        #     plot_movie_js(render_list)
        plot_movie_js(render_list)
        print('[%4d] Episode reward=%.4f / epsilon=%f' % (episode, episode_reward, epsilon))
        print()

    time.sleep(0.1)

# ipythondisplay.clear_output(wait=True)
env.close()