In [0]:
# System packages: xvfb and python-opengl; all output is discarded
# (stdout and stderr redirected to /dev/null) to keep the cell quiet.
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
# gym and pyvirtualdisplay are imported by later cells; output is discarded here too.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# pyglet is pinned to 1.3.2 - presumably for compatibility with the gym
# renderer in use; TODO confirm before changing the pin.
!pip install pyglet==1.3.2

### Additional packages for animation ###
!pip install box2d-py mako==1.0.7 Pygame JSAnimation imageio

In [0]:
# Fetch the course repository and make it the working directory.
# Presumably this is what lets the later `import envs` resolve (the package
# is assumed to live inside the repository) - TODO confirm.
# NOTE(review): this cell is not idempotent - running it a second time clones
# and changes directory again, relative to the already-changed working directory.
!git clone https://github.com/secury/DS-KAIST-AI-Expert-RL.git
# NOTE(review): Pygame is already installed by the previous install cell.
!pip install pygame
%cd DS-KAIST-AI-Expert-RL/

In [0]:
import time

import gym
import envs
import numpy as np

# Library related to Java Script Animation
from matplotlib import animation
from JSAnimation import IPython_display

import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

In [0]:
from pyvirtualdisplay import Display
# Start a non-visible (visible=0) 400x300 virtual display - presumably so that
# env.render(...) has a display to draw to on a headless machine; TODO confirm.
# NOTE(review): the module-level name `display` may shadow the `display` helper
# that IPython provides in notebook namespaces - verify nothing relies on it.
display = Display(visible=0, size=(400, 300))
display.start()

In [0]:
# NumPy printing: 3 decimals, small values suppressed instead of scientific
# notation, summarisation threshold of 10000 elements, 250-character lines.
np.set_printoptions(precision=3, suppress=True, threshold=10000, linewidth=250)

""" Load environment """
# Choose the environment by leaving exactly one `env_name` assignment active.
# Presumably these ids are registered by the `envs` import above - TODO confirm.
env_name = 'MazeSample5x5-v0'
# env_name = 'MazeSample10x10-v0'
# env_name = 'MazeRandom10x10-v0'
# env_name = 'MazeRandom10x10-plus-v0'
# env_name = 'MazeRandom20x20-v0'
# env_name = 'MazeRandom20x20-plus-v0'
# env_name = 'MyCartPole-v0'
# env_name = 'MyMountainCar-v0'

env = gym.make(env_name)
# Work with the underlying environment object rather than gym's wrapper, so the
# tabular attributes described in the string below (S, A, T, R, gamma) are
# read directly from it.
env = env.unwrapped
"""
env.S: the number of states (integer)
env.A: the number of actions (integer)
env.T: transition matrix (S x A x S)-sized array
env.R: reward matrix (S x A)-sized array
env.gamma: discount factor (0 ~ 1)
"""
        

def plot_movie_js(image_array):
    """
    Show a sequence of frames as an inline JavaScript animation.

    :param image_array: non-empty, indexable sequence of image arrays; the
        first two dimensions of the first frame determine the figure size
    :return: None (the animation is displayed as a side effect)
    """
    dots_per_inch = 10.0
    first_frame = image_array[0]
    n_rows = first_frame.shape[0]
    n_cols = first_frame.shape[1]

    # figsize is (width, height): columns give the width, rows the height.
    fig = plt.figure(
        figsize=(n_cols / dots_per_inch, n_rows / dots_per_inch),
        dpi=dots_per_inch,
    )
    frame_artist = plt.figimage(first_frame)

    def animate(frame_index):
        # Swap the pixel data of the single figure image for this frame.
        frame_artist.set_array(image_array[frame_index])
        return (frame_artist,)

    movie = animation.FuncAnimation(fig, animate, frames=len(image_array))
    rendered = IPython_display.display_animation(movie)
    ipythondisplay.display(rendered)


def policy_evaluation(env, pi):
    """
    Evaluate a fixed policy exactly by solving the Bellman expectation equation.

    With r_pi[s] = sum_a pi[s, a] * R[s, a] and
    P_pi[s, s'] = sum_a pi[s, a] * T[s, a, s'], the state values satisfy
    (I - gamma * P_pi) V = r_pi, and Q = R + gamma * T V.

    :param env: MDP(S, A, T, R, gamma)
    :param pi: behavior policy (S x A)-sized array
    :return: V, Q where V is (S)-sized array and Q is (S x A)-sized array
    :raises numpy.linalg.LinAlgError: if (I - gamma * P_pi) is singular
    """
    # Expected one-step reward under pi, one entry per state.
    r = np.sum(env.R * pi, axis=1)
    # State-to-state transition matrix under pi. Contracting the action axis
    # with einsum yields the (S x S) result directly, instead of building an
    # (S x S x S) tensor and keeping only its diagonal slices.
    P = np.einsum('sa,sat->st', pi, env.T)
    # Solve the linear system rather than forming an explicit inverse:
    # cheaper and numerically better behaved.
    V = np.linalg.solve(np.eye(env.S) - env.gamma * P, r)
    Q = env.R + env.gamma * env.T.dot(V)

    return V, Q


def policy_improvement(env, Q):
    """
    Build the greedy deterministic policy for an action-value table.

    :param env: MDP; only env.S and env.A are read
    :param Q: action values, (S x A)-sized array
    :return: (S x A)-sized one-hot array with a 1 at the argmax action of
        each state (the first maximiser wins on ties)
    """
    greedy_actions = np.argmax(Q, axis=1)
    state_indices = np.arange(env.S)

    greedy_pi = np.zeros((env.S, env.A))
    greedy_pi[state_indices, greedy_actions] = 1.
    return greedy_pi


def policy_iteration(env, max_iter=1000):
    """
    Policy iteration: alternate exact policy evaluation and greedy improvement.

    Starts from the uniform random policy and stops as soon as the greedy
    policy no longer changes, or after `max_iter` evaluate/improve rounds.

    :param env: MDP(S, A, T, R, gamma)
    :param max_iter: maximum number of evaluate/improve rounds (default 1000)
    :return: pi, Q where pi is the final (S x A)-sized policy and Q is the
        (S x A)-sized action-value array of that same policy
    """
    pi = np.ones((env.S, env.A)) / env.A
    for _ in range(max_iter):
        _, Q = policy_evaluation(env, pi)
        new_pi = policy_improvement(env, Q)
        if np.array_equal(pi, new_pi):
            # Stable policy: Q was just evaluated for exactly this pi.
            break
        pi = new_pi
    else:
        # Iteration budget exhausted (or zero): the last Q, if any, belongs to
        # the previous policy, so re-evaluate to keep the returned pair consistent.
        _, Q = policy_evaluation(env, pi)
    return pi, Q


def value_iteration(env, max_iter=1000, tol=1e-6):
    """
    Value iteration: repeat the Bellman optimality backup until V stops changing.

    :param env: MDP(S, A, T, R, gamma)
    :param max_iter: maximum number of backup sweeps (default 1000)
    :param tol: stop when the largest absolute change in V is below this
        threshold (default 1e-6)
    :return: pi, Q where Q is the (S x A)-sized array from the most recent
        backup and pi is the (S x A)-sized one-hot greedy policy w.r.t. Q
    """
    V = np.zeros(env.S)
    Q = np.zeros((env.S, env.A))
    for _ in range(max_iter):
        # Keep the freshest backup: Q is always computed from the current V,
        # so the returned Q never lags one sweep behind the converged values.
        Q = env.R + env.gamma * env.T.dot(V)
        V_new = np.max(Q, axis=1)
        converged = np.max(np.abs(V - V_new)) < tol
        V = V_new
        if converged:
            break

    # Greedy policy extraction (first maximiser wins on ties).
    pi = np.zeros((env.S, env.A))
    pi[np.arange(env.S), np.argmax(Q, axis=1)] = 1.

    return pi, Q


# Compute a policy and its action values with policy iteration.
pi, Q = policy_iteration(env)

# Roll out one episode under `pi`, printing every transition and collecting
# one rendered frame per step.
for episode in range(1):
    state = env.reset()
    render_list = []  # frames returned by env.render(), in step order
    episode_reward = 0.

    for t in range(10000):  # hard cap on the number of steps per episode
        # Sample an action using row `state` of pi as the probability vector.
        action = int(np.random.choice(np.arange(env.A), p=pi[state, :]))
        state1, reward, done, info = env.step(action)
        episode_reward += reward
        print("[%4d] state=%4s / action=%d / reward=%7.4f / state1=%4s / info=%s" % (t, state, action, reward, state1, info))

        env.draw_policy_evaluation(Q, pi)  # comment this line out if not needed
        screen = env.render(mode='rgb_array')
        render_list.append(screen)
        # Pause between steps: 0.3 s when the environment name contains 'Maze', otherwise 0.01 s.
        time.sleep(0.3 if 'Maze' in env_name else 0.01)

        if done:
            # The collected frames are animated only when the episode ends.
            # NOTE(review): if the step cap is reached without `done`, no
            # animation is displayed - confirm this is intended.
            plot_movie_js(render_list)
            break
        state = state1
    print('Episode reward: %.4f' % episode_reward)

    time.sleep(1)
time.sleep(10)