<a href="https://colab.research.google.com/github/dantae74/Reinforcement-Learning/blob/main/06-06-DQN-CartPole-in-colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

모두를 위한 머신러닝에서 가져왔습니다.
# CartPole 강화학습 by DQN in Colab

In [None]:
# !sudo apt-get update
# !sudo apt-get install -y xvfb ffmpeg
# !pip install 'imageio==2.4.0'
# !pip install pyvirtualdisplay
# !pip install tf-agents

In [None]:
from  collections import deque
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import random
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop
import pyvirtualdisplay
from tf_agents.environments import suite_gym
import PIL.Image

In [None]:
# Set up a virtual display for rendering OpenAI gym environments.
# A 1400x900 display is created with visible=0 and started immediately; the
# started Display object is kept in `display`.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# installed TensorFlow version string.
tf.version.VERSION

In [None]:
# Hyperparameters. The trailing "# @param" markers are Colab form annotations.

# Upper bound on the number of episodes the training loop iterates over.
MAX_EPISODE = 20000 # @param {type:"integer"}
# Factor applied to the best next-state Q-value when `replay` builds targets.
DISCOUNT_RATE = 0.99  # @param {type:"number"} 
# Maximum number of transitions kept in the replay buffer deque.
REPLAY_MEMORY = 100000  # @param {type:"integer"}

# Learning rate handed to the RMSprop optimizer in `OurModel`.
LEARNING_RATE =  1e-3  # @param {type:"number"}

# Number of transitions sampled per training step; also the Keras fit batch size.
BATCH_SIZE = 64  # @param {type:"integer"}
# Training only happens once the replay buffer holds more than this many transitions.
TRAIN_START = 1000  # @param {type:"integer"}


# minimum epsilon for epsilon greedy
MIN_E = 0.0 # @param {type:"number"} 
# epsilon will be `MIN_E` at `EPSILON_DECAYING_EPISODE`
# NOTE: this is a float (MAX_EPISODE * 0.01), not an int.
EPSILON_DECAYING_EPISODE = MAX_EPISODE * 0.01



In [None]:
# Load the CartPole environment from the OpenAI Gym suite
# (through TF-Agents' `suite_gym` loader).
env_name = 'CartPole-v0'
env = suite_gym.load(env_name)

# Length of the observation vector and number of discrete actions; these are
# later passed to `DQN(INPUT_SIZE, OUTPUT_SIZE)` to size the network.
INPUT_SIZE = env.observation_space.shape[0]
OUTPUT_SIZE = env.action_space.n

In [None]:
# Report the sizes derived from the environment.
print("input_size:", INPUT_SIZE)
print("output_size:", OUTPUT_SIZE)
# This line prints the full observation shape tuple; note that its label
# repeats "input_size:".
print("input_size:", env.observation_space.shape)

In [None]:
def OurModel(input_shape, action_space):
    """Build and compile the fully connected Q-network.

    The network maps one state vector to one estimated Q-value per action. It
    consists of three ReLU hidden layers (512, 256 and 64 units) followed by a
    linear output layer, all initialised with ``he_uniform``.

    Args:
        input_shape: Shape tuple of a single state, without the batch axis.
        action_space: Number of discrete actions, i.e. the output layer width.

    Returns:
        A Keras ``Model`` compiled with MSE loss and an RMSprop optimizer that
        uses the module-level ``LEARNING_RATE``.
    """
    print("start OurModel")

    X_input = Input(input_shape)

    # The input shape is fully defined by `Input`, so the hidden layers do not
    # need (and no longer receive) an `input_shape` argument of their own.
    X = X_input
    for units in (512, 256, 64):
        X = Dense(units, activation="relu", kernel_initializer='he_uniform')(X)

    # Linear output: Q-values are unbounded regression targets.
    X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)

    model = Model(inputs=X_input, outputs=X)
    # No "accuracy" metric: the network regresses continuous Q-values, so a
    # classification accuracy would be meaningless and only cost compute.
    model.compile(
        loss="mse",
        optimizer=RMSprop(learning_rate=LEARNING_RATE, rho=0.95, epsilon=0.01),
    )

    model.summary()
    print("End OurModel")
    return model

In [None]:
class DQN:
    """Thin wrapper around the Keras Q-network built by ``OurModel``."""

    def __init__(self, input_size, output_size, name="main"):
        """Create the network.

        Args:
            input_size: Number of values in one state vector.
            output_size: Number of discrete actions.
            name: Label stored on the instance as ``net_name``.
        """
        self.state_size = input_size
        self.action_size = output_size
        self.net_name = name

        self.model = OurModel(input_shape=(self.state_size,), action_space=self.action_size)

    def predict(self, state):
        """Return the predicted Q-values for one state or a batch of states.

        Args:
            state: Array-like holding one or more state vectors; it is reshaped
                to ``(-1, state_size)`` before being fed to the model.

        Returns:
            The model's prediction, one row of Q-values per state.
        """
        # Use the configured state size rather than a hard-coded width of 4.
        batch = np.reshape(state, [-1, self.state_size])
        # verbose=0 keeps per-step inference from printing a progress bar.
        return self.model.predict(batch, verbose=0)

    def fit(self, state, target):
        """Run one silent Keras ``fit`` call on ``state`` -> ``target``."""
        self.model.fit(state, target, batch_size=BATCH_SIZE, verbose=0)

In [None]:
def replay(mainDQN, train_batch, discount_rate=None):
    """Train the network on one minibatch of stored transitions.

    For every transition ``(state, action, reward, next_state, done)`` the
    Q-value of the taken action is replaced by the Q-learning target
    ``reward`` (if ``done``) or ``reward + discount * max(Q(next_state))``;
    the Q-values of the other actions keep the network's own predictions.

    Args:
        mainDQN: Object exposing ``predict(states)`` and ``fit(states, target)``.
        train_batch: Sequence of transition tuples as described above. Its
            actual length is used, so it need not equal ``BATCH_SIZE``.
        discount_rate: Discount factor; defaults to the module-level
            ``DISCOUNT_RATE`` when omitted.

    Returns:
        None. Nothing is trained when ``train_batch`` is empty.
    """
    if len(train_batch) == 0:
        return

    gamma = DISCOUNT_RATE if discount_rate is None else discount_rate

    states = np.vstack([transition[0] for transition in train_batch])
    actions = np.array([transition[1] for transition in train_batch])
    rewards = np.array([transition[2] for transition in train_batch])
    next_states = np.vstack([transition[3] for transition in train_batch])
    dones = np.array([transition[4] for transition in train_batch])

    # Copy so the in-place target update never touches the predictor's buffer.
    target = np.array(mainDQN.predict(states))
    target_next = mainDQN.predict(next_states)

    # Terminal transitions get the bare reward, the rest are bootstrapped.
    bootstrapped = rewards + gamma * np.max(target_next, axis=1)
    rows = np.arange(len(train_batch))
    target[rows, actions] = np.where(dones, rewards, bootstrapped)

    # `states` is already 2-D (one row per transition) after vstack.
    mainDQN.fit(states, target)

In [None]:
def run(mainDQN, environment=None):
    """Play one episode greedily with the trained network and print the score.

    The environment is driven through the same TimeStep-style interface the
    training loop uses: ``reset()`` and ``step()`` return an object exposing
    ``observation``, ``reward`` and ``is_last()``.

    Args:
        mainDQN: Object whose ``predict(state)`` returns Q-values for a state.
        environment: Environment to play in; defaults to the module-level
            ``env`` when omitted.

    Returns:
        The sum of the rewards collected during the episode.
    """
    game = env if environment is None else environment

    time_step = game.reset()
    total_reward = 0

    while True:
        game.render()
        # Always take the action with the highest predicted Q-value.
        action = np.argmax(mainDQN.predict(time_step.observation))
        time_step = game.step(action)
        total_reward += time_step.reward

        if time_step.is_last():
            print("Total score: {}".format(total_reward))
            break

    return total_reward

In [None]:
def annealing_epsilon(episode: int, min_e: float, max_e: float, target_episode: float) -> float:
    """Return the linearly annealed exploration rate for ``episode``.

    Epsilon starts at ``max_e`` for episode 0, falls linearly so that it
    reaches ``min_e`` at ``target_episode``, and stays at ``min_e`` afterwards.

    Args:
        episode: Index of the current episode.
        min_e: Lower bound of epsilon.
        max_e: Value of epsilon at episode 0.
        target_episode: Episode at which epsilon reaches ``min_e``; may be a
            float. A value of zero or less means "no annealing phase".

    Returns:
        The epsilon to use for ``episode``, never below ``min_e``.
    """
    # Without an annealing phase the slope is undefined (division by zero),
    # so epsilon is simply its final value.
    if target_episode <= 0:
        return min_e

    slope = (min_e - max_e) / target_episode
    return max(min_e, slope * episode + max_e)

In [None]:
# Experience replay memory; the oldest transitions are dropped once it is full.
replay_buffer = deque(maxlen=REPLAY_MEMORY)
# Step counts of the most recent 100 episodes, used as the score.
last_100_game_reward = deque(maxlen=100)

mainDQN = DQN(INPUT_SIZE, OUTPUT_SIZE)

for episode in range(MAX_EPISODE):
    # Exploration rate, annealed linearly from 1.0 down to MIN_E.
    e = annealing_epsilon(episode, MIN_E, 1.0, EPSILON_DECAYING_EPISODE)
    done = False

    time_step = env.reset()
    state = time_step.observation

    step_count = 0
    while not done:
        # Epsilon-greedy: explore with probability `e`, otherwise exploit the
        # network. The random action is wrapped in a NumPy integer so both
        # branches hand `env.step` the same type that `np.argmax` produces.
        if np.random.rand() < e:
            action = np.int64(random.randrange(OUTPUT_SIZE))
        else:
            action = np.argmax(mainDQN.predict(state))

        next_time_step = env.step(action)

        next_state = next_time_step.observation
        reward = next_time_step.reward
        done = next_time_step.is_last()

        # The last step of every episode is stored with a penalty instead of
        # the environment's reward.
        if done:
            reward = -1

        replay_buffer.append((state, action, reward, next_state, done))

        state = next_state
        step_count += 1

    # One minibatch update per episode, once enough experience is collected.
    if len(replay_buffer) > TRAIN_START:
        minibatch = random.sample(replay_buffer, BATCH_SIZE)
        replay(mainDQN, minibatch)

    if episode % 10 == 0:
        print("[Episode {:>5}] steps: {:>5} e: {:>5.2f}".format(episode, step_count, e))

    last_100_game_reward.append(step_count)
    # Stop as soon as the average over a full 100-episode window exceeds 199.
    if len(last_100_game_reward) == last_100_game_reward.maxlen:
        avg_reward = np.mean(last_100_game_reward)
        if avg_reward > 199.0:
            print("Game Cleared within {} episodes with avg reward {}".format(episode, avg_reward))
            break