# Week14 Reinforcement Learning

## 1. Q-Learning in a grid world

In [1]:
import numpy as np
import random
from environment import Env
from collections import defaultdict

class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Q-values live in ``q_table``, a ``defaultdict`` keyed by state. Each row
    is a list with one entry per action and is created lazily, filled with
    zeros, the first time a state is looked up.

    Args:
        actions: Sequence of action identifiers. They are used as indices
            into a Q-table row, so they are expected to be 0..len(actions)-1
            (e.g. 0, 1, 2, 3 for up, down, left, right).
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of picking a random action (exploration).
    """

    def __init__(self, actions, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.actions = actions
        self.learning_rate = alpha
        self.discount_factor = gamma
        self.epsilon = epsilon
        # One Q-value per available action; sized from `actions` so the agent
        # is not tied to exactly four actions.
        n_actions = len(actions)
        self.q_table = defaultdict(lambda: [0.0] * n_actions)

    def get_action(self, state):
        """Pick an action for ``state`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value for ``state`` is
        returned, with ties broken at random.
        """
        if np.random.rand() < self.epsilon:
            # exploration
            return np.random.choice(self.actions)
        # exploitation
        return self.arg_max(self.q_table[state])

    def learn(self, state, action, reward, next_state):
        """Apply the one-step Q-learning update for the sample <s, a, r, s'>.

        Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        """
        current_q = self.q_table[state][action]
        target_q = reward + self.discount_factor * max(self.q_table[next_state])
        self.q_table[state][action] += self.learning_rate * (target_q - current_q)

    @staticmethod
    def arg_max(state_action):
        """Return the index of the largest value, choosing randomly among ties."""
        best_value = max(state_action)
        best_indices = [index for index, value in enumerate(state_action)
                        if value == best_value]
        return random.choice(best_indices)
    

### No Exploration (𝜀=0)
- 𝛾 = 0.9
- 𝛼 = 1 
- 𝜀 = 0 

In [2]:
# Environment plus a purely greedy agent
env = Env()
agent = QLearningAgent(
    actions=list(range(env.n_actions)),
    alpha=1,    # fully replace the stored Q-value with the new estimate
    gamma=0.9,
    epsilon=0,  # no exploration
)
n_episode = 100

# Run Q-learning for n_episode episodes
for episode in range(n_episode):
    state = env.reset()
    done = False

    # one episode: loop until the environment reports the final state
    while not done:
        env.render()

        # choose a in s, then observe s', r and the done flag
        action = agent.get_action(str(state))
        next_state, reward, done = env.step(action)

        # update Q(s,a) from <s,a,r,s'> and move on to s'
        agent.learn(str(state), action, reward, str(next_state))
        state = next_state

        # display all Q-function values
        env.print_value_all(agent.q_table)
            

TclError: invalid command name ".!canvas"

### 50% Exploration (𝜀 = 0.5)
- 𝛾 = 0.9
- 𝛼 = 1
- 𝜀 = 0.5

In [3]:
# Environment plus an agent that explores half of the time
env = Env()
agent = QLearningAgent(
    actions=list(range(env.n_actions)),
    alpha=1,
    gamma=0.9,
    epsilon=0.5,
)
n_episode = 100

# Run Q-learning for n_episode episodes
for episode in range(n_episode):
    state = env.reset()
    done = False

    # one episode: loop until the environment reports the final state
    while not done:
        env.render()

        # choose a in s, then observe s', r and the done flag
        action = agent.get_action(str(state))
        next_state, reward, done = env.step(action)

        # update Q(s,a) from <s,a,r,s'> and move on to s'
        agent.learn(str(state), action, reward, str(next_state))
        state = next_state

        # display all Q-function values
        env.print_value_all(agent.q_table)
            

TclError: invalid command name ".!canvas"

### Learning rate (𝛼) 
The learning rate 𝛼 controls how much of the stored Q-value is replaced by the newly computed target value.  
If 𝛼=1, the stored Q-value is completely overwritten by the newly computed value.
- 𝛾 = 0.9
- 𝛼 = 0.1 
- 𝜀 = 0.1

In [4]:
# Environment plus an agent with a small learning rate and little exploration
env = Env()
agent = QLearningAgent(
    actions=list(range(env.n_actions)),
    alpha=0.1,
    gamma=0.9,
    epsilon=0.1,
)
n_episode = 100

# Run Q-learning for n_episode episodes
for episode in range(n_episode):
    state = env.reset()
    done = False

    # one episode: loop until the environment reports the final state
    while not done:
        env.render()

        # choose a in s, then observe s', r and the done flag
        action = agent.get_action(str(state))
        next_state, reward, done = env.step(action)

        # update Q(s,a) from <s,a,r,s'> and move on to s'
        agent.learn(str(state), action, reward, str(next_state))
        state = next_state

        # display all Q-function values
        env.print_value_all(agent.q_table)
            

TclError: invalid command name ".!canvas"

## 2. Deep Q-Network CartPole

In [None]:
import numpy as np
# import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import random
from collections import deque
import dqn

import gym
from typing import List

# Module-level environment shared by the training loop in main()
env = gym.make('CartPole-v1')
#env = gym.wrappers.Monitor(env, directory="gym-results/", force=True)

# Constants defining our neural network
# Network input width = length of one observation; output width = number of actions
INPUT_SIZE = env.observation_space.shape[0]
OUTPUT_SIZE = env.action_space.n

DISCOUNT_RATE = 0.99          # multiplier on the target network's max Q-value in replay_train
REPLAY_MEMORY = 50000         # maxlen of the replay buffer deque
BATCH_SIZE = 64               # number of transitions sampled per training step
TARGET_UPDATE_FREQUENCY = 5   # copy main -> target weights whenever step_count is a multiple of this
MAX_EPISODES = 5000           # upper bound on the number of training episodes


def replay_train(mainDQN: dqn.DQN, targetDQN: dqn.DQN, train_batch: list) -> float:
    """Trains `mainDQN` with target Q values given by `targetDQN`

    For each sampled transition the target is
    ``r + DISCOUNT_RATE * max_a' Q_target(s', a')``, with the bootstrap term
    zeroed out when the transition ended the episode.

    Args:
        mainDQN (dqn.DQN): Main DQN that will be trained
        targetDQN (dqn.DQN): Target DQN that will predict Q_target
        train_batch (list): Minibatch of replay memory
            Each element is (s, a, r, s', done)
            [(state, action, reward, next_state, done), ...]
    Returns:
        The result of ``mainDQN.update(X, y)``.
        NOTE(review): annotated as ``float`` (a loss), but the caller in
        ``main`` unpacks two values from it -- confirm against ``dqn.DQN.update``.
    """
    states = np.vstack([x[0] for x in train_batch])
    actions = np.array([x[1] for x in train_batch])
    rewards = np.array([x[2] for x in train_batch])
    next_states = np.vstack([x[3] for x in train_batch])
    done = np.array([x[4] for x in train_batch])

    X = states

    # Mask that removes the bootstrap term for terminal transitions.
    # logical_not (rather than bitwise `~`) stays correct even if the done
    # flags arrive as 0/1 integers instead of booleans.
    not_done = np.logical_not(done)

    # target
    next_q_max = np.max(targetDQN.predict(next_states), axis=1)
    Q_target = rewards + DISCOUNT_RATE * next_q_max * not_done

    # prediction: only the entry of the action actually taken is replaced by
    # the target, so the other actions contribute zero error
    y = mainDQN.predict(states)
    y[np.arange(len(X)), actions] = Q_target

    # Train our network using target and predicted Q values on each episode
    return mainDQN.update(X, y)


def get_copy_var_ops(*, dest_scope_name: str, src_scope_name: str) -> List[tf.Operation]:
    """Build the TF assign ops that copy weights from one scope to another.

    Trainable variables of both scopes are paired by position; pairing stops
    at the shorter of the two collections.

    Args:
        dest_scope_name (str): Scope whose variables are overwritten (copy to)
        src_scope_name (str): Scope whose variables are read (copy from)
    Returns:
        List[tf.Operation]: One assign operation per paired variable
    """
    collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    sources = tf.get_collection(collection_key, scope=src_scope_name)
    targets = tf.get_collection(collection_key, scope=dest_scope_name)

    # each op writes the source variable's current value into its counterpart
    return [target.assign(source.value())
            for source, target in zip(sources, targets)]


def bot_play(mainDQN: dqn.DQN, env: gym.Env) -> None:
    """Play one rendered episode greedily and print the accumulated reward.

    Args:
        mainDQN (dqn.DQN): DQN agent whose predictions choose the actions
        env (gym.Env): Gym Environment to play in
    """
    observation = env.reset()
    total = 0
    finished = False

    while not finished:
        env.render()
        # always take the action with the highest predicted Q-value
        q_values = mainDQN.predict(observation)
        observation, reward, finished, _ = env.step(np.argmax(q_values))
        total += reward

    print("Total score: {}".format(total))


def main():
    """Train a DQN on the module-level `env` using replay memory and a target network.

    Builds a "main" and a "target" network in one TF session, then runs up to
    MAX_EPISODES episodes with a decaying epsilon-greedy policy. Every step
    stores the transition, trains `mainDQN` on a random minibatch once the
    buffer holds more than BATCH_SIZE transitions, and periodically copies the
    main weights into the target network. Training stops early once the
    average score of the last 100 episodes exceeds 199.
    """
    # store the previous observations in replay memory
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    # scores of the most recent 100 episodes, used by the stopping check
    last_100_game_reward = deque(maxlen=100)

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="main")
        targetDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="target")
        sess.run(tf.global_variables_initializer())

        # initial copy q_net -> target_net
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        # Run update operations
        sess.run(copy_ops)

        for episode in range(MAX_EPISODES):

            # epsilon decay: 1.0 for the first episode, shrinking as episodes go on
            e = 1. / ((episode / 10) + 1)
            score = 0
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                env.render()

                if np.random.rand() < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)

                if done:  # Penalty
                    reward = -1

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))

                if len(replay_buffer) > BATCH_SIZE:
                    minibatch = random.sample(replay_buffer, BATCH_SIZE)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)

                # Copy main -> target weights every TARGET_UPDATE_FREQUENCY steps
                if step_count % TARGET_UPDATE_FREQUENCY == 0:
                    sess.run(copy_ops)
                score += reward
                state = next_state
                step_count += 1

            print("Episode: {} steps: {} reward: {}".format(episode, step_count, score))

            # Game clear check: average score over the last 100 episodes
            last_100_game_reward.append(score)

            if len(last_100_game_reward) == last_100_game_reward.maxlen:
                avg_reward = np.mean(last_100_game_reward)

                if avg_reward > 199:
                    # The placeholders must actually be filled in; the plain
                    # string literal printed the braces verbatim.
                    print("Game Cleared in {} episodes with avg reward {}".format(
                        episode, avg_reward))
                    break


# Entry point: start training when this code is executed directly rather than imported
if __name__ == "__main__":
    main()

Instructions for updating:
non-resource variables are not supported in the long term


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "E:\Anaconda3\envs\TF\lib\site-packages\IPython\core\interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-a80792520bd0>", line 9, in <module>
    import gym
ModuleNotFoundError: No module named 'gym'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "E:\Anaconda3\envs\TF\lib\site-packages\IPython\core\interactiveshell.py", line 2061, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'ModuleNotFoundError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "E:\Anaconda3\envs\TF\lib\site-packages\IPython\core\ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "E:\Anaconda3\envs\TF\lib\site-packages\IPython\core\ultratb.py"