# Laboratorium 5 (4 pkt)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [1]:
from collections import deque
import gym
import numpy as np
import random

from gym.envs.classic_control import CartPoleEnv

Dołączenie bibliotek do obsługi sieci neuronowych

In [2]:
%tensorflow_version 1.x
from keras import Model
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

TensorFlow 1.x selected.


Using TensorFlow backend.


## Zadanie 1 - Double Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenie jest zaimplementowanie algorytmu Double Deep Q-Network. Wartoscią oczekiwaną sieci jest:
\begin{equation}
       Q^*(s, a) \approx r + \gamma argmax_{a'}Q_\theta'(s', a') 
\end{equation}
a wagi pomiędzy sieciami wymieniane są co dziesięć aktualizacji wag sieci sterującej poczynaniami agenta ($Q$).
</p>

In [3]:
class DDQNAgent:
    def __init__(self, action_size, learning_rate, model, target_model, get_legal_actions=None, env=None):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 0.5  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.tau = 0.85
        self.learning_rate = learning_rate
        self.model = model
        self.target_model = target_model
        self.get_legal_actions = get_legal_actions
        self.update_weights()
        self.replay_counter = 1
        self.env = env

    def remember(self, state, action, reward, next_state, done):
        #Function adds information to the memory about last action and its results
        self.memory.append((state, action, reward, next_state, done)) 

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon, we should take a random action.
            otherwise - the best policy action (self.get_best_action).

        Note: To pick randomly from a list, use random.choice(list).
              To pick True or False with a given probablity, generate uniform number in [0, 1]
              and compare it with your probability
        """

        #
        # INSERT CODE HERE to get action in a given state (according to epsilon greedy algorithm)
        #        
        
        epsilon = self.epsilon

        # Pick Action
        if isinstance(env, CartPoleEnv):
            if np.random.random() < epsilon:
                return self.env.action_space.sample()
            else:
                return np.argmax(self.model.predict(state)[0])
        else:

            possible_actions = self.get_legal_actions(state)

            if len(possible_actions) == 0:
                return None

            if np.random.random() < epsilon:
                chosen_action = random.choice(possible_actions)
            else:
                chosen_action = self.get_best_action(state)

            return chosen_action

    def get_best_action(self, state):
        """
        Compute the best action to take in a state.
        """

        #
        # INSERT CODE HERE to get best possible action in a given state (remember to break ties randomly)
        #
        best_action = np.argmax(self.model.predict(state))

        return best_action

    def replay(self, batch_size):
        """
        Function learn network using randomly selected actions from the memory. 
        First calculates Q value for the next state and choose action with the biggest value.
        Target value is calculated according to:
                Q(s,a) := (r + gamma * max_a(Q(s', a)))
        except the situation when the next action is the last action, in such case Q(s, a) := r.
        In order to change only those weights responsible for chosing given action, the rest values should be those
        returned by the network for state state.
        The network should be trained on batch_size samples.
        After each 10 Q Network trainings parameters should be copied to the target Q Network
        """
        #
        # INSERT CODE HERE to train network
        #
        if len(self.memory) < batch_size:
            return

        info_sets = random.sample(self.memory, batch_size)
        states_list = []
        targets_list = []
        for info_set in info_sets:
            state, action, reward, next_state, done = info_set
            states_list.append(state.flatten())
            target = self.target_model.predict(state)
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.target_model.predict(next_state)[0])
                target[0][action] = reward + Q_future * self.gamma
            targets_list.append(target.flatten())

        states_array = np.array(states_list)
        targets_array = np.array(targets_list)

        self.model.train_on_batch(states_array, targets_array)
        self.update_epsilon_value()
        self.replay_counter += 1
        if self.replay_counter >= 10:
            self.update_weights()
            self.replay_counter = 0
  
    def update_epsilon_value(self):
        #Every each epoch epsilon value should be updated according to equation: 
        #self.epsilon *= self.epsilon_decay, but the updated value shouldn't be lower then epsilon_min value
        new_epsilon = self.epsilon * self.epsilon_decay
        if new_epsilon >= self.epsilon_min:
            self.epsilon = new_epsilon

    def update_weights(self):
        """copy trained Q Network params to target Q Network"""
        #
        # INSERT CODE HERE to train network
        #
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
            # target_weights[i] = weights[i]
        self.target_model.set_weights(target_weights)


Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPool*](https://gym.openai.com/envs/CartPole-v0/):

In [4]:
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

model1 = Sequential()
model1.add(Dense(16, input_dim=state_size, activation="relu"))
model1.add(Dense(32, activation="relu"))
model1.add(Dense(16,activation='relu'))
model1.add(Dense(action_size))
model1.compile(loss="mean_squared_error", optimizer=Adam(learning_rate=learning_rate))

model2 = Sequential()
model2.add(Dense(16, input_dim=state_size, activation="relu"))
model2.add(Dense(32, activation="relu"))
model2.add(Dense(16,activation='relu'))
model2.add(Dense(action_size))
model2.compile(loss="mean_squared_error", optimizer=Adam(learning_rate=learning_rate))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Czas nauczyć agenta gry w środowisku *CartPool*:

In [5]:
agent = DDQNAgent(action_size, learning_rate, model1, model2, env=env)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        state = np.array([np.array(env_state).flatten()])

        for time in range(500):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = np.array([np.array(next_state_env).flatten()])

            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        summary.append(total_reward)
        
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))    
    
    if np.mean(summary) > 195:
        print ("You Win!")
        break



epoch #0	mean reward = 18.630	epsilon = 0.680
epoch #1	mean reward = 14.890	epsilon = 0.615
epoch #2	mean reward = 13.360	epsilon = 0.557
epoch #3	mean reward = 17.420	epsilon = 0.504
epoch #4	mean reward = 31.440	epsilon = 0.456
epoch #5	mean reward = 45.390	epsilon = 0.412
epoch #6	mean reward = 57.860	epsilon = 0.373
epoch #7	mean reward = 116.950	epsilon = 0.338
epoch #8	mean reward = 87.160	epsilon = 0.305
epoch #9	mean reward = 101.650	epsilon = 0.276
epoch #10	mean reward = 146.140	epsilon = 0.250
epoch #11	mean reward = 195.660	epsilon = 0.226
You Win!
