# Laboratorium 4 (4 pkt.)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem wcześniej przygotowanych środowisk: *FrozenLake* i *Pacman* oraz środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [None]:
from collections import deque
import gym
import numpy as np
import random

In [2]:
from tqdm import tqdm

Dołączenie bibliotek ze środowiskami:

In [3]:
from env.FrozenLakeMDP import frozenLake
from env.FrozenLakeMDPExtended import frozenLakeExtended


Dołączenie bibliotek do obsługi sieci neuronowych

In [4]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenie jest zaimplementowanie algorytmu Deep Q-Network. Wartoscią oczekiwaną sieci jest:

\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \text{max}_a Q(s_{t + 1}, a)
\end{equation}

</p>

In [6]:
class ReplayBuffer(object):
    def __init__(self, mem_size, state_shape):
        self.mem_size = mem_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, state_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, state_shape), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, _state, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = _state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    @staticmethod
    def _softmax(x: np.ndarray) -> np.ndarray:
        return np.exp(x) / np.exp(x).sum()

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)

        # probs = self._softmax(1 - self.reward_memory[:max_mem])
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        _states = self.new_state_memory[batch]
        done = self.terminal_memory[batch]

        return states, actions, rewards, _states, done

In [None]:
class DQNAgent: # Pytorch
    def __init__(self, action_size, state_size, learning_rate, model):
        self.action_size = action_size
        self.memory = ReplayBuffer(2000, state_size)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.05
        self.learning_rate = learning_rate
        self.q = model
        self.evaluate = False

    def remember(self, state, action, reward, _state, done):
        self.memory.store_transition(state, action, reward, _state, done)
 
    def get_action(self, state):
        if np.random.random() <= self.epsilon and not self.evaluate:
            action = np.random.choice(self.action_size)
        else:
            state = T.tensor(state).to(self.q.device)
            actions = self.q.forward(state)
            action = T.argmax(actions).item()

        return action

    def get_best_action(self, state):
        state = T.tensor(state).to(self.q.device)
        actions = self.q.forward(state)
        action = T.argmax(actions).item()

        return action

    def learn(self, batch_size):
        if self.memory.mem_cntr < batch_size:
            return

        states, actions, rewards, _states, done = self.memory.sample_buffer(batch_size)

        states = T.tensor(states).to(self.q.device)
        _states = T.tensor(_states).to(self.q.device)

        q_test = self.q(states)
        with T.no_grad():
            q_next = self.q.forward(_states).cpu().detach().numpy()
            q_target = q_test.cpu().detach().numpy().copy()

            max_actions = np.argmax(q_next, axis=1)

            batch_index = np.arange(batch_size, dtype=np.int32)

            q_target[batch_index, actions] = rewards + self.gamma * q_next[batch_index, max_actions] * (1-done)
            q_target = T.tensor(q_target).to(self.q.device)

        q_pred = self.q(states)
        self.q.optimizer.zero_grad()
        loss = self.q.loss(q_pred, q_target).to(self.q.device)
        loss.backward()
        self.q.optimizer.step()

    def update_epsilon_value(self):
        self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon > self.epsilon_min else self.epsilon_min

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLake*, warstwa wejściowa powinna mieć tyle neuronów ile jest możlliwych stanów, warstwa wyjściowa tyle neuronów ile jest możliwych akcji do wykonania:

In [8]:
class DQN(nn.Module):
    def __init__(self, lr, state_shape, n_actions, fc1, fc2):
        super(DQN, self).__init__()

        self.fc1 = nn.Linear(state_shape, fc1)
        self.fc2 = nn.Linear(fc1, fc2)
        self.output = nn.Linear(fc2, n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0') if T.cuda.is_available() else T.device('cpu')
        self.to(self.device)

    def forward(self, state):
        state = F.relu(self.fc1(state))
        state = F.relu(self.fc2(state))
        actions = self.output(state)

        return actions

In [9]:
env = frozenLake("8x8")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

 Czas nauczyć agenta poruszania się po środowisku *FrozenLake*, jako stan przyjmij wektor o liczbie elementów równej liczbie możliwych stanów, z wartością 1 ustawioną w komórce o indeksie równym aktualnemu stanowi, pozostałe elementy mają być wypełnione zerami:
* 1 pkt < 35 epok,
* 0.5 pkt < 60 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [10]:
model = DQN(learning_rate, state_size, action_size, 128, 128)

agent = DQNAgent(action_size, state_size, learning_rate, model)
agent.epsilon = 1

done = False
batch_size = 64
EPISODES = 60
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in tqdm(range(100), desc=f'Epoch: {e}', disable=True):
        total_reward = 0
        i_state = env.reset()
    
        state = np.zeros(state_size, dtype=np.float32)
        state[i_state] = 1

        for time in range(1000):
            action = agent.get_action(state)
            _i_state, reward, done, _ = env.step(action)
            total_reward += reward

            _state = np.zeros(state_size, dtype=np.float32)
            _state[_i_state] = 1

            if np.allclose(state, _state):
                reward = -1

            if done and not reward:
                reward = -1

            agent.remember(state, action, reward, _state, done)
            agent.learn(batch_size)
            
            state = _state
            if done:
                break

        summary.append(total_reward)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    agent.update_epsilon_value()

    if np.mean(summary) > 0.9:
        print ("You Win!")
        break

epoch #0	mean reward = 0.010	epsilon = 1.000
epoch #1	mean reward = 0.000	epsilon = 0.950
epoch #2	mean reward = 0.010	epsilon = 0.900
epoch #3	mean reward = 0.020	epsilon = 0.850
epoch #4	mean reward = 0.020	epsilon = 0.800
epoch #5	mean reward = 0.060	epsilon = 0.750
epoch #6	mean reward = 0.080	epsilon = 0.700
epoch #7	mean reward = 0.120	epsilon = 0.650
epoch #8	mean reward = 0.200	epsilon = 0.600
epoch #9	mean reward = 0.260	epsilon = 0.550
epoch #10	mean reward = 0.390	epsilon = 0.500
epoch #11	mean reward = 0.420	epsilon = 0.450
epoch #12	mean reward = 0.380	epsilon = 0.400
epoch #13	mean reward = 0.570	epsilon = 0.350
epoch #14	mean reward = 0.520	epsilon = 0.300
epoch #15	mean reward = 0.730	epsilon = 0.250
epoch #16	mean reward = 0.670	epsilon = 0.200
epoch #17	mean reward = 0.780	epsilon = 0.150
epoch #18	mean reward = 0.910	epsilon = 0.100
You Win!


Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLakeExtended*, tym razem stan nie jest określany poprzez pojedynczą liczbę, a przez 3 tablice:
* pierwsza zawierająca informacje o celu,
* druga zawierająca informacje o dziurach,
* trzecia zawierająca informację o położeniu gracza.

In [12]:
env = frozenLakeExtended("4x4")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

 Czas nauczyć agenta poruszania się po środowisku *FrozenLakeExtended*, jako stan przyjmij wektor składający się ze wszystkich trzech tablic (2 pkt.):

In [15]:
model = DQN(learning_rate, 48, action_size, 128, 64)

agent = DQNAgent(action_size, 48, learning_rate, model)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 2000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        state = env.reset()
        state = np.array(state).reshape(-1,).astype(np.float32)

        
        for time in range(1000):
            action = agent.get_action(state)
            _state, reward, done, _ = env.step(action)
            _state = np.array(_state).reshape(-1,).astype(np.float32)
            total_reward += reward

            if np.allclose(state, _state):
                reward = -1

            if done and not reward:
                reward = -1

            agent.remember(state, action, reward, _state, done)
            agent.learn(batch_size)
            state = _state
            if done:
                break


        summary.append(total_reward)

    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    agent.update_epsilon_value()
    if np.mean(summary) > 0.9:
        print ("You Win!")
        break

epoch #0	mean reward = 0.000	epsilon = 0.750
epoch #1	mean reward = 0.010	epsilon = 0.700
epoch #2	mean reward = 0.140	epsilon = 0.650
epoch #3	mean reward = 0.180	epsilon = 0.600
epoch #4	mean reward = 0.210	epsilon = 0.550
epoch #5	mean reward = 0.300	epsilon = 0.500
epoch #6	mean reward = 0.410	epsilon = 0.450
epoch #7	mean reward = 0.470	epsilon = 0.400
epoch #8	mean reward = 0.580	epsilon = 0.350
epoch #9	mean reward = 0.590	epsilon = 0.300
epoch #10	mean reward = 0.670	epsilon = 0.250
epoch #11	mean reward = 0.770	epsilon = 0.200
epoch #12	mean reward = 0.780	epsilon = 0.150
epoch #13	mean reward = 0.920	epsilon = 0.100
You Win!


Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPool*](https://gym.openai.com/envs/CartPole-v0/):

In [98]:
np.bool8 = np.bool_
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.005

Czas nauczyć agenta gry w środowisku *CartPool*:
* 1 pkt < 10 epok,
* 0.5 pkt < 20 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [99]:
model = DQN(learning_rate, state_size, action_size, state_size, state_size)
agent = DQNAgent(action_size, state_size, learning_rate, model)
agent.epsilon_min = 0.01
agent.epsilon = 0.8

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    summary = []
    for i in range(100):
        total_reward = 0
        state = env.reset()
        state = state[0].astype(np.float32)

        for time in range(300):
            action = agent.get_action(state)
            # print(f'{action=}')
            # _step = env.step(action)
            # __step = env.step(action)[0]
            # print(f'{_step=}')
            # print(f'{__step=}')
            # print(env.step(action))
            _state, reward, done, _, _= env.step(action)
            
            _state = _state.astype(np.float32)

            total_reward += reward

            # if done:
            #     reward = -100

            agent.remember(state, action, reward, _state, done)
            agent.learn(batch_size)
            state = _state
            if done:
                break

        summary.append(total_reward)
    # print(f'{summary=}')
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    agent.update_epsilon_value()
    agent.update_epsilon_value()
    if np.mean(summary) > 195:
        print ("You Win!")
        break

epoch #0	mean reward = 19.990	epsilon = 0.800
epoch #1	mean reward = 23.540	epsilon = 0.700
epoch #2	mean reward = 54.730	epsilon = 0.600
epoch #3	mean reward = 78.280	epsilon = 0.500
epoch #4	mean reward = 113.330	epsilon = 0.400
epoch #5	mean reward = 146.310	epsilon = 0.300
epoch #6	mean reward = 182.000	epsilon = 0.200
epoch #7	mean reward = 155.860	epsilon = 0.100
epoch #8	mean reward = 122.970	epsilon = -0.000
epoch #9	mean reward = 147.220	epsilon = 0.010
epoch #10	mean reward = 228.730	epsilon = 0.010
You Win!
