# Laboratorium 6

Celem szóstego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmu głębokiego uczenia aktywnego - REINFORCE. Zaimplementowany algorytm będzie testowany z wykorzystaniem środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [3]:
from collections import deque
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from numpy import ndarray
from tqdm import tqdm
%matplotlib inline

Dołączenie bibliotek do obsługi sieci neuronowych

In [4]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

In [30]:

def get_cumulative_rewards(rewards, gamma=0.99):
    return np.array([np.sum([rewards[j] * gamma**(j - i) for j in range(i, len(rewards))]) for i in range(len(rewards))])


assert len(get_cumulative_rewards(range(100))) == 100
assert np.allclose(get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9),
                   [1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])
assert np.allclose(get_cumulative_rewards([0, 0, 1, -2, 3, -4, 0], gamma=0.5),
                   [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0])
assert np.allclose(get_cumulative_rewards([0, 0, 1, 2, 3, 4, 0], gamma=0), [0, 0, 1, 2, 3, 4, 0])

## Zadanie 1 - REINFORCE

<p style='text-align: justify;'>
Celem ćwiczenie jest zaimplementowanie algorytmu REINFORCE. Wagi sieci aktualizowane są zgodnie ze wzorem:
\begin{equation*}
    \theta \leftarrow \theta + \alpha G_t \nabla_\theta log \pi_{\theta}(a_t, s_t | \theta)
\end{equation*}.
</p>

In [31]:
class PGN(nn.Module):
    def __init__(self, lr, state_shape, n_actions, fc1, fc2):
        super(PGN, self).__init__()

        self.fc1 = nn.Linear(state_shape, fc1)
        self.fc2 = nn.Linear(fc1, fc2)
        self.output = nn.Linear(fc2, n_actions)
        self.softmax = nn.Softmax(dim=-1)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.device = T.device('cuda:0') if T.cuda.is_available() else T.device('cpu')
        self.to(self.device)

    def forward(self, state):
        state = F.relu(self.fc1(state))
        state = F.relu(self.fc2(state))
        probs = self.softmax(self.output(state))

        return probs

In [32]:
class REINFORCEAgent:
    def __init__(self, state_shape, n_actions, fc1: int = 128, fc2: int = 128, lr: float = 0.001, gamma: float = 0.99):
        self.action_size = n_actions
        self.gamma = gamma
        self.model = PGN(lr, state_shape, n_actions, fc1, fc2)
        self.action_log_memory = []
        self.reward_memory = []

    def store_transition(self, action_log, reward):
        self.action_log_memory.append(action_log)
        self.reward_memory.append(reward)

    def choose_action(self, state):
        state = T.tensor(state).to(self.model.device)
        probs = self.model.forward(state)
        cat = Categorical(probs)
        action = cat.sample()

        return action.item(), cat.log_prob(action)

    def learn(self):
        G = get_cumulative_rewards(self.reward_memory, self.gamma)
        rewards = T.tensor(G).to(self.model.device)
        log_probs = T.stack(self.action_log_memory)

        loss = -T.mean(rewards * log_probs)
        self.model.optimizer.zero_grad()
        loss.backward()
        self.model.optimizer.step()

        self.action_log_memory = []
        self.reward_memory = []

Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPool*](https://gym.openai.com/envs/CartPole-v0/):

In [33]:
env = gym.make("CartPole-v1")
state_shape = env.observation_space.shape[0]
n_actions = env.action_space.n
learning_rate = 0.001

np.bool8 = np.bool_
agent = REINFORCEAgent(state_shape=state_shape, n_actions=n_actions, lr=learning_rate, gamma=0.99, fc1=128, fc2=128)


Przygotuj funkcję obliczającą wartość nagrody skumulowanej:

Czas nauczyć agenta gry w środowisku *CartPool*:

In [None]:
def generate_session(t_max=1000):
    """play env with REINFORCE agent and train at the session end"""

    reward = 0

    state = env.reset()
    state = state[0].astype(np.float32)

    for t in range(t_max):

        # chose action
        action, action_log = agent.choose_action(state)

        _state, reward, done, _, info = env.step(action)
        _state = _state.astype(np.float32)

        # record session history to train later
        agent.store_transition(action_log, reward)

        reward += reward

        state = _state
        if done:
            break

    agent.learn()

    return reward


for i in range(100):

    rewards = [generate_session() for _ in range(100)]  # generate new sessions

    print("mean reward:%.3f" % (np.mean(rewards)))

    if np.mean(rewards) > 300:
        print("You Win!")
        break
        break

mean reward:22.320
mean reward:37.040
mean reward:58.470
mean reward:142.790
mean reward:219.510
mean reward:291.090
mean reward:115.850
mean reward:93.040
mean reward:144.320
mean reward:272.700
mean reward:1000.000
You Win!
