In [1]:
!sudo apt update
!sudo apt install -y build-essential autoconf libtool pkg-config python3-dev \
    python3-pip python3-numpy git flex bison libbz2-dev

!wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | sudo apt-key add -
!sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
!sudo apt-get update && apt-get --allow-unauthenticated install -y \
    cmake \
    kitware-archive-keyring

!sudo rm $(which cmake)
!$(which cmake) --version

!pip3 install -Uv nle
!apt-get install sox ffmpeg libcairo2 libcairo2-dev
!pip install manimlib pygame opencv-python minihack

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [1,419 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [49.9 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 1,8

In [2]:
# https://github.com/BY571/SAC_discrete/blob/main/train.py
# https://github.com/BrentonBudler/deep-rl-minihack-the-planet

In [3]:
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout


Models

In [4]:
import torch
import torch.nn as nn
from torch.distributions import Categorical
import numpy as np
import torch.nn.functional as F
import gym
import minihack



def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, hyperparams):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super(Actor, self).__init__()

        # The network architecture follows the popular lenet-5 CNN architeture
        # Initialize first set of convolutional and pooling layers with a ReLU activation function
        self.conv1 = Conv2d(in_channels=1, out_channels=20,
                            kernel_size=(5, 5))
        self.relu1 = ReLU()
        self.maxpool1 = MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # Initialize second set of of convolutional and pooling layers with a ReLU activation function
        self.conv2 = Conv2d(in_channels=20, out_channels=50,
                            kernel_size=(5, 5))
        self.relu2 = ReLU()
        self.maxpool2 = MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # Initialize fully connected layers for glyph output after convolutional and pooling layers
        self.fc1 = Linear(in_features=1600, out_features=500)
        self.relu3 = ReLU()
        self.fc2 = Linear(in_features=500, out_features=128)
        self.relu4 = ReLU()

        # Initialize fully connected for message input
        self.fc3 = Linear(in_features=256, out_features=128)
        self.relu5 = ReLU()

        # Initialize fully connected for combination of glyphs and message
        self.fc4 = Linear(in_features=256, out_features=128)
        self.relu6 = ReLU()

        # To calculate the probability of taking each action in the given state
        self.action_layer = nn.Linear(128, action_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, state):
       # Transform the glyph and state arrays into tensors

        if (type(state) is dict): # if state is just a dict with 2 keys (glyphs, message)
          message_t  = torch.from_numpy(state["message"]).float().to(device)
          glyphs_t  = torch.from_numpy(state["glyphs"]).float().to(device)
        else: # if state is a batch - here, state is an array of dicts - if you call agent.learn()
            glyphs_t  = torch.from_numpy(np.array([ s["glyphs"] for s in state])).float().to(device)
            message_t  = torch.from_numpy(np.array([ s["message"] for s in state])).float().to(device)

            glyphs_t = torch.squeeze(glyphs_t, 1) # remove all dimensions with 1
            message_t = torch.squeeze(message_t, 1) # remove all dimensions with 1


        # Pass the 2D glyphs input through our convolutional and pooling layers
        glyphs_t = self.conv1(glyphs_t)
        glyphs_t = self.relu1(glyphs_t)
        glyphs_t = self.maxpool1(glyphs_t)
        glyphs_t = self.conv2(glyphs_t)
        glyphs_t = self.relu2(glyphs_t)
        glyphs_t = self.maxpool2(glyphs_t)

        # Platten the output from the final pooling layer and pass it through the fully connected layers
        glyphs_t = glyphs_t.reshape(glyphs_t.shape[0], -1)
        glyphs_t = self.fc1(glyphs_t)
        glyphs_t = self.relu3(glyphs_t)
        glyphs_t = self.fc2(glyphs_t)
        glyphs_t = self.relu4(glyphs_t)

        # Pass the message input through a fully connected layer
        message_t = self.fc3(message_t)
        message_t = self.relu5(message_t)

        # Combine glyphs output from convolution and fully connected layers
        # with message output from fully connected layer
        # Cat and Concat are used for different versions of PyTorch
        try:
            combined = torch.cat((glyphs_t,message_t),1)
        except:
            combined = torch.concat([glyphs_t,message_t],1)

        # Pass glyphs and messaged combination through a fully connected layer
        combined = self.fc4(combined)
        combined = self.relu6(combined)

        action_probs = self.action_layer(combined)
        action_probs = self.softmax(action_probs)

        return action_probs

    def evaluate(self, state, epsilon=1e-6):
        action_probs = self.forward(state)

        dist = Categorical(action_probs)
        action = dist.sample()
        # Have to deal with situation of 0.0 probabilities because we can't do log 0
        z = action_probs == 0.0
        z = z.float() * 1e-8
        log_action_probabilities = torch.log(action_probs + z)
        return action.detach().cpu(), action_probs, log_action_probabilities

    def get_action(self, state):
        """
        returns the action based on a squashed gaussian policy. That means the samples are obtained according to:
        a(s,e)= tanh(mu(s)+sigma(s)+e)
        """
        action_probs = self.forward(state)

        dist = Categorical(action_probs)
        action = dist.sample().to(device)
        # Have to deal with situation of 0.0 probabilities because we can't do log 0
        z = action_probs == 0.0
        z = z.float() * 1e-8
        log_action_probabilities = torch.log(action_probs + z)
        return action.detach().cpu(), action_probs, log_action_probabilities

    def get_det_action(self, state):
        action_probs = self.forward(state)
        dist = Categorical(action_probs)
        action = dist.sample().to(device)
        return action.detach().cpu()


class Critic(nn.Module):
    """Critic (Value) Model."""

    def __init__(self, state_size, action_size, hyperparams):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            hidden_size (int): Number of nodes in the network layers
        """
        super(Critic, self).__init__()
        self.seed = torch.manual_seed(hyperparams["seed"])
        # Initialize first set of convolutional and pooling layers with a ReLU activation function
        self.conv1 = Conv2d(in_channels=1, out_channels=20,
                            kernel_size=(5, 5))
        self.relu1 = ReLU()
        self.maxpool1 = MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # Initialize second set of of convolutional and pooling layers with a ReLU activation function
        self.conv2 = Conv2d(in_channels=20, out_channels=50,
                            kernel_size=(5, 5))
        self.relu2 = ReLU()
        self.maxpool2 = MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # Initialize fully connected layers for glyph output after convolutional and pooling layers
        self.fc1 = Linear(in_features=1600, out_features=500)
        self.relu3 = ReLU()
        self.fc2 = Linear(in_features=500, out_features=128)
        self.relu4 = ReLU()

        # Initialize fully connected for message input
        self.fc3 = Linear(in_features=256, out_features=128)
        self.relu5 = ReLU()

        # Initialize fully connected for combination of glyphs and message
        self.fc4 = Linear(in_features=256, out_features=128)
        self.relu6 = ReLU()

        # To estimate the value function of the state
        self.value_layer = nn.Linear(128, 1)

    def forward(self, state):
        """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
         # Transform the glyph and state arrays into tensors

        # if (len(state) <= 2): # if state is just a dict with 2 keys (glyphs, message)
        if (type(state) is dict):
          message_t  = torch.from_numpy(state["message"]).float().to(device)
          glyphs_t  = torch.from_numpy(state["glyphs"]).float().to(device)
        else: # if state is a batch - here, state is an array of dicts
            glyphs_t  = torch.from_numpy(np.array([ s["glyphs"] for s in state])).float().to(device)
            message_t  = torch.from_numpy(np.array([ s["message"] for s in state])).float().to(device)

            glyphs_t = torch.squeeze(glyphs_t, 1) # remove all dimensions with 1
            message_t = torch.squeeze(message_t, 1) # remove all dimensions with 1

        # Pass the 2D glyphs input through our convolutional and pooling layers
        glyphs_t = self.conv1(glyphs_t)
        glyphs_t = self.relu1(glyphs_t)
        glyphs_t = self.maxpool1(glyphs_t)
        glyphs_t = self.conv2(glyphs_t)
        glyphs_t = self.relu2(glyphs_t)
        glyphs_t = self.maxpool2(glyphs_t)

        # Platten the output from the final pooling layer and pass it through the fully connected layers
        glyphs_t = glyphs_t.reshape(glyphs_t.shape[0], -1)
        glyphs_t = self.fc1(glyphs_t)
        glyphs_t = self.relu3(glyphs_t)
        glyphs_t = self.fc2(glyphs_t)
        glyphs_t = self.relu4(glyphs_t)

        # Pass the message input through a fully connected layer
        message_t = self.fc3(message_t)
        message_t = self.relu5(message_t)

        # Combine glyphs output from convolution and fully connected layers
        # with message output from fully connected layer
        # Cat and Concat are used for different versions of PyTorch
        try:
            combined = torch.cat((glyphs_t,message_t),1)
        except:
            combined = torch.concat([glyphs_t,message_t],1) # 256 x 256 , because message_t and glyphs_t is 256x128 and we concat on 1 dimension (not 0th dim)

        # Pass glyphs and messaged combination through a fully connected layer
        combined = self.fc4(combined)
        combined = self.relu6(combined)

        # Pass the output from the previous fully connected layer through two seperate
        # fully connected layers, one with a single output neuron (to estimate the state value function)
        # and the other with the number of output neurons equal to the number of actions
        # (to estimate the action probabilities)
        state_value = self.value_layer(combined)

        return state_value

Agent

In [5]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
import copy


class SAC(nn.Module):
    """Interacts with and learns from the environment."""

    def __init__(self,
                        state_size,
                        action_size,
                        hyperparams,
                        device
                ):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super(SAC, self).__init__()
        self.state_size = state_size
        self.action_size = action_size

        self.device = device

        self.gamma = hyperparams["discount"]
        self.tau = hyperparams["interpolation_factor"]
        hidden_size = hyperparams["hidden_size"]
        learning_rate = hyperparams["lr"]
        self.clip_grad_param = hyperparams["clip_grad_param"]

        self.target_entropy = -action_size  # -dim(A)

        self.log_alpha = torch.tensor([0.0], requires_grad=True)
        self.alpha = self.log_alpha.exp().detach()
        self.alpha_optimizer = optim.Adam(params=[self.log_alpha], lr=learning_rate)

        # Actor Network

        self.actor_local = Actor(state_size, action_size, hyperparams).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=learning_rate)

        # Critic Network (w/ Target Network)

        self.critic1 = Critic(state_size, action_size, hyperparams).to(device)
        self.critic2 = Critic(state_size, action_size, hyperparams).to(device)

        assert self.critic1.parameters() != self.critic2.parameters()

        self.critic1_target = Critic(state_size, action_size, hyperparams).to(device)
        self.critic1_target.load_state_dict(self.critic1.state_dict())

        self.critic2_target = Critic(state_size, action_size, hyperparams).to(device)
        self.critic2_target.load_state_dict(self.critic2.state_dict())

        self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=learning_rate)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=learning_rate)


    def get_action(self, state):
        """Returns actions for given state as per current policy."""
        with torch.no_grad():
            action = self.actor_local.get_det_action(state)
        return action.numpy()

    def calc_policy_loss(self, states, alpha):
        _, action_probs, log_pis = self.actor_local.evaluate(states)

        q1 = self.critic1(states)
        q2 = self.critic2(states)
        min_Q = torch.min(q1,q2)
        actor_loss = (action_probs * (alpha * log_pis - min_Q )).sum(1).mean()
        log_action_pi = torch.sum(log_pis * action_probs, dim=1)
        return actor_loss, log_action_pi

    def learn(self, step, experiences, gamma, d=1):
        """Updates actor, critics and entropy_alpha parameters using given batch of experience tuples.
        Q_targets = r + γ * (min_critic_target(next_state, actor_target(next_state)) - α *log_pi(next_action|next_state))
        Critic_loss = MSE(Q, Q_target)
        Actor_loss = α * log_pi(a|s) - Q(s,a)
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences


        # ---------------------------- update actor ---------------------------- #
        current_alpha = copy.deepcopy(self.alpha)
        actor_loss, log_pis = self.calc_policy_loss(states, current_alpha.to(self.device))
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Compute alpha loss # α > 0 is called the temperature and determines the trade-off between received rewards and randomness of the policy
        alpha_loss = - (self.log_alpha.exp() * (log_pis.cpu() + self.target_entropy).detach().cpu()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha.exp().detach()

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            _, action_probs, log_pis = self.actor_local.evaluate(next_states)
            Q_target1_next = self.critic1_target(next_states)
            Q_target2_next = self.critic2_target(next_states)
            Q_target_next = action_probs * (torch.min(Q_target1_next, Q_target2_next) - self.alpha.to(self.device) * log_pis)

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * (1 - dones) * Q_target_next.sum(dim=1).unsqueeze(-1))

        # Compute critic loss

        q1 = self.critic1(states).gather(0, actions.long()) # Gathers values along an axis specified by dim. input_tensor.gather(dim, index) . index is long tensor.
        q2 = self.critic2(states).gather(0, actions.long()) # input and index must be same dimension

        critic1_loss = 0.5 * F.mse_loss(q1, Q_targets)
        critic2_loss = 0.5 * F.mse_loss(q2, Q_targets)

        # Update critics
        # critic 1
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward(retain_graph=True)
        clip_grad_norm_(self.critic1.parameters(), self.clip_grad_param)
        self.critic1_optimizer.step()
        # critic 2
        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        clip_grad_norm_(self.critic2.parameters(), self.clip_grad_param)
        self.critic2_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic1, self.critic1_target)
        self.soft_update(self.critic2, self.critic2_target)

        return actor_loss.item(), alpha_loss.item(), critic1_loss.item(), critic2_loss.item(), current_alpha

    def soft_update(self, local_model , target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)

Format states

In [6]:
def format_state(state):
    """Formats the state according to the input requirements of the Actor Critic Neural Network"""

    # Normalize and reshape for convolutional layer input
    glyphs = state["glyphs"]
    glyphs = glyphs/glyphs.max()
    glyphs = glyphs.reshape((1,1,21,79))

    # Normalize the message and reshape for the fully connected layer input
    message = state["message"]
    if state["message"].max()>0:
        # Occassionally the message is empty which will cause a Zero Division error
        message = message/message.max()
    message = message.reshape((1,len(message)))

    state = {"glyphs":glyphs,"message":message}
    return state

Collecting samples

In [7]:
def collect_random(env, dataset, num_samples):
    state = format_state(env.reset())
    for _ in range(num_samples):
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        next_state = format_state(next_state)
        dataset.add(state, action, reward, next_state, done)
        state = next_state
        if done:
            state = format_state(env.reset())

Replay Buffer

In [8]:
import numpy as np
import random
import torch
from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size, device):
        """Initialize a ReplayBuffer object.
        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.device = device
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = np.stack([e.state for e in experiences if e is not None])
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
        next_states = np.stack([e.next_state for e in experiences if e is not None])
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

Training Agent

In [9]:
import gym
import numpy as np
from collections import deque
import torch
import argparse
import glob
import random
from nle import nethack


hyperparams = {
      "run_name": "SAC Discrete",
      # "env_name": "MiniHack-MazeWalk-9x9-v0",
      "env_name": "MiniHack-Room-5x5-v0",
      "episodes": 500,
      "buffer_size": int(1e6), # change
      "seed": 42,
      "log_video": 0,
      "save_every": 100,
      "batch_size": 256,
      "discount":0.99, # discount factor gamma
      "lr": 2e-4, # learning rate alpha
      "hidden_size": 256, # hidden layer size
      "interpolation_factor": 0.005,   #tau - for soft update
      "clip_grad_param":1,# gradient clipping
      "max_episode_steps":1000

  }


np.random.seed(hyperparams["seed"])
random.seed(hyperparams["seed"])
torch.manual_seed(hyperparams["seed"])

MOVE_ACTIONS = tuple(nethack.CompassDirection) + (
                nethack.Command.OPEN,
                nethack.Command.KICK
                )
env = gym.make(hyperparams["env_name"],observation_keys=("glyphs", "chars", "colors", "pixel", "message", "blstats", "pixel_crop"),
        actions=MOVE_ACTIONS,max_episode_steps=hyperparams["max_episode_steps"])

env.seed(hyperparams["seed"])
env.action_space.seed(hyperparams["seed"])
torch.cuda.manual_seed_all(hyperparams["seed"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

steps = 0
average10 = deque(maxlen=10)
total_steps = 0

agent = SAC(state_size=1659,
                  action_size=len(MOVE_ACTIONS),
                  hyperparams=hyperparams,
                  device=device)

buffer = ReplayBuffer(buffer_size=hyperparams["buffer_size"], batch_size=hyperparams["batch_size"], device=device )

collect_random(env=env, dataset=buffer, num_samples=10000)

rewards_arr = []
policy_loss_arr = []
critic1_loss_arr = []

if hyperparams["log_video"]:
    env = gym.wrappers.Monitor(env, './video', video_callable=lambda x: x%10==0, force=True)

for i in range(1, hyperparams["episodes"]+1):
    state = format_state(env.reset())

    episode_steps = 0
    rewards = 0
    print(i)
    while True:
        action = agent.get_action(state)
        steps += 1
        next_state, reward, done, _ = env.step(action.item())
        next_state = format_state(next_state)

        buffer.add(state, action, reward, next_state, done)

        policy_loss, alpha_loss, bellmann_error1, bellmann_error2, current_alpha = agent.learn(steps, buffer.sample(), hyperparams["discount"])
        state = next_state
        rewards += reward
        episode_steps += 1
        if done:
            break



    average10.append(rewards)
    total_steps += episode_steps
    print("Episode: {} | Reward: {} | Policy Loss: {} | Steps: {}".format(i, rewards, policy_loss, steps,))
    rewards_arr.append(rewards)
    policy_loss_arr.append(policy_loss)
    critic1_loss_arr.append(bellmann_error1)


1
Episode: 1 | Reward: -0.9300000000000006 | Policy Loss: -5.128635883331299 | Steps: 191
2
Episode: 2 | Reward: 0.6399999999999999 | Policy Loss: -5.229089736938477 | Steps: 267
3
Episode: 3 | Reward: 0.4999999999999998 | Policy Loss: -5.936838150024414 | Steps: 358
4
Episode: 4 | Reward: -0.17000000000000082 | Policy Loss: -7.892458915710449 | Steps: 627
5
Episode: 5 | Reward: 0.22999999999999954 | Policy Loss: -8.645995140075684 | Steps: 761
6
Episode: 6 | Reward: 0.87 | Policy Loss: -8.789254188537598 | Steps: 797
7
Episode: 7 | Reward: 0.96 | Policy Loss: -9.39426326751709 | Steps: 812
8
Episode: 8 | Reward: -0.2900000000000001 | Policy Loss: -9.918540000915527 | Steps: 887
9
Episode: 9 | Reward: 0.92 | Policy Loss: -9.894004821777344 | Steps: 914
10
Episode: 10 | Reward: -0.37000000000000016 | Policy Loss: -10.444175720214844 | Steps: 983
11


KeyboardInterrupt: ignored

In [None]:
from google.colab import drive
drive.mount('/content/drive')

np.savetxt('/content/drive/MyDrive/reinforcement_learning/room_sac_minihack_conv/rewards.txt', rewards_arr)
np.savetxt('/content/drive/MyDrive/reinforcement_learning/room_sac_minihack_conv/policy_loss.txt', policy_loss_arr)
np.savetxt('/content/drive/MyDrive/reinforcement_learning/room_sac_minihack_conv/critic1_loss.txt', critic1_loss_arr)
print("total number of steps: ", steps)


In [None]:
import matplotlib.pyplot as plt

plt.plot(rewards_arr)
# plt.title('Average Reward on Mazewalk-9x9v0')
plt.title('Average Reward on Room-5x5v0')
plt.ylabel('Average Reward')
plt.xlabel('Episode')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

plt.plot(policy_loss_arr)
plt.title('Policy Loss on Room-5x5v0')
plt.ylabel('Loss')
plt.xlabel('Episode')
plt.legend()
plt.show()

In [None]:
plt.plot(critic1_loss_arr)
plt.title('Critic 1 loss on Room-5x5v0')
plt.ylabel('Critic Loss')
plt.xlabel('Episode')
plt.legend()
plt.show()