In [1]:
%load_ext autoreload
%autoreload 2

import os
import gym
from matplotlib import pyplot as plt
import numpy as np
from ddpg_torch.ddpg_agent import Agent

from envs.escape_room_continuous_space_env import EscapeRoomEnv
from tqdm import trange

In [2]:


def train_diff_robot_custom_env(alpha=0.0001, beta=0.001, tau=0.001, n_games=1000):
    """Train an ``Agent`` on ``EscapeRoomEnv`` for ``n_games`` episodes.

    Args:
        alpha: Forwarded to ``Agent(alpha=...)`` (presumably the actor
            learning rate -- confirm against ``Agent``).
        beta: Forwarded to ``Agent(beta=...)`` (presumably the critic
            learning rate -- confirm against ``Agent``).
        tau: Forwarded to ``Agent(tau=...)``.
        n_games: Number of episodes to run.

    Returns:
        A tuple ``(score_history, critic_losses, actor_losses, figure_file)``:
        the per-episode cumulative reward, the critic and actor loss values
        from the *last* step of each episode (``0`` when ``agent.learn()``
        returned nothing on that step), and the path the training plot is
        meant to be saved to. Nothing is written to ``figure_file`` here.
    """
    env = EscapeRoomEnv()
    agent = Agent(
        alpha=alpha,
        beta=beta,
        input_dims=env.observation_space.shape,
        tau=tau,
        batch_size=64,
        fc1_dims=400,
        fc2_dims=300,
        n_actions=env.action_space.shape[0],
    )

    filename = f"EscapeRoom_alpha_{agent.alpha}_beta_{agent.beta}_{n_games}_games"
    figure_file = f"plots/{filename}.png"
    score_history = []
    critic_losses = []
    actor_losses = []

    # Checkpoint roughly every 10% of the run. Clamp to 1 so that runs with
    # fewer than 10 episodes do not divide by zero in the modulo below.
    save_interval = max(1, n_games // 10)
    pbar = trange(n_games)

    for i in pbar:
        state, info = env.reset()
        done = False
        score = 0

        while not done:
            action = agent.choose_action(state)
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            agent.remember(state, action, reward, next_state, done)
            # learn() may return nothing; fall back to zeros for reporting on those steps.
            critic_loss, actor_loss = agent.learn() or (0, 0)

            score += reward
            state = next_state

        score_history.append(score)
        critic_losses.append(critic_loss)
        actor_losses.append(actor_loss)

        # Running mean over every episode so far, including the current one.
        avg_score = np.mean(score_history)

        if i % save_interval == 0 or i == n_games - 1:
            agent.save_models()

        # Do not let a missing 'reason' key abort training just for the progress text.
        reason = info.get("reason", "unknown")
        pbar.set_description(
            f"Episode {i}: Score {score:.1f}, Info : {reason}, "
            f"Average Score {avg_score:.3f}, "
            f"Actor Losses {actor_losses[-1]:.3f}, "
            f"Critic Losses {critic_losses[-1]:.3f}"
        )

    return score_history, critic_losses, actor_losses, figure_file

In [3]:
# Run training with the function's default hyperparameters (n_games=1000) and keep the
# per-episode histories plus the plot path for the plotting cell further down.
score_history, critic_losses, actor_losses, figure_file = train_diff_robot_custom_env()

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  state = T.tensor([observation], dtype=T.float).to(self.actor.device)
Episode 53: Score 4080.3, Info : Goal_reached, Average Score -353.742, Actor Losses 2.588, Critic Losses 5.618:   5%|▌         | 54/1000 [01:05<30:06,  1.91s/it]      

Goal 'G' reached in 1085 steps with cumulative reward 5523.249428372609 for this episode.


Episode 96: Score 6496.9, Info : Goal_reached, Average Score -718.943, Actor Losses -57.780, Critic Losses 28.280:  10%|▉         | 97/1000 [03:36<44:34,  2.96s/it]         

Goal 'G' reached in 173 steps with cumulative reward 6266.664341183488 for this episode.


Episode 98: Score 5764.5, Info : Goal_reached, Average Score -683.163, Actor Losses -85.068, Critic Losses 203.605:  10%|▉         | 99/1000 [03:44<49:47,  3.32s/it]       

Goal 'G' reached in 440 steps with cumulative reward 5856.54732121648 for this episode.


Episode 107: Score 6324.3, Info : Goal_reached, Average Score -769.969, Actor Losses -140.388, Critic Losses 54.496:  11%|█         | 108/1000 [04:24<36:55,  2.48s/it]        

Goal 'G' reached in 227 steps with cumulative reward 6143.134454265237 for this episode.


Episode 108: Score 5327.6, Info : Goal_reached, Average Score -714.028, Actor Losses -109.288, Critic Losses 80.587:  11%|█         | 109/1000 [04:26<34:56,  2.35s/it]

Goal 'G' reached in 663 steps with cumulative reward 5696.630650404789 for this episode.


Episode 109: Score 4326.6, Info : Goal_reached, Average Score -668.203, Actor Losses -117.292, Critic Losses 25.907:  11%|█         | 110/1000 [04:29<38:11,  2.58s/it]

Goal 'G' reached in 1011 steps with cumulative reward 5546.486550187122 for this episode.


Episode 131: Score 6392.2, Info : Goal_reached, Average Score -906.420, Actor Losses -191.287, Critic Losses 14.130:  13%|█▎        | 132/1000 [06:17<55:49,  3.86s/it]         

Goal 'G' reached in 224 steps with cumulative reward 6147.590853588412 for this episode.


Episode 144: Score 3589.6, Info : Goal_reached, Average Score -994.267, Actor Losses -199.750, Critic Losses 25.560:  14%|█▍        | 145/1000 [07:23<1:10:21,  4.94s/it]        

Goal 'G' reached in 1448 steps with cumulative reward 5434.688891756947 for this episode.


Episode 153: Score 3541.4, Info : Goal_reached, Average Score -977.520, Actor Losses -211.044, Critic Losses 41.714:  15%|█▌        | 154/1000 [07:59<1:01:14,  4.34s/it]        

Goal 'G' reached in 1395 steps with cumulative reward 5444.9037380709 for this episode.


Episode 155: Score 6262.1, Info : Goal_reached, Average Score -935.732, Actor Losses -207.799, Critic Losses 110.013:  16%|█▌        | 156/1000 [08:04<47:41,  3.39s/it]   

Goal 'G' reached in 265 steps with cumulative reward 6073.988383059912 for this episode.


Episode 162: Score -1880.4, Info : out_of_bounds, Average Score -988.260, Actor Losses -209.332, Critic Losses 26.686:  16%|█▋        | 163/1000 [08:32<58:24,  4.19s/it]     

In [None]:
import matplotlib.pyplot as plt

def plot_training_results(x, scores, critic_losses, actor_losses, figure_file):
    """Plot scores, critic losses and actor losses as three stacked panels.

    The figure is saved to ``figure_file`` and then shown.

    Args:
        x: Values for the shared x-axis (labelled "Episode").
        scores: Series drawn on the first panel.
        critic_losses: Series drawn on the second panel.
        actor_losses: Series drawn on the third panel.
        figure_file: Destination passed to ``savefig``. When it is a path
            with a directory component, that directory is created first so
            saving does not fail on a fresh checkout.
    """
    import os  # local import keeps this cell runnable on its own

    # (series, legend label, line colour, panel title, y-axis label)
    panels = (
        (scores, 'Score per Episode', 'blue', 'Scores Over Episodes', 'Score'),
        (critic_losses, 'Critic Loss per Episode', 'red', 'Critic Loss Over Episodes', 'Loss'),
        (actor_losses, 'Actor Loss per Episode', 'green', 'Actor Loss Over Episodes', 'Loss'),
    )

    fig, axs = plt.subplots(len(panels), 1, figsize=(10, 15))

    for ax, (series, label, color, title, ylabel) in zip(axs, panels):
        ax.plot(x, series, label=label, color=color)
        ax.set_title(title)
        ax.set_xlabel('Episode')
        ax.set_ylabel(ylabel)
        ax.grid(True)
        ax.legend()

    fig.tight_layout()

    # savefig does not create missing directories; make sure the target exists.
    # Only path-like destinations are handled; file objects are passed through as-is.
    if isinstance(figure_file, (str, os.PathLike)):
        out_dir = os.path.dirname(figure_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    fig.savefig(figure_file)
    plt.show()


In [None]:

# Number episodes from 1 so the x-axis reads naturally.
episodes = [episode_idx + 1 for episode_idx in range(len(score_history))]
plot_training_results(
    episodes,
    score_history,
    critic_losses,
    actor_losses,
    figure_file,
)


In [None]:
def load_and_simulate(env, agent, n_episodes=5, max_steps=500, render=True):
    """Roll out ``agent`` in ``env`` for a few episodes and report the results.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``, ``render()``
            and ``close()``; ``step`` must return the 5-tuple
            ``(state, reward, terminated, truncated, info)``.
        agent: Object exposing ``choose_action(state)``.
        n_episodes: Number of episodes to run.
        max_steps: Per-episode step cap; the episode is cut off once this
            many steps have been taken even if the env has not finished.
        render: When true (the default, matching the previous behaviour),
            ``env.render()`` is called before every step.

    Returns:
        ``(rewards, steps_per_episode)``: two lists with one entry per
        completed episode.

    Note:
        ``env`` is closed before returning, and also when a rollout raises,
        so the caller must not reuse it afterwards.
    """
    rewards = []
    steps_per_episode = []

    try:
        for episode in range(n_episodes):
            state, info = env.reset()
            done = False
            total_reward = 0
            steps = 0

            while not done:
                if render:
                    env.render()
                action = agent.choose_action(state)
                state, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                total_reward += reward
                steps += 1

                if steps >= max_steps:
                    break

            print(f"Episode {episode + 1}: Total reward = {total_reward}, Steps = {steps}")
            rewards.append(total_reward)
            steps_per_episode.append(steps)
    finally:
        # Always release the environment (e.g. a render window), even if a
        # rollout fails or the kernel is interrupted mid-episode.
        env.close()

    return rewards, steps_per_episode


In [None]:
def main():
    """Load saved agent weights and run a short rendered rollout.

    Builds an ``EscapeRoomEnv`` and an ``Agent`` with the same network sizes
    and hyperparameters used as defaults for training, restores the agent via
    ``agent.load_models()``, and runs ``load_and_simulate`` for 5 episodes.

    Returns:
        The ``(rewards, steps_per_episode)`` tuple produced by
        ``load_and_simulate`` (previously computed and discarded).
    """
    # NOTE(review): the env is built with max_steps_per_episode=500 while the
    # rollout cap below is 1000; if the env truncates at 500 the larger cap is
    # never reached -- confirm which limit is intended.
    env = EscapeRoomEnv(max_steps_per_episode=500)
    agent = Agent(
        alpha=0.0001,
        beta=0.001,
        input_dims=env.observation_space.shape,
        tau=0.001,
        fc1_dims=400,
        fc2_dims=300,
        n_actions=env.action_space.shape[0],
        batch_size=64,
    )

    # Assume agent.load_models is properly implemented
    agent.load_models()

    return load_and_simulate(env, agent, n_episodes=5, max_steps=1000)

# Entry-point guard: run the evaluation demo only when this code executes as the main module.
if __name__ == "__main__":
    main()
