# Agent Training

Importing dependencies.

In [1]:
from __future__ import annotations
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch.distributions.normal import Normal
from stable_baselines3 import PPO, TD3
import time
import gymnasium as gym
import pickle
import warnings
# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation notices from the RL libraries;
# consider a narrower filter while debugging.
warnings.filterwarnings("ignore")


# Default (width, height) in inches for every matplotlib figure in this notebook.
plt.rcParams["figure.figsize"] = (10, 5)

For agent training, we set up two different environments:
- a default environment, with the rewards unchanged (with the exception of the healthy range, which we keep constant in all of our environments)
- a changed environment, with our custom rewards in an attempt to see a difference in how the agent learns

In each environment, we initialize 3 seeds and a different agent for every seed. The agent learns through a PPO algorithm using an MLP policy. 
For each seed, the corresponding agent learns for 2 million timesteps and is then tested for 1000 episodes. Testing is always done in the default environment so that the results are comparable. The goal is to compare the rewards that the differently trained agents receive when tested on the default environment.

## Default Env Agent Training and Testing Implementation

Training 3 agents on the default environment.

In [2]:
# Train one PPO agent per seed on the default-reward environment, then evaluate
# each agent on the test environment and collect the per-episode returns.
rewards_over_seeds = []  # one list of test-episode returns per seed

for seed in [1, 2, 3]:
    print(f"Starting seed {seed}")
    # Rewards stay at their defaults; only the healthy range is overridden so the
    # training environment uses the same range as the test environment.
    env_train = gym.make("Humanoid-v4", healthy_z_range=(0.9, 2.0))
    env_test = gym.make("Humanoid-v4", healthy_z_range=(0.9, 2.0))
    # Record each episode reward
    wrapped_env_train = gym.wrappers.RecordEpisodeStatistics(env_train)
    wrapped_env_test = gym.wrappers.RecordEpisodeStatistics(env_test)

    # Set seed
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # Initializing and training the agent. Passing `seed` to PPO lets
    # Stable-Baselines3 seed the agent and its training environment as well.
    print("Starting training")
    agent = PPO(
        'MlpPolicy',
        wrapped_env_train,
        verbose=0,
        tensorboard_log="./logs_default",
        seed=seed,
    )
    # Training results are stored in tensorboard logs under ./logs_default.
    agent.learn(
        total_timesteps=2000000,
        progress_bar=True,
        tb_log_name="PPO_" + str(seed),
        reset_num_timesteps=False,
    )
    agent.save("agent_default_" + str(seed))
    del agent
    env_train.close()

    print("Starting testing phase")
    agent = PPO.load("agent_default_" + str(seed), env=wrapped_env_test)
    reward_over_episodes = []

    # Testing the agent
    for episode in range(1000):
        # Seed only the first reset. Re-seeding on every reset would restart the
        # environment's RNG each time, so every episode would begin from the
        # same initial state.
        obs, info = wrapped_env_test.reset(seed=seed if episode == 0 else None)
        done = False

        while not done:
            # NOTE(review): predict() samples actions by default; pass
            # deterministic=True if a deterministic evaluation is wanted.
            action, _states = agent.predict(obs)
            obs, reward, terminated, truncated, info = wrapped_env_test.step(action)
            done = terminated or truncated

        # The newest entry of the wrapper's return queue belongs to the episode
        # that just ended. The [0] presumably unwraps a length-1 array in the
        # Gymnasium version used here -- verify when upgrading.
        reward_over_episodes.append(wrapped_env_test.return_queue[-1][0])

    rewards_over_seeds.append(reward_over_episodes)
    env_test.close()
    print()

# Store the rewards for further analysis
with open('rewards_default', 'wb') as f:
    pickle.dump(rewards_over_seeds, f)

Starting seed 1
Starting training
Starting testing phase

Starting seed 2
Starting training
Starting testing phase

Starting seed 3
Starting training
Starting testing phase



## Changed Env Agent Training and Testing Implementation

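Training 3 agents on our custom environments, following the same structure as above. Training is split into two phases with different reward settings: 700,000 timesteps with `healthy_reward=10` and `forward_reward_weight=0.625`, followed by 1,300,000 timesteps with `healthy_reward=2.5` and `forward_reward_weight=2.5`.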

In [1]:
# Train one PPO agent per seed in two phases on environments with custom reward
# settings, then evaluate each agent on the default-reward test environment and
# collect the per-episode returns.
rewards_over_seeds = []  # one list of test-episode returns per seed

for seed in [1, 2, 3]:
    print(f"Starting seed {seed}")
    # Phase 1 uses a larger healthy reward and a smaller forward-reward weight;
    # phase 2 sets both to 2.5.
    env_train_1 = gym.make(
        "Humanoid-v4",
        healthy_reward=10,
        forward_reward_weight=0.625,
        healthy_z_range=(0.9, 2.0),
    )
    env_train_2 = gym.make(
        "Humanoid-v4",
        healthy_reward=2.5,
        forward_reward_weight=2.5,
        healthy_z_range=(0.9, 2.0),
    )
    env_test = gym.make("Humanoid-v4", healthy_z_range=(0.9, 2.0))
    # Record each episode reward
    wrapped_env_train_1 = gym.wrappers.RecordEpisodeStatistics(env_train_1)
    wrapped_env_train_2 = gym.wrappers.RecordEpisodeStatistics(env_train_2)
    wrapped_env_test = gym.wrappers.RecordEpisodeStatistics(env_test)

    # Set seed
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # Passing `seed` to PPO lets Stable-Baselines3 seed the agent and its
    # training environment as well.
    print("Starting 1st phase of training")
    agent = PPO(
        'MlpPolicy',
        wrapped_env_train_1,
        verbose=0,
        tensorboard_log="./logs_changed",
        seed=seed,
    )
    agent.learn(
        total_timesteps=700000,
        progress_bar=True,
        tb_log_name="PPO_1_" + str(seed),
        reset_num_timesteps=False,
    )
    agent.save("agent_changed_1_" + str(seed))
    del agent
    env_train_1.close()

    # Reload the phase-1 agent on the second training environment and keep
    # training; reset_num_timesteps=False continues its timestep counter.
    print("Starting 2nd phase of training")
    agent = PPO.load("agent_changed_1_" + str(seed), env=wrapped_env_train_2)
    agent.learn(
        total_timesteps=1300000,
        progress_bar=True,
        tb_log_name="PPO_2_" + str(seed),
        reset_num_timesteps=False,
    )
    agent.save("agent_changed_2_" + str(seed))
    del agent
    env_train_2.close()

    print("Starting testing phase")
    agent = PPO.load("agent_changed_2_" + str(seed), env=wrapped_env_test)
    reward_over_episodes = []

    # Testing the agent
    for episode in range(1000):
        # Seed only the first reset. Re-seeding on every reset would restart the
        # environment's RNG each time, so every episode would begin from the
        # same initial state.
        obs, info = wrapped_env_test.reset(seed=seed if episode == 0 else None)
        done = False

        while not done:
            # NOTE(review): predict() samples actions by default; pass
            # deterministic=True if a deterministic evaluation is wanted.
            action, _states = agent.predict(obs)
            obs, reward, terminated, truncated, info = wrapped_env_test.step(action)
            done = terminated or truncated

        # The newest entry of the wrapper's return queue belongs to the episode
        # that just ended. The [0] presumably unwraps a length-1 array in the
        # Gymnasium version used here -- verify when upgrading.
        reward_over_episodes.append(wrapped_env_test.return_queue[-1][0])

    rewards_over_seeds.append(reward_over_episodes)
    env_test.close()
    print()

# Store the rewards for further analysis
with open('rewards_changed', 'wb') as f:
    pickle.dump(rewards_over_seeds, f)

Starting seed 1
Starting 1st phase of training
Starting 2nd phase of training
Starting test phase

Starting seed 2
Starting 1st phase of training
Starting 2nd phase of training
Starting test phase

Starting seed 3
Starting 1st phase of training
Starting 2nd phase of training
Starting test phase

