In [108]:
import os
import gym
from stable_baselines3 import PPO, TD3, DDPG, A2C , DQN, SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np
import time
algorithm_name = 'A2C'

from gym.envs.box2d import BipedalWalker

#this code is different from the usual code as it defines the bipedal walker environment
#as its hardcore version

class BipedalWalkerHardcore(BipedalWalker):
    hardcore = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        

environment_name = 'BipedalWalkerHardcore-v3'
gym.register(
    id=environment_name,
    entry_point='__main__:BipedalWalkerHardcore',
    max_episode_steps=1600,
    reward_threshold=300,
)

env = gym.make(environment_name)


In [109]:
episodes = 10
scores = []

for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    
    scores.append(score)
    print('Episode:{} Score:{}'.format(episode, score))

env.close()


#below will calculate and print mean score and standard deviation, copy and paste this into a csv file for later
#data analysis
mean_score = np.mean(scores)
std_score = np.std(scores)
print('Mean Average Score: {:.2f}'.format(mean_score))
print('Standard Deviation: {:.2f}'.format(std_score))

Episode:1 Score:-116.65975143558842
Episode:2 Score:-82.28484027651739
Episode:3 Score:-108.79024443098399
Episode:4 Score:-103.77837833104911
Episode:5 Score:-102.94512332658967
Episode:6 Score:-114.04291414284458
Episode:7 Score:-79.13657873619398
Episode:8 Score:-121.3063411835699
Episode:9 Score:-105.42631537883946
Episode:10 Score:-104.00392704425938
Mean Average Score: -103.84
Standard Deviation: 12.95


In [84]:
env.close()

In [104]:
log_path = os.path.join('Training', 'Logs')

if algorithm_name == 'PPO':
    model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
elif algorithm_name == 'TD3':
    model = TD3('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
elif algorithm_name == 'DDPG':
    model = DDPG('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
elif algorithm_name == 'A2C':
    model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
elif algorithm_name == 'DQN':
    model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
elif algorithm_name == 'SAC':
    model = SAC('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
else:
    raise ValueError(f"Invalid algorithm name: {algorithm_name}")

env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [105]:
start_time = time.time()
number_of_episodes = 20000


model.learn(total_timesteps=number_of_episodes)

end_time = time.time()
elapsed_time_ms = (end_time - start_time) * 1000
elapsed_time_sec = elapsed_time_ms / 1000

print("Training time: {:.3f} sec".format(elapsed_time_sec))

Logging to Training/Logs/A2C_52
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 73.2     |
|    ep_rew_mean        | -107     |
| time/                 |          |
|    fps                | 899      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -5.65    |
|    explained_variance | -4.12    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -1.73    |
|    std                | 0.992    |
|    value_loss         | 0.0942   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 73.2     |
|    ep_rew_mean        | -107     |
| time/                 |          |
|    fps                | 940      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    t

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 491      |
|    ep_rew_mean        | -114     |
| time/                 |          |
|    fps                | 981      |
|    iterations         | 1400     |
|    time_elapsed       | 7        |
|    total_timesteps    | 7000     |
| train/                |          |
|    entropy_loss       | -5.19    |
|    explained_variance | -5.81    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | -0.346   |
|    std                | 0.887    |
|    value_loss         | 0.00905  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 584      |
|    ep_rew_mean        | -116     |
| time/                 |          |
|    fps                | 983      |
|    iterations         | 1500     |
|    time_elapsed       | 7        |
|    total_timesteps    | 7500     |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 467      |
|    ep_rew_mean        | -110     |
| time/                 |          |
|    fps                | 984      |
|    iterations         | 2700     |
|    time_elapsed       | 13       |
|    total_timesteps    | 13500    |
| train/                |          |
|    entropy_loss       | -4.86    |
|    explained_variance | -0.427   |
|    learning_rate      | 0.0007   |
|    n_updates          | 2699     |
|    policy_loss        | 0.952    |
|    std                | 0.816    |
|    value_loss         | 0.0719   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 467      |
|    ep_rew_mean        | -110     |
| time/                 |          |
|    fps                | 984      |
|    iterations         | 2800     |
|    time_elapsed       | 14       |
|    total_timesteps    | 14000    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 289      |
|    ep_rew_mean        | -107     |
| time/                 |          |
|    fps                | 983      |
|    iterations         | 4000     |
|    time_elapsed       | 20       |
|    total_timesteps    | 20000    |
| train/                |          |
|    entropy_loss       | -4.57    |
|    explained_variance | 0.223    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3999     |
|    policy_loss        | -0.602   |
|    std                | 0.76     |
|    value_loss         | 0.0327   |
------------------------------------
Training time: 20.340 sec


#### model_name = f"{algorithm_name}_{environment_name}_{number_of_episodes}"
Algorithm_Path = os.path.join('Training', 'Saved Models', model_name)

model.save(Algorithm_Path)

In [106]:
model_name = f"{algorithm_name}{environment_name}{number_of_episodes}"
Algorithm_Path = os.path.join('Training', 'Saved Models', model_name)

model.save(Algorithm_Path)

In [107]:
episodes = 5
scores = []
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        #env.render()
        action, _ = model.predict(obs) # WE ARE NOW USING OUR MODEL
        obs, reward, done, info = env.step(action)
        score += reward
    
    print('Episode:{} Score:{}'.format(episode, score))
    scores.append(score)
    
env.close()


mean_score = sum(scores) / len(scores)
std_dev = (sum((score - mean_score)**2 for score in scores) / len(scores))**0.5

print(f"Mean Score: {mean_score}")
print(f"Standard Deviation: {std_dev}")



#del model

Episode:1 Score:[-101.453575]
Episode:2 Score:[-100.47365]
Episode:3 Score:[-99.83355]
Episode:4 Score:[-135.19908]
Episode:5 Score:[-101.814316]
Mean Score: [-107.75484]
Standard Deviation: [13.740048]


In [102]:
del model # run this if you want to delete the model. the reason we are deleting the model
#is because we don't really need the neural networks after we have benchmarked them and recorded the data