# In [None]:
import os
import gym
from stable_baselines3 import PPO, TD3, DDPG, A2C , DQN, SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np
import time
# Name of the Stable-Baselines3 algorithm to train. It has to match one of the
# names handled where the model is constructed further down in this script.
algorithm_name = 'A2C'

# Gym environment id to train on. Other ids the author listed as candidates:
#   'CartPole-v1', 'Acrobot-v1', 'MountainCar-v0', 'MountainCarContinuous-v0',
#   'LunarLander-v2', 'BipedalWalker-v3', 'CarRacing-v0', 'Pendulum-v1'
# For 'BipedalWalker-v3', pass hardcore=True to get the hardcore variant.
environment_name = "CartPole-v1"

# Instantiate the environment used for the random baseline and for training.
env = gym.make(environment_name)


def new_reward_function(state, action, next_state):
    """Return a reward that shrinks as the cart position moves away from 0.

    Only ``state[0]`` (treated as the cart position) is read; ``action`` and
    ``next_state`` are accepted for signature compatibility but are unused.

    Returns:
        ``1 - abs(state[0])``: 1 when the position is exactly 0, decreasing
        linearly with its magnitude (negative once the magnitude exceeds 1).
    """
    distance_from_zero = abs(state[0])
    return 1 - distance_from_zero

class CustomRewardWrapper(gym.Wrapper):
    """Replace the wrapped environment's reward with ``reward_func``'s output.

    Merely assigning ``env.reward_func = ...`` has no effect, because nothing
    in this script (nor the environment's own ``step``) ever reads that
    attribute. This wrapper intercepts ``step`` and substitutes the reward.

    The wrapper follows the same API the rest of this script uses:
    ``reset()`` returns an observation and ``step()`` returns the 4-tuple
    ``(observation, reward, done, info)``.

    Args:
        env: The environment to wrap.
        reward_func: Callable ``(state, action, next_state) -> reward`` where
            ``state`` is the observation seen before the action was taken.
    """

    def __init__(self, env, reward_func):
        super().__init__(env)
        # Kept under the same attribute name the original script assigned.
        self.reward_func = reward_func
        # Observation preceding the next action; set by reset() and step().
        self._last_state = None

    def reset(self, **kwargs):
        """Reset the wrapped environment and remember the first observation."""
        self._last_state = self.env.reset(**kwargs)
        return self._last_state

    def step(self, action):
        """Step the wrapped environment, discarding its native reward."""
        next_state, _, done, info = self.env.step(action)
        reward = self.reward_func(self._last_state, action, next_state)
        self._last_state = next_state
        return next_state, reward, done, info


# Route every subsequent step through the custom reward function.
env = CustomRewardWrapper(env, new_reward_function)



# Baseline: play a few episodes with randomly sampled actions and report the
# per-episode return together with its mean and standard deviation.
episodes = 5
scores = []

for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0

    # Every episode takes at least one step; stop once the env reports done.
    while True:
        # env.render() can be called here to watch the rollout.
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    scores.append(score)
    print('Episode:{} Score:{}'.format(episode, score))

env.close()

# Population statistics (NumPy's default ddof=0) over the episode returns.
score_array = np.asarray(scores)
mean_score = score_array.mean()
std_score = score_array.std()

print('Mean Average Score: {:.2f}'.format(mean_score))
print('Standard Deviation: {:.2f}'.format(std_score))



# TensorBoard event files are written below Training/Logs.
log_path = os.path.join('Training', 'Logs')

# Supported algorithm names mapped to their Stable-Baselines3 classes.
algorithm_classes = {
    'PPO': PPO,
    'TD3': TD3,
    'DDPG': DDPG,
    'A2C': A2C,
    'DQN': DQN,
    'SAC': SAC,
}

# Reject unknown names up front with the same error as before.
if algorithm_name not in algorithm_classes:
    raise ValueError(f"Invalid algorithm name: {algorithm_name}")

# Every algorithm is built with identical arguments: an MLP policy on `env`.
model = algorithm_classes[algorithm_name](
    'MlpPolicy', env, verbose=1, tensorboard_log=log_path
)

# Build a fresh environment wrapped in a single-env DummyVecEnv. The model
# above was already constructed with the previous `env`; this new one is what
# the evaluation loop further down interacts with.
env = DummyVecEnv([lambda: gym.make(environment_name)])


# NOTE: despite its name, this value is passed to learn() as `total_timesteps`,
# i.e. it is a budget of environment steps, not a number of episodes. The name
# is kept because the saved-model file name is derived from it.
number_of_episodes = 100000

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()

model.learn(total_timesteps=number_of_episodes)

end_time = time.perf_counter()

# Compute seconds directly instead of converting to ms and back.
elapsed_time_sec = end_time - start_time
elapsed_time_ms = elapsed_time_sec * 1000

print("Training time: {:.3f} sec".format(elapsed_time_sec))


# Checkpoint name encodes the algorithm, the environment id and the training
# budget, e.g. 'PPO_CartPole-v1_10000'.
model_name = f"{algorithm_name}_{environment_name}_{number_of_episodes}"

# Saved models live under Training/Saved Models/<model_name>.
saved_models_dir = os.path.join('Training', 'Saved Models')
Algorithm_Path = os.path.join(saved_models_dir, model_name)

model.save(Algorithm_Path)



# Evaluate the trained model for a few episodes and summarise the returns.
#
# `env` is a vectorized environment at this point, so step() hands back
# array-valued rewards and done flags. They are reduced to plain Python
# scalars so that the printed scores and statistics are numbers rather than
# one-element arrays.
episodes = 5
scores = []
for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0.0

    while not done:
        env.render()
        action, _ = model.predict(obs)  # actions now come from the trained model
        obs, reward, done, info = env.step(action)
        # With the single wrapped env these reductions just unwrap the value.
        score += float(np.sum(reward))
        done = bool(np.all(done))

    print('Episode:{} Score:{}'.format(episode, score))
    scores.append(score)

env.close()

# Same population statistics (ddof=0) as the hand-written formula, computed
# with NumPy for consistency with the random-baseline block.
mean_score = float(np.mean(scores))
std_dev = float(np.std(scores))

print(f"Mean Score: {mean_score}")
print(f"Standard Deviation: {std_dev}")


# Uncomment the line below to delete the in-memory model object.
#del model
