In [9]:
import numpy as np
import gym
import os

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

import uniform_attack

In [10]:
# Load model
current_dir = os.getcwd()
save_dir = os.path.abspath(os.path.join(current_dir, os.pardir)) + "/ppo/models/"
model = PPO.load(save_dir + "/cartpole-v1/cartpole_v1_ppo_15000")

In [11]:
# Set environment
env = gym.make('CartPole-v1')

In [12]:
# Evaluate model - for info on why the env is wrapped with Monitor check the evaluate_policy function
mean_reward, std_reward = evaluate_policy(model, Monitor(env), n_eval_episodes=100)
print(f"mean_reward : {mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward : 500.00 +/- 0.00


In [16]:
num_episodes = 100
all_episodes_rewards = []
all_episodes_perturbation = []

for i in range(num_episodes):
    episode_rewards = []
    episode_perturbations = []
    done = False
    obs = env.reset()
    
    while not done:

        ### uniform attack ###
        adversarial_sample, perturbation = uniform_attack.perturbate(env, obs, [.01, 0.75])
        ### uniform attack ###        
        
        action, _states = model.predict(adversarial_sample)
        obs, reward, done, info = env.step(action)
        episode_rewards.append(reward)
        episode_perturbations.append(perturbation)

    all_episodes_rewards.append(sum(episode_rewards))
    all_episodes_perturbation.append(sum(episode_perturbations))

print(f"Finished!")
print(f"Final mean reward: {np.mean(all_episodes_rewards):.2f}")
print(f"Mean perturbation: {np.mean(all_episodes_perturbation):.2f}")
print(f"Num episodes: {num_episodes}")

Finished!
Final mean reward: 10.31
Mean perturbation: 1.96
Num episodes: 100
