In [91]:
import numpy as np
import math
import gym

In [29]:
from stable_baselines3 import PPO

In [30]:
# Load model
save_dir = "./models/ppo"
model = PPO.load(save_dir + f"/cartpole-v1/cartpole_v1_ppo_100k")

In [37]:
# Set environment
env = gym.make('CartPole-v1')

In [31]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [33]:
# Evaluate model - for info on why the env is wrapped with Monitor check the evaluate_policy function
mean_reward, std_reward = evaluate_policy(model, Monitor(env), n_eval_episodes=100)
print(f"mean_reward : {mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward : 500.00 +/- 0.00


In [194]:
# Uniform attack on CartPole-v1
def pertubate(env, obs):
    """Pertubate the observation to hinder the agent."""

    # Threshholds
    # 2.4 for x
    # 12 * 2 * math.pi / 360 for theta
    x_threshold = env.x_threshold
    theta_threshold = env.theta_threshold_radians
    
    # Get the cart's x position and pole's angle 
    x = obs[0]
    theta = obs[2]

    # Calculate which variable is closer to termination condition
    x_danger = np.absolute((x_threshold - np.absolute(x)) / x_threshold)
    theta_danger = np.absolute((theta_threshold / 360 - np.absolute(theta)) / theta_threshold)
    
    # print(f"x: {x}, danger: {x_danger}")
    # print(f"theta: {theta}, danger: {theta_danger}")

    # Actions are encoded as 
    # '0' -> push cart to the left
    # '1' -> push card to the right

    # Calculate the target action to minimize reward
    # The following favors theta over x in case of equal danger
    if x_danger > theta_danger:
        if x < 0:
            target_action =  0 # If x is already to the left go further to the left
        elif x > 0:
            target_action =  1 # If x is already to the right go further to the right
        else: 
            target_action =  np.random.choice([0, 1]) # Note that this case suggests that x = theta = 0
    else:
        if theta < 0:
            target_action =  1 # If the pole is angled to the left -> go to the right
        elif theta > 0:
            target_action =  0 # If the pole is angled to the right -> go to the left
        else:
            target_action =  np.random.choice([0, 1]) # Note that this case suggests that x = theta = 0

    # Generate adversarial sample to trick the agent to select target action
    x = -obs[0]
    x_dot = -obs[1]
    theta = -obs[2]
    theta_dot = -obs[3]
    
    # CartPole-v1's state: (x, x_dot, theta, theta_dot)
    state = (x, x_dot, theta, theta_dot)
    
    # Construct return value according to CartPole-v1 syntax
    obs = np.array(state, dtype=np.float32)

    return obs

In [200]:
num_episodes = 100
all_episodes_rewards = []

for i in range(num_episodes):
    episode_rewards = []
    done = False
    obs = env.reset()
    
    while not done:

        ### uniform attack ###
        pertubated_obs = pertubate(env, obs)
        ### uniform attack ###        
        
        action, _states = model.predict(pertubated_obs)
        obs, reward, done, info = env.step(action)
        episode_rewards.append(reward)

    all_episodes_rewards.append(sum(episode_rewards))
    # print(f"Mean reward over evaluation iteration {i}: {np.mean(sum(episode_rewards))}")
    # print(f"Updated mean reward over all iterations: {np.mean(all_episodes_rewards)}")
    # print()

print(f"Finished!\nFinal mean reward: {np.mean(all_episodes_rewards)}\nNum episodes: {num_episodes}")

Finished!
Final mean reward: 8.84
Num episodes: 100
