In [1]:
import numpy as np
import gym




In [2]:
from stable_baselines3 import PPO

In [3]:
# Load model
save_dir = "./models/ppo"
model = PPO.load(save_dir + f"/cartpole-v1/cartpole_v1_ppo_100k")

In [4]:
# Set environment
env = gym.make('CartPole-v1')

In [5]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [6]:
# Evaluate model - for info on why the env is wrapped with Monitor check the evaluate_policy function
mean_reward, std_reward = evaluate_policy(model, Monitor(env), n_eval_episodes=100)
print(f"mean_reward : {mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward : 500.00 +/- 0.00


In [48]:
# Uniform attack on CartPole-v1
def perturbate(env, obs, perturbation_weight):
    """Perturbate the observation to hinder the agent."""

    # Threshholds
    # 2.4 for x
    # 12 * 2 * math.pi / 360 for theta
    x_threshold = env.x_threshold
    theta_threshold = env.theta_threshold_radians
    
    # Get the cart's x position and pole's angle 
    x = obs[0]
    theta = obs[2]

    # Calculate which variable is closer to termination condition
    x_danger = np.absolute((x_threshold - np.absolute(x)) / x_threshold)
    theta_danger = np.absolute((theta_threshold / 360 - np.absolute(theta)) / theta_threshold)
    
    # print(f"x: {x}, danger: {x_danger}")
    # print(f"theta: {theta}, danger: {theta_danger}")

    # Actions are encoded as 
    # '0' -> push cart to the left
    # '1' -> push card to the right

    # Calculate the target action to minimize reward
    # The following favors theta over x in case of equal danger
    if x_danger > theta_danger:
        if x < 0:
            # If x is already to the left -> target action: move left
            # Therefore pretend x is more to the right and pole is angled more to the left
            x_perturbation = 1 * perturbation_weight[0]
            theta_perturbation = -1 * perturbation_weight[1]
        else:
            x_perturbation = -1 * perturbation_weight[0]
            theta_perturbation = 1 * perturbation_weight[1]
            
    else:
        if theta < 0:
            # Pole is angled to the left -> target action: move right
            # Therefore pretend x is more to the left and pole is angled more to the right
            x_perturbation = -1 * perturbation_weight[0]
            theta_perturbation = 1 * perturbation_weight[1]
        else:
            x_perturbation = 1 * perturbation_weight[0]
            theta_perturbation = -1 * perturbation_weight[1]

    # Generate adversarial sample to trick the agent to select target action
    x = obs[0] + x_perturbation
    x_dot = obs[1]
    theta = obs[2] + theta_perturbation
    theta_dot = obs[3]
    
    # CartPole-v1's state: (x, x_dot, theta, theta_dot)
    state = (x, x_dot, theta, theta_dot)
    
    # Construct return value according to CartPole-v1 syntax
    adversarial_sample = np.array(state, dtype=np.float32)

    total_perturbation = np.absolute(obs - adversarial_sample)

    return adversarial_sample, total_perturbation

In [77]:
num_episodes = 100
all_episodes_rewards = []
all_episodes_perturbation = []

for i in range(num_episodes):
    episode_rewards = []
    episode_perturbations = []
    done = False
    obs = env.reset()
    
    while not done:

        ### uniform attack ###
        adversarial_sample, perturbation = perturbate(env, obs, [.01, 0.75])
        ### uniform attack ###        
        
        action, _states = model.predict(adversarial_sample)
        obs, reward, done, info = env.step(action)
        episode_rewards.append(reward)
        episode_perturbations.append(perturbation)

    all_episodes_rewards.append(sum(episode_rewards))
    all_episodes_perturbation.append(sum(episode_perturbations))
    # print(f"Mean reward over evaluation iteration {i}: {np.mean(sum(episode_rewards))}")
    # print(f"Updated mean reward over all iterations: {np.mean(all_episodes_rewards)}")
    # print()

print(f"Finished!")
print(f"Final mean reward: {np.mean(all_episodes_rewards):.2f}")
print(f"Mean perturbation: {np.mean(all_episodes_perturbation):.2f}")
print(f"Num episodes: {num_episodes}")

Finished!
Final mean reward: 9.42
Mean perturbation: 1.79
Num episodes: 100
