In [1]:
!pip install mujoco

# Configure MuJoCo to use the EGL rendering backend (requires GPU)
print('Setting environment variable to use GPU rendering:')
%env MUJOCO_GL=egl

# Check if installation was succesful.
try:
  print('Checking that the installation succeeded:')
  import mujoco
  mujoco.MjModel.from_xml_string('<mujoco/>')
except Exception as e:
  raise e from RuntimeError(
      'Something went wrong during installation. Check the shell output above '
      'for more information.\n'
      'If using a hosted Colab runtime, make sure you enable GPU acceleration '
      'by going to the Runtime menu and selecting "Choose runtime type".')

print('Installation successful.')

# Other imports and helper functions
import time
import itertools
import numpy as np

# Graphics and plotting.
print('Installing mediapy:')
!command -v ffmpeg >/dev/null || (apt update && apt install -y ffmpeg)
!pip install -q mediapy
import mediapy as media
import matplotlib.pyplot as plt

# More legible printing from numpy.
np.set_printoptions(precision=3, suppress=True, linewidth=100)

from IPython.display import clear_output
clear_output()


In [2]:
!pip install gymnasium
!pip install stable_baselines3

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting stable_baselines3
  Downloading stable_baselines3-2.3.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.2/182.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13->stable_baselines3)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13->stable_baselines3)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.wh

In [3]:
import gymnasium as gym
from stable_baselines3 import SAC, TD3, A2C, DQN, PPO
import os
import argparse
import sys
from pathlib import Path
import torch
import numpy as np
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [4]:
def numerical_grad_of_reward_wrt_obs(obs, model, env, rollout_len=10, delta=1e-4):
    grad = np.zeros(obs.shape)
    for i in range(obs.shape[0]):
        obs_plus = obs.copy()
        obs_minus = obs.copy()
        obs_plus[i] += delta
        obs_minus[i] -= delta

        action_plus, _ = model.predict(obs_plus)
        base_env = deepcopy(env)
        result = env.step(action_plus)
        obs_new_plus = result[0]
        reward_plus = result[1]
        for j in range(rollout_len - 1):
            action_plus, _ = model.predict(obs_new_plus)
            result = env.step(action_plus)
            obs_new_plus = result[0]
            reward_plus += result[1]
        reward_plus = reward_plus / rollout_len
        env = deepcopy(base_env)

        action_minus, _ = model.predict(obs_minus)
        base_env = deepcopy(env)
        result = env.step(action_minus)
        obs_new_minus = result[0]
        reward_minus = result[1]
        for j in range(rollout_len - 1):
            action_minus, _ = model.predict(obs_new_minus)
            result = env.step(action_minus)
            obs_new_minus = result[0]
            reward_minus += result[1]
        reward_minus = reward_minus / rollout_len
        env = deepcopy(base_env)

        grad[i] = (reward_plus - reward_minus) / (2*delta)

    return grad

In [5]:
def train(env_name, num_epochs, timesteps):
    model = PPO('MlpPolicy', env_name, verbose=1, device='cpu', tensorboard_log=log_dir)

    for i in range(num_epochs):

        model.learn(total_timesteps=timesteps, reset_num_timesteps=False)
        model.save(f"{model_dir}/{'PPO'}_{timesteps * (i+1)}")

    return model

In [None]:
env_name = "HalfCheetah-v4"
model_dir = "models_" + env_name
log_dir = "logs_" + env_name
num_epochs = 20
num_iters = 25000
num_test_steps = 100
index = num_epochs * num_iters
model = train(env_name, num_epochs, num_iters)

In [7]:
def projected_gradient_descent(obs, model, env, v, num_attack_iters=10, rollout_len=10, epsilon=0.05, alpha=3e-7, delta=1e-4):
    original_obs = obs.copy()
    for i in range(num_attack_iters):
        grad = numerical_grad_of_reward_wrt_obs(obs, model, env, rollout_len, delta)
        obs = obs - alpha * grad
        clipped_obs = np.clip(obs, original_obs - v*epsilon, original_obs + v*epsilon)
    return clipped_obs

def fgsm(obs,  model, env, v, rollout_len=10, epsilon=0.05, delta=1e-4):
    signed_grad = np.sign(numerical_grad_of_reward_wrt_obs(obs, model, env, rollout_len, delta))
    scaled_grad = signed_grad * v * epsilon
    obs = obs - scaled_grad
    return obs

In [9]:
def test(model, env_name, index, max_steps, randomized_smoothing=False, lamda=0.001, Sigma=None, num_smoothed_samples=10, adversarial_attack=None,
         adversarial_mode=None, v=None, rollout_len=10, epsilon=0.05, alpha=3e-7, delta=1e-4, num_attack_iters=10):

    env = gym.make(env_name)
    obs = env.reset()[0]
    reward_sum = 0
    num_steps = 0
    model.set_env(env)

    if v is None:
        v = np.ones(obs.shape[0])
    if Sigma is None:
        Sigma = np.eye(obs.shape[0])

    if adversarial_attack == "Start State":
        if adversarial_mode == "PGD":
            adversarial_obs = projected_gradient_descent(obs, model, env, v, num_attack_iters, rollout_len, epsilon, alpha, delta)
        else:
            adversarial_obs = fgsm(obs, model, env, v, rollout_len, epsilon, delta)
        obs = env.reset()[0]
        obs = adversarial_obs.copy()

    terminated = False
    while not terminated:
        obs_predict = obs.copy()

        if adversarial_attack == "Observation Corruption":
            if adversarial_mode == "PGD":
                adversarial_obs = projected_gradient_descent(obs_predict, model, env, v, num_attack_iters, rollout_len, epsilon, alpha, delta)
            else:
                adversarial_obs = fgsm(obs_predict, model, env, v, rollout_len, epsilon, delta)
            obs_predict = env.reset()[0]
            obs_predict = adversarial_obs.copy()

        if randomized_smoothing:
            action_total = 0
            for i in range(num_smoothed_samples):
                noise = np.random.multivariate_normal(np.zeros(obs.shape[0]), lamda*Sigma)
                obs_noised = obs_predict + noise
                action, _ = model.predict(obs_noised)
                action_total += action
            if env_name == "CartPole-v1":
                action = int(round(action_total/num_smoothed_samples,0))
            else:
                action = action_total/num_smoothed_samples
        else:
            action, _ = model.predict(obs_predict)

        obs, reward, done, _, _ = env.step(action)
        if env_name != "CartPole-v1":
            reward_sum += reward
        num_steps += 1
        if num_steps >= max_steps or done:
            terminated = True

    if env_name == "CartPole-v1":
        return num_steps
    else:
        return reward_sum

In [10]:
# Generate Samples to Compute Vector v of Value Ranges at each Dimension
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
env = gym.make(env_name)
obs = env.reset()[0]
action, _ = model.predict(obs)
num_trajectories = 100
iterations_per_trajectory = 2500
recorded_obs = np.zeros((num_trajectories*iterations_per_trajectory, obs.shape[0]))
if env_name == "CartPole-v1":
    recorded_action = np.zeros((num_trajectories*iterations_per_trajectory))
else:
    recorded_action = np.zeros((num_trajectories*iterations_per_trajectory, action.shape[0]))
for i in range(num_trajectories):
    obs = env.reset()[0]
    for j in range(iterations_per_trajectory):
        action, _ = model.predict(obs)
        obs = obs.reshape((1, obs.shape[0]))
        obs, _, done, _, _ = env.step(action)

        recorded_obs[i*iterations_per_trajectory+j] = obs.copy()
        recorded_action[i*iterations_per_trajectory+j] = action.copy()

        if done:
            obs = env.reset()[0]

In [11]:
high_percentile_value = 99
low_percentile_value = 1
percentile_high = np.percentile(recorded_obs, high_percentile_value, axis=0)
percentile_low = np.percentile(recorded_obs, low_percentile_value, axis=0)
v = percentile_high - percentile_low
Sigma = np.cov(recorded_obs.T)

In [32]:
# Test Naive Model
naive_model = PPO('MlpPolicy', env_name, verbose=1, device='cpu', tensorboard_log=log_dir)
result = test(naive_model, env_name, index, num_test_steps)

print("Reward for Naive Model Performance is " + str(result))

Using cpu device
Creating environment from the given name 'HalfCheetah-v4'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Naive Model Performance is -24.167888357498498


In [33]:
# Test Baseline
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test(model, env_name, index, num_test_steps)

print("Reward for Baseline Performance is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Baseline Performance is 57.51997872265183


In [14]:
# Test Adversarial Start State FGSM
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test(model, env_name, index, num_test_steps, adversarial_attack="Start State", adversarial_mode="FGSM", v=v, epsilon=0.3)

print("Reward for Adversarially Attacked Performance Using Start State with FGSM is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Adversarially Attacked Performance Using Start State with FGSM is 40.205875310216854


In [15]:
# Test Adversarial Start State PGD
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test(model, env_name, index, num_test_steps, adversarial_attack="Start State", adversarial_mode="PGD", v=v, epsilon=0.3)

print("Reward for Adversarially Attacked Performance Using Start State with PGD is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Adversarially Attacked Performance Using Start State with PGD is 17.363172384901354


In [16]:
# Test Adversarial Observation Corruption - FGSM
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test(model, env_name, index, num_test_steps, adversarial_attack="Observation Corruption", adversarial_mode="FGSM", v=v, epsilon=0.3)

print("Reward for Adversarially Attacked Performance Using Observation Corruption with FGSM is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Adversarially Attacked Performance Using Observation Corruption with FGSM is -8.19511689431128


In [17]:
# Test Adversarial Observation Corruption - PGD
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test(model, env_name, index, num_test_steps, adversarial_attack="Observation Corruption", adversarial_mode="PGD", v=v, epsilon=0.3)

print("Reward for Adversarially Attacked Performance Using Observation Corruption with PGD is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Adversarially Attacked Performance Using Observation Corruption with PGD is -35.31177856081131


In [36]:
# Test Baseline Smoothed
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test(model, env_name, index, num_test_steps, randomized_smoothing=True, Sigma=Sigma)

print("Reward for Baseline Performance with Smoothing is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Baseline Performance with Smoothing is 68.62891560832252


In [40]:
# Test Adversarial Start State with Smoothing - FGSM
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test(model, env_name, index, num_test_steps, randomized_smoothing=True, Sigma=Sigma, epsilon=0.3, adversarial_attack="Start State", adversarial_mode="FGSM", v=v)

print("Reward for Adversarially Attacked and Smoothed Performance Using Start State with FGSM is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Adversarially Attacked and Smoothed Performance Using Start State with FGSM is 57.844125105432376


In [41]:
# Test Adversarial Start State with Smoothing - PGD
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test(model, env_name, index, num_test_steps, randomized_smoothing=True, Sigma=Sigma, epsilon=0.3, adversarial_attack="Start State", adversarial_mode="PGD", v=v)

print("Reward for Adversarially Attacked and Smoothed Performance Using Start State with PGD is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Adversarially Attacked and Smoothed Performance Using Start State with PGD is 59.348110201208314


In [21]:
# Test Adversarial Observation Corruption with Smoothing - FGSM
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test(model, env_name, index, num_test_steps, randomized_smoothing=True, Sigma=Sigma, epsilon=0.3, adversarial_attack="Observation Corruption", adversarial_mode="FGSM", v=v)

print("Reward for Adversarially Attacked and Smoothed Performance Using Observation Corruption with FGSM is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Adversarially Attacked and Smoothed Performance Using Observation Corruption with FGSM is -1.4928626761952328


In [None]:
# Test Adversarial Observation Corruption with Smoothing - PGD
model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test(model, env_name, index, num_test_steps, randomized_smoothing=True, Sigma=Sigma, epsilon=0.3, adversarial_attack="Observation Corruption", adversarial_mode="PGD", v=v)

print("Reward for Adversarially Attacked and Smoothed Performance Using Observation Corruption with PGD is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-33.511187574594686
Average Reward for Adversarially Attacked and Smoothed Performance Using Observation Corruption with PGD is -33.511187574594686


In [26]:
if env_name == "HalfCheetah-v4":
    observation_dimension = 17
    action_dimension = 6
else:
    observation_dimension = 376
    action_dimension = 17

hidden_size = 64

class CloneModel(nn.Module):
    def __init__(self, observation_dimension, hidden_size, action_dimension):
        super(CloneModel, self).__init__()
        self.fc1 = nn.Linear(observation_dimension, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_dimension)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize neural network
clone_model = CloneModel(observation_dimension, hidden_size, action_dimension)

# Define Mean Squared Error (MSE) loss
criterion = nn.MSELoss()

# Define optimizer (e.g., stochastic gradient descent)
optimizer = optim.Adam(clone_model.parameters(), lr=1e-3)

# Portion of Data Used As Training (Remaining is Used as Test)
training_portion = 0.9
num_samples = recorded_obs.shape[0]
num_training_samples = int(num_samples*training_portion)
num_test_samples = num_samples - num_training_samples

# Generate some dummy data for training
x_train = torch.tensor(recorded_obs[:num_training_samples]).float()
y_train = torch.tensor(recorded_action[:num_training_samples]).float()
x_test = torch.tensor(recorded_obs[num_training_samples:]).float()
y_test = torch.tensor(recorded_action[num_training_samples:]).float()

# Create a TensorDataset
train_dataset = TensorDataset(x_train, y_train)
test_dataset = TensorDataset(x_test, y_test)

# Create a DataLoader for minibatch training
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Train the neural network
num_epochs_clone = 100
for epoch in range(num_epochs_clone):
    running_train_loss = 0.0
    for inputs, targets in train_loader:
        # Forward pass
        outputs = clone_model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item() * inputs.size(0)

    # Print progress
    train_loss = running_train_loss / len(train_dataset)
    print(f'Epoch [{epoch+1}/{num_epochs_clone}], Train Loss: {train_loss:.4f}')

    running_test_loss = 0.0
    for inputs, targets in test_loader:
        outputs = clone_model(inputs)
        loss = criterion(outputs, targets)

        running_test_loss += loss.item() * inputs.size(0)

    test_loss = running_test_loss / len(test_dataset)
    print(f'Epoch [{epoch+1}/{num_epochs_clone}], Test Loss: {test_loss:.4f}')

print("Training complete!")

Epoch [1/100], Train Loss: 0.0417
Epoch [1/100], Test Loss: 0.0276
Epoch [2/100], Train Loss: 0.0259
Epoch [2/100], Test Loss: 0.0251
Epoch [3/100], Train Loss: 0.0240
Epoch [3/100], Test Loss: 0.0234
Epoch [4/100], Train Loss: 0.0229
Epoch [4/100], Test Loss: 0.0228
Epoch [5/100], Train Loss: 0.0223
Epoch [5/100], Test Loss: 0.0220
Epoch [6/100], Train Loss: 0.0218
Epoch [6/100], Test Loss: 0.0213
Epoch [7/100], Train Loss: 0.0213
Epoch [7/100], Test Loss: 0.0207
Epoch [8/100], Train Loss: 0.0210
Epoch [8/100], Test Loss: 0.0207
Epoch [9/100], Train Loss: 0.0208
Epoch [9/100], Test Loss: 0.0206
Epoch [10/100], Train Loss: 0.0206
Epoch [10/100], Test Loss: 0.0201
Epoch [11/100], Train Loss: 0.0204
Epoch [11/100], Test Loss: 0.0199
Epoch [12/100], Train Loss: 0.0203
Epoch [12/100], Test Loss: 0.0202
Epoch [13/100], Train Loss: 0.0201
Epoch [13/100], Test Loss: 0.0199
Epoch [14/100], Train Loss: 0.0200
Epoch [14/100], Test Loss: 0.0203
Epoch [15/100], Train Loss: 0.0199
Epoch [15/100], T

In [22]:
def numerical_grad_of_reward_wrt_obs_black_box(obs, model, env, rollout_len=10, delta=1e-4):
    grad = np.zeros(obs.shape)
    for i in range(obs.shape[0]):
        obs_plus = obs.copy()
        obs_minus = obs.copy()
        obs_plus[i] += delta
        obs_minus[i] -= delta

        action_plus = model(torch.tensor(obs_plus).float()).detach().numpy()
        base_env = deepcopy(env)
        result = env.step(action_plus)
        obs_new_plus = result[0]
        reward_plus = result[1]
        for j in range(rollout_len - 1):
            action_plus = model(torch.tensor(obs_new_plus).float()).detach().numpy()
            result = env.step(action_plus)
            obs_new_plus = result[0]
            reward_plus += result[1]
        reward_plus = reward_plus / rollout_len
        env = deepcopy(base_env)

        action_minus = model(torch.tensor(obs_minus).float()).detach().numpy()
        base_env = deepcopy(env)
        result = env.step(action_minus)
        obs_new_minus = result[0]
        reward_minus = result[1]
        for j in range(rollout_len - 1):
            action_minus = model(torch.tensor(obs_new_minus).float()).detach().numpy()
            result = env.step(action_minus)
            obs_new_minus = result[0]
            reward_minus += result[1]
        reward_minus = reward_minus / rollout_len
        env = deepcopy(base_env)

        grad[i] = (reward_plus - reward_minus) / (2*delta)

    return grad

In [23]:
def projected_gradient_descent_black_box(obs, clone_model, env, v, num_attack_iters=10, rollout_len=10, epsilon=0.05, alpha=3e-7, delta=1e-4):
    original_obs = obs.copy()
    for i in range(num_attack_iters):
        grad = numerical_grad_of_reward_wrt_obs_black_box(obs, clone_model, env, rollout_len, delta)
        obs = obs - alpha * grad
        clipped_obs = np.clip(obs, original_obs - v*epsilon, original_obs + v*epsilon)
    return clipped_obs

def fgsm_black_box(obs, clone_model, env, v, rollout_len=10, epsilon=0.05, delta=1e-4):
    signed_grad = np.sign(numerical_grad_of_reward_wrt_obs_black_box(obs, clone_model, env, rollout_len, delta))
    scaled_grad = signed_grad * v * epsilon
    obs = obs - scaled_grad
    return obs

In [24]:
def test_black_box(clone_model, black_box_model, env_name, index, max_steps, randomized_smoothing=False, lamda=0.001, Sigma=None, num_smoothed_samples=10, adversarial_attack=None,
         adversarial_mode=None, v=None, rollout_len=10, epsilon=0.05, alpha=3e-7, delta=1e-4, num_attack_iters=10):

    env = gym.make(env_name)
    obs = env.reset()[0]
    reward_sum = 0
    num_steps = 0
    black_box_model.set_env(env)

    if v is None:
        v = np.ones(obs.shape[0])
    if Sigma is None:
        Sigma = np.eye(obs.shape[0])

    if adversarial_attack == "Start State":
        if adversarial_mode == "PGD":
            adversarial_obs = projected_gradient_descent_black_box(obs, clone_model, env, v, num_attack_iters, rollout_len, epsilon, alpha, delta)
        else:
            adversarial_obs = fgsm_black_box(obs, clone_model, env, v, rollout_len, epsilon, delta)
        obs = env.reset()[0]
        obs = adversarial_obs.copy()

    terminated = False
    while not terminated:
        obs_predict = obs.copy()
        action, _ = black_box_model.predict(obs_predict)

        obs, reward, done, _, _ = env.step(action)
        reward_sum += reward
        num_steps += 1
        if num_steps >= max_steps or done:
            terminated = True

    return reward_sum

In [46]:
# Test Black Box Adversarial Start State FGSM
black_box_model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test_black_box(clone_model, black_box_model, env_name, index, num_test_steps, adversarial_attack="Start State", adversarial_mode="FGSM", v=v, epsilon=0.3)

print("Reward for Black Box Adversarially Attacked Performance Using Start State with FGSM is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Black Box Adversarially Attacked Performance Using Start State with FGSM is 48.29666714369076


In [47]:
# Test Black Box Adversarial Start State PGD
black_box_model = PPO.load(f"{model_dir}/{'PPO'}_{num_epochs*num_iters}")
result = test_black_box(clone_model, black_box_model, env_name, index, num_test_steps, adversarial_attack="Start State", adversarial_mode="PGD", v=v, epsilon=0.3)

print("Reward for Black Box Adversarially Attacked Performance Using Start State with PGD is " + str(result))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Reward for Black Box Adversarially Attacked Performance Using Start State with PGD is 38.48371348941361
