In [22]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be accessed through regular paths from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative paths used later (e.g. `baselines`, `log/...`) are resolved from here.
%cd /content/drive/My\ Drive/adversarial_attacks_DRL

/content/drive/My Drive/adversarial_attacks_DRL


In [None]:
%cd baselines
!pip install .
% cd ..

In [None]:
# Install Tianshou from the tip of its GitHub `master` branch and Advertorch
# from the default package index.
# NOTE(review): neither install is pinned, so a fresh run may pick up versions
# that differ from the ones the saved outputs were produced with — consider
# pinning a commit / version.
!pip install git+https://github.com/thu-ml/tianshou.git@master
!pip install advertorch

# Part 0
Create a **Pong** environment and import the required libraries

In [24]:
from advertorch.attacks import *
from atari_wrapper import wrap_deepmind
import copy
import torch
from drl_attacks.uniform_attack import uniform_attack_collector
from utils import A2CPPONetAdapter

In [25]:
def make_atari_env_watch(env_name):
    """Build the evaluation ("watch") version of an Atari environment.

    The environment is wrapped with `wrap_deepmind` using 4 stacked frames,
    with episodic-life handling and reward clipping both switched off.

    Args:
        env_name: identifier of the Atari environment to wrap.

    Returns:
        Whatever `wrap_deepmind` returns for these settings.
    """
    watch_settings = {
        "frame_stack": 4,
        "episode_life": False,
        "clip_rewards": False,
    }
    return wrap_deepmind(env_name, **watch_settings)

# Instantiate the Pong Atari environment used throughout the notebook
env = make_atari_env_watch("PongNoFrameskip-v4")

# Record the observation / action space sizes: use `.shape` when it is
# non-empty, otherwise fall back to `.n`
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.env.action_space.shape or env.env.action_space.n

# Run on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Part 1
Attack **Pong-PPO** policy with **Uniform Attack** with 3 different attack frequencies: 0, 0.5, 1.

In [26]:
# load pretrained Pong-PPO policy (the second element stored in the checkpoint is discarded)
# NOTE(review): torch.load unpickles arbitrary objects; only load checkpoints from a trusted source.
ppo_pong_path = "log/PongNoFrameskip-v4/ppo/policy.pth"
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime
ppo_policy, _ = torch.load(ppo_pong_path, map_location=device)
ppo_policy.to(device).init(device)

# adapt a copy of the PPO policy to the Advertorch library; the deep copy keeps
# the adapter separate from the policy object handed to the collector
ppo_adv_net = A2CPPONetAdapter(copy.deepcopy(ppo_policy)).to(device)
ppo_adv_net.eval()

# define image adversarial attack; eps is scaled by 255 to match the
# [clip_min, clip_max] = [0, 255] range given to the attack
eps = 0.1
obs_adv_atk = GradientSignAttack(ppo_adv_net, eps=eps*255,
                                 clip_min=0, clip_max=255, targeted=False)

# define RL adversarial attack on the PPO policy loaded above; `ppo_policy` is
# passed explicitly so the cell does not depend on a `policy` variable left
# over from an earlier kernel session
collector = uniform_attack_collector(ppo_policy, env, obs_adv_atk,
                                     perfect_attack=False,
                                     atk_frequency=0.5,
                                     device=device)

In [27]:
# Uniform attack, attack frequency 0.5: collect 10 episodes with the collector
collector.atk_frequency = 0.5
test_adversarial_policy = collector.collect(n_episode=10)

# Pull the reported statistics out of the result returned by collect()
avg_atk_rate, avg_rew, avg_num_atks, avg_succ_atks_rate = (
    test_adversarial_policy[stat_key]
    for stat_key in ('atk_rate(%)', 'rew', 'n_atks', 'succ_atks(%)')
)

# Show the attack statistics first, then the reward
print("attack frequency (%) =", avg_atk_rate)
print("number of attacks =", avg_num_atks)
print("number of successful attacks (%) =", avg_succ_atks_rate)
print("reward =", avg_rew)

attack frequency (%) = 0.4958366693354684
number of attacks = 619.3
number of successful attacks (%) = 1.0
reward = -18.1


In [None]:
# Uniform attack, attack frequency 1: collect 10 episodes with the collector
collector.atk_frequency = 1.
test_adversarial_policy = collector.collect(n_episode=10)

# Pull the reported statistics out of the result returned by collect()
avg_atk_rate, avg_rew, avg_num_atks, avg_succ_atks_rate = (
    test_adversarial_policy[stat_key]
    for stat_key in ('atk_rate(%)', 'rew', 'n_atks', 'succ_atks(%)')
)

# Show the attack statistics first, then the reward
print("attack frequency (%) =", avg_atk_rate)
print("number of attacks =", avg_num_atks)
print("number of successful attacks (%) =", avg_succ_atks_rate)
print("reward =", avg_rew)

attack frequency (%) = 1.0
number of attacks = 855.3
number of successful attacks (%) = 1.0
reward = -20.9


In [None]:
# Baseline run, attack frequency 0 (no attack is performed): collect 10 episodes
collector.atk_frequency = 0.
test_adversarial_policy = collector.collect(n_episode=10)

# Pull the reported statistics out of the result returned by collect()
avg_atk_rate, avg_rew, avg_num_atks, avg_succ_atks_rate = (
    test_adversarial_policy[stat_key]
    for stat_key in ('atk_rate(%)', 'rew', 'n_atks', 'succ_atks(%)')
)

# Show the attack statistics first, then the reward
print("attack frequency (%) =", avg_atk_rate)
print("number of attacks =", avg_num_atks)
print("number of successful attacks (%) =", avg_succ_atks_rate)
print("reward =", avg_rew)

attack frequency (%) = 0.0
number of attacks = 0.0
number of successful attacks (%) = 0
reward = 20.8


# Part 2
Attack the **Pong-PPO** policy with **Uniform Attack** at an attack frequency of 0.5. This time, suppose we don't know that the agent's policy is PPO, so the adversarial observations are crafted on an **A2C** policy trained on the same environment.

In [28]:
# load pretrained Pong-A2C policy (the second element stored in the checkpoint is discarded)
# NOTE(review): torch.load unpickles arbitrary objects; only load checkpoints from a trusted source.
a2c_pong_path = "log/PongNoFrameskip-v4/a2c/policy.pth"
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime
a2c_policy, _ = torch.load(a2c_pong_path, map_location=device)
a2c_policy.to(device).init(device)

# adapt a copy of the A2C policy to the Advertorch library
a2c_adv_net = A2CPPONetAdapter(copy.deepcopy(a2c_policy)).to(device)
a2c_adv_net.eval()

# define image adversarial attack, crafted on the A2C network; eps is scaled by
# 255 to match the [clip_min, clip_max] = [0, 255] range given to the attack
eps = 0.1
obs_adv_atk = GradientSignAttack(a2c_adv_net, eps=eps*255,
                                 clip_min=0, clip_max=255, targeted=False)

# define RL adversarial attack: the attacked agent is still the PPO policy
# (`ppo_policy`, loaded in Part 1), while the perturbations come from the A2C
# network above. `ppo_policy` is passed explicitly so the cell does not depend
# on a `policy` variable left over from an earlier kernel session
collector = uniform_attack_collector(ppo_policy, env, obs_adv_atk,
                                     perfect_attack=False,
                                     atk_frequency=0.5,
                                     device=device)

In [20]:
# Uniform attack, attack frequency 0.5: collect 10 episodes with the collector
collector.atk_frequency = 0.5
test_adversarial_policy = collector.collect(n_episode=10)

# Pull the reported statistics out of the result returned by collect()
avg_atk_rate, avg_rew, avg_num_atks, avg_succ_atks_rate = (
    test_adversarial_policy[stat_key]
    for stat_key in ('atk_rate(%)', 'rew', 'n_atks', 'succ_atks(%)')
)

# Show the attack statistics first, then the reward
print("attack frequency (%) =", avg_atk_rate)
print("number of attacks =", avg_num_atks)
print("number of successful attacks (%) =", avg_succ_atks_rate)
print("reward =", avg_rew)

attack frequency (%) = 0.5018479033404406
number of attacks = 706.1
number of successful attacks (%) = 0.7777935136666194
reward = -17.1
