In [None]:
import sys
import logging
from typing import cast
import gym
import numpy as np
import matplotlib.pyplot as plt  # type:ignore
from cyberbattle.agents.baseline.learner import TrainedLearner
import cyberbattle.agents.baseline.plotting as p
import cyberbattle.agents.baseline.agent_wrapper as w
import cyberbattle.agents.baseline.agent_tabularqlearning as a
from cyberbattle.agents.baseline.agent_wrapper import Verbosity
import cyberbattle.agents.baseline.learner as learner
import importlib
import cyberbattle.agents.baseline.agent_dql as dqla
import cyberbattle.agents.baseline.agent_randomcredlookup as rca
import cyberbattle.agents.baseline.agent_ppo as ppo
from cyberbattle._env.defender import ScanAndReimageCompromisedMachines
from cyberbattle._env.cyberbattle_env import AttackerGoal, DefenderConstraint
from typing import cast
from cyberbattle._env.cyberbattle_env import CyberBattleEnv
from cyberbattle._env.cyberbattle_toyctf import CyberBattleToyCtf
from stable_baselines3.a2c.a2c import A2C
from stable_baselines3.ppo.ppo import PPO
from cyberbattle._env.flatten_wrapper import FlattenObservationWrapper, FlattenActionWrapper
import os
from stable_baselines3 import PPO
import cyberbattle.agents.baseline.agent_tabularqlearning as tqa
import cyberbattle.agents.baseline.agent_dql as dqla
import cyberbattle.agents.baseline.agent_ddql as ddqla
import cyberbattle.agents.baseline.agent_dueling_dql as duelingdqla
import cyberbattle.agents.baseline.agent_dueling_ddql as dueling_ddqla
import random
random.seed(120394016)
%matplotlib inline
logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format="%(levelname)s: %(message)s")

In [None]:
# ---- Benchmark parameters -------------------------------------------------
gymid = "CyberBattleChain-v0"

# Run lengths shared by every agent cell below.
iteration_count = 5000
training_episode_count = 50
eval_episode_count = 10

# Upper bounds handed to EnvironmentBounds further down.
maximum_node_count = 22
maximum_total_credentials = 22

# Passed to gym.make as `size`; a falsy value selects the bare gym.make(gymid) call.
# env_size = 20
env_size = 12

# ---- Gym environment ------------------------------------------------------
if not env_size:
    gym_env = gym.make(gymid)
else:
    gym_env = gym.make(
        gymid,
        size=env_size,
        attacker_goal=AttackerGoal(own_atleast=0, own_atleast_percent=1.0),
        defender_constraint=DefenderConstraint(maintain_sla=0.80),
        defender_agent=ScanAndReimageCompromisedMachines(
            probability=0.6,
            scan_capacity=2,
            scan_frequency=5,
        ),
    )

# Bounds object reused (as `ep`) by every learner and plot in this notebook.
ep = w.EnvironmentBounds.of_identifiers(
    identifiers=gym_env.identifiers,
    maximum_node_count=maximum_node_count,
    maximum_total_credentials=maximum_total_credentials,
)



In [None]:
# Ad-hoc sanity checks on the environment and on the feature encoders.
# NOTE: bare expressions nested under `if` are not echoed by IPython (only a
# top-level trailing expression is); they are kept because the calls among them
# (sample / reset / step) still execute.
debugging = True
if debugging:
    print(f"port_count = {ep.port_count}, property_count = {ep.property_count}")

    gym_env.environment
    # training_env.environment.plot_environment_graph()
    gym_env.environment.network.nodes
    gym_env.action_space
    gym_env.action_space.sample()
    gym_env.observation_space.sample()
    o0 = gym_env.reset()
    # Take one valid step, then reset again so `o0` is a fresh initial observation.
    o_test, r, d, i = gym_env.step(gym_env.sample_valid_action())
    o0 = gym_env.reset()

    o0.keys()

    fe_example = w.RavelEncoding(ep, [w.Feature_active_node_properties(ep), w.Feature_discovered_node_count(ep)])
    # Deliberately not named `a`: that name is the import alias of
    # cyberbattle.agents.baseline.agent_tabularqlearning and must not be rebound.
    augmented_state = w.StateAugmentation(o0)
    w.Feature_discovered_ports(ep).get(augmented_state, None)
    fe_example.encode_at(augmented_state, 0)

In [None]:
random.seed(120394016)
%matplotlib inline
# Evaluate the Deep Q-learning agent
dql_run = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    learner=dqla.DeepQLearnerPolicy(
        ep=ep,
        gamma=0.015,
        replay_memory_size=10000,
        target_update=10,
        batch_size=512,
        # torch default learning rate is 1e-2
        # a large value helps converge in less episodes
        learning_rate=0.01
    ),
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    epsilon=0.90,
    epsilon_exponential_decay=5000,
    epsilon_minimum=0.10,
    verbosity=Verbosity.Quiet,
    render=True,
    plot_episodes_length=True,
    title="DQL"
)

In [None]:
random.seed(120394016)
%matplotlib inline
dql_exploit_run = learner.epsilon_greedy_search(
    gym_env,
    ep,
    learner=dql_run['learner'],
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    epsilon=0.0,
    epsilon_minimum=0.00,
    render=True,
    plot_episodes_length=True,
    verbosity=Verbosity.Quiet,
    title="Exploiting DQL"
)

In [None]:
# Evaluate the Double Deep Q-learning agent
ddql_run = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    learner=ddqla.DeepQLearnerPolicy(
        ep=ep,
        gamma=0.015,
        replay_memory_size=10000,
        target_update=10,
        batch_size=512,
        # torch default learning rate is 1e-2
        # a large value helps converge in less episodes
        learning_rate=0.01
    ),
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    epsilon=0.90,
    epsilon_exponential_decay=5000,
    epsilon_minimum=0.10,
    verbosity=Verbosity.Quiet,
    render=True,
    plot_episodes_length=True,
    title="DDQL"
)

In [None]:
random.seed(120394016)
%matplotlib inline
# Evaluate an agent that exploits the Double Q-function learnt above
ddql_exploit_run = learner.epsilon_greedy_search(
    gym_env,
    ep,
    learner=ddql_run['learner'],
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    epsilon=0.0,
    epsilon_minimum=0.00,
    render=True,
    plot_episodes_length=True,
    verbosity=Verbosity.Quiet,
    title="Exploiting DDQL"
)

In [None]:
random.seed(120394016)
%matplotlib inline
# Evaluate the Dueling Deep Q-learning agent
dueling_dql_run = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    learner=duelingdqla.DeepQLearnerPolicy(
        ep=ep,
        gamma=0.015,
        replay_memory_size=10000,
        target_update=10,
        batch_size=512,
        # torch default learning rate is 1e-2
        # a large value helps converge in less episodes
        learning_rate=0.01
    ),
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    epsilon=0.90,
    epsilon_exponential_decay=5000,
    epsilon_minimum=0.10,
    verbosity=Verbosity.Quiet,
    render=True,
    plot_episodes_length=True,
    title="Dueling DQL"
)

In [None]:
random.seed(120394016)
%matplotlib inline
# Evaluate an agent that exploits the Dueling Q-function learnt above
dueling_dql_exploit_run = learner.epsilon_greedy_search(
    gym_env,
    ep,
    learner=dueling_dql_run['learner'],
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    epsilon=0.0,
    epsilon_minimum=0.00,
    render=True,
    plot_episodes_length=True,
    verbosity=Verbosity.Quiet,
    title="Exploiting Dueling DQL"
)


In [None]:
random.seed(120394016)
%matplotlib inline
# Evaluate the Dueling Double Deep Q-learning agent
dueling_ddql_run = learner.epsilon_greedy_search(
    cyberbattle_gym_env=gym_env,
    environment_properties=ep,
    learner=dueling_ddqla.DeepQLearnerPolicy(
        ep=ep,
        gamma=0.015,
        replay_memory_size=10000,
        target_update=10,
        batch_size=512,
        # torch default learning rate is 1e-2
        # a large value helps converge in less episodes
        learning_rate=0.01
    ),
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    epsilon=0.90,
    epsilon_exponential_decay=5000,
    epsilon_minimum=0.10,
    verbosity=Verbosity.Quiet,
    render=True,
    plot_episodes_length=True,
    title="Dueling DDQL"
)

In [None]:
random.seed(120394016)
%matplotlib inline
# Evaluate an agent that exploits the  the Dueling Double Q-function learnt above
dueling_ddql_exploit_run = learner.epsilon_greedy_search(
    gym_env,
    ep,
    learner=dueling_ddql_run['learner'],
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    epsilon=0.0,
    epsilon_minimum=0.00,
    render=True,
    plot_episodes_length=True,
    verbosity=Verbosity.Quiet,
    title="Exploiting Dueling DDQL"
)

In [None]:
random.seed(120394016)
%matplotlib inline
# Evaluate the random agent
random_run = learner.epsilon_greedy_search(
    gym_env,
    ep,
    learner=learner.RandomPolicy(),
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    epsilon=1.0,  # purely random
    render=True,
    verbosity=Verbosity.Quiet,
    plot_episodes_length=True,
    title="Random search"
)

In [None]:
random.seed(120394016)
%matplotlib inline
# Evaluate a random agent that opportunistically exploits
# credentials gathere in its local cache
credlookup_run = learner.epsilon_greedy_search(
    gym_env,
    ep,
    learner=rca.CredentialCacheExploiter(),
    episode_count=10,
    iteration_count=iteration_count,
    epsilon=0.90,
    render=True,
    epsilon_exponential_decay=10000,
    epsilon_minimum=0.10,
    verbosity=Verbosity.Quiet,
    title="Credential lookups (ϵ-greedy)"
)

In [None]:
random.seed(120394016)
%matplotlib inline
# Evaluate a Tabular Q-learning agent
tabularq_run = learner.epsilon_greedy_search(
    gym_env,
    ep,
    learner=tqa.QTabularLearner(
        ep,
        gamma=0.015, learning_rate=0.01, exploit_percentile=100),
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    epsilon=0.90,
    epsilon_exponential_decay=5000,
    epsilon_minimum=0.01,
    verbosity=Verbosity.Quiet,
    render=True,
    plot_episodes_length=True,
    title="Tabular Q-learning"
)

In [None]:
random.seed(120394016)
%matplotlib inline
# Evaluate an agent that exploits the Q-table learnt above
tabularq_exploit_run = learner.epsilon_greedy_search(
    gym_env,
    ep,
    learner=tqa.QTabularLearner(
        ep,
        trained=tabularq_run['learner'],
        gamma=0.0,
        learning_rate=0.0,
        exploit_percentile=90),
    episode_count=eval_episode_count,
    iteration_count=iteration_count,
    epsilon=0.0,
    render=True,
    verbosity=Verbosity.Quiet,
    title="Exploiting Q-matrix"
)


In [None]:
# Runs to compare in the next plot: the random baseline against DQL and its
# exploit run.
# Left out here: credlookup_run, tabularq_run, tabularq_exploit_run,
#                ddql_run, ddql_exploit_run
all_runs = [random_run, dql_run, dql_exploit_run]

In [None]:
# Plot averaged cumulative rewards for DQL vs Random vs DQL-Exploit
themodel = dqla.CyberBattleStateActionModel(ep)
%matplotlib inline
p.plot_averaged_cummulative_rewards(
     all_runs=all_runs,
     title=f'Benchmark -- max_nodes={ep.maximum_node_count}, episodes={eval_episode_count},\n'
     f'State: {[f.name() for f in themodel.state_space.feature_selection]} '
     f'({len(themodel.state_space.feature_selection)}\n'
     f"Action: abstract_action ({themodel.action_space.flat_size()})")

In [None]:

 # Compare and plot results for all the agents
all_runs = [
    random_run,
    # credlookup_run,
    # tabularq_run,
    # tabularq_exploit_run,
    dql_run,
    dql_exploit_run,
    ddql_run,
    ddql_exploit_run
]

# Plot averaged cumulative rewards for DQL vs Random vs DQL-Exploit vs DDQL vs DDQL-Exploit
# `themodel` is built only to read the feature and action-space descriptions
# that go into the figure title.
themodel = dqla.CyberBattleStateActionModel(ep)
# This call must start at column 0: a stray leading space on a top-level
# statement raises "IndentationError: unexpected indent".
p.plot_averaged_cummulative_rewards(
    all_runs=all_runs,
    title=f'Benchmark -- max_nodes={ep.maximum_node_count}, episodes={eval_episode_count},\n'
          f'State: {[f.name() for f in themodel.state_space.feature_selection]} '
          # the feature count is parenthesised; the closing ")" was missing
          f'({len(themodel.state_space.feature_selection)})\n'
          f"Action: abstract_action ({themodel.action_space.flat_size()})")

In [None]:
# Compare and plot results for all the agents
all_runs = [
    random_run,
    # credlookup_run,
    # tabularq_run,
    # tabularq_exploit_run,
    dql_run,
    dql_exploit_run,
    ddql_run,
    ddql_exploit_run,
    dueling_dql_run,
    dueling_dql_exploit_run
]

# Plot averaged cumulative rewards for DQL vs Random vs DQL-Exploit vs DDQL vs DDQL-Exploit vs Dueling DQL vs Dueling DQL-Exploit
# `themodel` is built only to read the feature and action-space descriptions
# that go into the figure title.
themodel = dqla.CyberBattleStateActionModel(ep)
p.plot_averaged_cummulative_rewards(
    all_runs=all_runs,
    title=f'Benchmark -- max_nodes={ep.maximum_node_count}, episodes={eval_episode_count},\n'
          f'State: {[f.name() for f in themodel.state_space.feature_selection]} '
          # the feature count is parenthesised; the closing ")" was missing
          f'({len(themodel.state_space.feature_selection)})\n'
          f"Action: abstract_action ({themodel.action_space.flat_size()})")

In [None]:
# Compare and plot results for all the agents
all_runs = [
    random_run,
    # credlookup_run,
    # tabularq_run,
    # tabularq_exploit_run,
    dql_run,
    dql_exploit_run,
    ddql_run,
    ddql_exploit_run,
    # dueling_dql_run,
    # dueling_dql_exploit_run,
    dueling_ddql_run,
    dueling_ddql_exploit_run
]

# Plot averaged cumulative rewards for DQL vs Random vs DQL-Exploit vs DDQL vs DDQL-Exploit vs Dueling DDQL vs Dueling DDQL-Exploit
# `themodel` is built only to read the feature and action-space descriptions
# that go into the figure title.
themodel = dqla.CyberBattleStateActionModel(ep)
p.plot_averaged_cummulative_rewards(
    all_runs=all_runs,
    title=f'Benchmark -- max_nodes={ep.maximum_node_count}, episodes={eval_episode_count},\n'
          f'State: {[f.name() for f in themodel.state_space.feature_selection]} '
          # the feature count is parenthesised; the closing ")" was missing
          f'({len(themodel.state_space.feature_selection)})\n'
          f"Action: abstract_action ({themodel.action_space.flat_size()})")


In [None]:
# Compare and plot results for all the agents
all_runs = [
    random_run,
    credlookup_run,
    tabularq_run,
    tabularq_exploit_run,
    dql_run,
    dql_exploit_run,
    ddql_run,
    ddql_exploit_run,
    dueling_dql_run,
    dueling_dql_exploit_run,
    dueling_ddql_run,
    dueling_ddql_exploit_run
]

# Plot averaged cumulative rewards for every run above: Random, credential
# lookup, tabular Q (+exploit), DQL, DDQL, Dueling DQL and Dueling DDQL (each +exploit).
# `themodel` is built only to read the feature and action-space descriptions
# that go into the figure title.
themodel = dqla.CyberBattleStateActionModel(ep)
p.plot_averaged_cummulative_rewards(
    all_runs=all_runs,
    title=f'Benchmark -- max_nodes={ep.maximum_node_count}, episodes={eval_episode_count},\n'
          f'State: {[f.name() for f in themodel.state_space.feature_selection]} '
          # the feature count is parenthesised; the closing ")" was missing
          f'({len(themodel.state_space.feature_selection)})\n'
          f"Action: abstract_action ({themodel.action_space.flat_size()})")

In [None]:
# Compare and plot results for all the agents
all_runs = [
    random_run,
    credlookup_run,
    tabularq_run,
    tabularq_exploit_run,
    dql_run,
    dql_exploit_run
]

# Plot averaged cumulative rewards for Random vs credential lookup vs
# tabular Q (+exploit) vs DQL (+exploit)
# `themodel` is built only to read the feature and action-space descriptions
# that go into the figure title.
themodel = dqla.CyberBattleStateActionModel(ep)
p.plot_averaged_cummulative_rewards(
    all_runs=all_runs,
    title=f'Benchmark -- max_nodes={ep.maximum_node_count}, episodes={eval_episode_count},\n'
          f'State: {[f.name() for f in themodel.state_space.feature_selection]} '
          # the feature count is parenthesised; the closing ")" was missing
          f'({len(themodel.state_space.feature_selection)})\n'
          f"Action: abstract_action ({themodel.action_space.flat_size()})")

In [None]:
# Every run produced above, grouped by agent family.
contenders = [
    random_run, credlookup_run,
    tabularq_run, tabularq_exploit_run,
    dql_run, dql_exploit_run,
    ddql_run, ddql_exploit_run,
    dueling_dql_run, dueling_dql_exploit_run,
    dueling_ddql_run, dueling_ddql_exploit_run,
]

# Episode lengths first, then the averaged cumulative-reward comparison.
p.plot_episodes_length(contenders)
p.plot_averaged_cummulative_rewards(
    all_runs=contenders,
    title=f"Agent Benchmark top contenders\nmax_nodes:{ep.maximum_node_count}\n",
)

In [None]:
# One p.plot_all_episodes figure per run in `contenders`.
for run in contenders:
    p.plot_all_episodes(run)

In [None]:
# Random baseline plus the four deep-agent training runs.
# Left out: credlookup_run, the tabular runs and every *_exploit_run.
contenders2 = [random_run, dql_run, ddql_run, dueling_dql_run, dueling_ddql_run]

In [None]:
# Episode lengths first, then the averaged cumulative-reward comparison.
p.plot_episodes_length(contenders2)
p.plot_averaged_cummulative_rewards(
    all_runs=contenders2,
    title=f"Agent Benchmark top contenders\nmax_nodes:{ep.maximum_node_count}\n",
)

In [None]:
# Exploit runs of the four deep agents only.
# Left out: random_run, credlookup_run, the tabular runs and all training runs.
contenders3 = [
    dql_exploit_run,
    ddql_exploit_run,
    dueling_dql_exploit_run,
    dueling_ddql_exploit_run,
]

# Episode lengths first, then the averaged cumulative-reward comparison.
p.plot_episodes_length(contenders3)
p.plot_averaged_cummulative_rewards(
    all_runs=contenders3,
    title=f"Agent Benchmark top contenders\nmax_nodes:{ep.maximum_node_count}\n",
)

In [None]:
# Random baseline plus training and exploit runs of every deep agent.
# Left out: credlookup_run, tabularq_run, tabularq_exploit_run.
contenders1 = [
    random_run,
    dql_run, dql_exploit_run,
    ddql_run, ddql_exploit_run,
    dueling_dql_run, dueling_dql_exploit_run,
    dueling_ddql_run, dueling_ddql_exploit_run,
]

# Episode lengths first, then the averaged cumulative-reward comparison.
p.plot_episodes_length(contenders1)
p.plot_averaged_cummulative_rewards(
    all_runs=contenders1,
    title=f"Agent Benchmark top contenders\nmax_nodes:{ep.maximum_node_count}\n",
)

In [None]:
# One p.plot_all_episodes figure per run in `contenders1`.
for run in contenders1:
    p.plot_all_episodes(run)

In [None]:
# Training runs of every agent together with the random and credential-lookup
# baselines. Left out: every *_exploit_run.
# NOTE: this rebinds `contenders2`, which an earlier cell also assigns.
contenders2 = [
    random_run,
    credlookup_run,
    tabularq_run,
    dql_run,
    ddql_run,
    dueling_dql_run,
    dueling_ddql_run,
]

# Episode lengths first, then the averaged cumulative-reward comparison.
p.plot_episodes_length(contenders2)
p.plot_averaged_cummulative_rewards(
    all_runs=contenders2,
    title=f"Agent Benchmark top contenders\nmax_nodes:{ep.maximum_node_count}\n",
)

In [None]:
# Random baseline against every exploit run (tabular and deep).
# Left out: credlookup_run and all training runs.
# NOTE: this rebinds `contenders3`, which an earlier cell also assigns.
contenders3 = [
    random_run,
    tabularq_exploit_run,
    dql_exploit_run,
    ddql_exploit_run,
    dueling_dql_exploit_run,
    dueling_ddql_exploit_run,
]

# Episode lengths first, then the averaged cumulative-reward comparison.
p.plot_episodes_length(contenders3)
p.plot_averaged_cummulative_rewards(
    all_runs=contenders3,
    title=f"Agent Benchmark top contenders\nmax_nodes:{ep.maximum_node_count}\n",
)

In [None]:
# Every exploit run (tabular and deep), without the random baseline.
# Left out: random_run, credlookup_run and all training runs.
contenders3v2 = [
    tabularq_exploit_run,
    dql_exploit_run,
    ddql_exploit_run,
    dueling_dql_exploit_run,
    dueling_ddql_exploit_run,
]

# Episode lengths first, then the averaged cumulative-reward comparison.
p.plot_episodes_length(contenders3v2)
p.plot_averaged_cummulative_rewards(
    all_runs=contenders3v2,
    title=f"Agent Benchmark top contenders\nmax_nodes:{ep.maximum_node_count}\n",
)

In [None]:
# Exploit runs of the four deep agents only.
# Left out: random_run, credlookup_run, both tabular runs and all training runs.
contenders3v3 = [
    dql_exploit_run,
    ddql_exploit_run,
    dueling_dql_exploit_run,
    dueling_ddql_exploit_run,
]

# Episode lengths first, then the averaged cumulative-reward comparison.
p.plot_episodes_length(contenders3v3)
p.plot_averaged_cummulative_rewards(
    all_runs=contenders3v3,
    title=f"Agent Benchmark top contenders\nmax_nodes:{ep.maximum_node_count}\n",
)

In [None]:
# Credential lookup and tabular Q against the DDQL / Dueling DQL / Dueling DDQL
# training runs.
# Left out: random_run, dql_run and every *_exploit_run.
contenders4 = [
    credlookup_run,
    tabularq_run,
    ddql_run,
    dueling_dql_run,
    dueling_ddql_run,
]

# Episode lengths first, then the averaged cumulative-reward comparison.
p.plot_episodes_length(contenders4)
p.plot_averaged_cummulative_rewards(
    all_runs=contenders4,
    title=f"Agent Benchmark top contenders\nmax_nodes:{ep.maximum_node_count}\n",
)

In [None]:
# Tabular Q exploit against the DDQL / Dueling DQL / Dueling DDQL exploit runs.
# Left out: random_run, credlookup_run, dql_exploit_run and all training runs.
contenders5 = [
    tabularq_exploit_run,
    ddql_exploit_run,
    dueling_dql_exploit_run,
    dueling_ddql_exploit_run,
]

# Episode lengths first, then the averaged cumulative-reward comparison.
p.plot_episodes_length(contenders5)
p.plot_averaged_cummulative_rewards(
    all_runs=contenders5,
    title=f"Agent Benchmark top contenders\nmax_nodes:{ep.maximum_node_count}\n",
)