In [1]:
import os
import gym
import numpy as np
import keras
from keras.models import Model, load_model
from keras.layers import Input, Dense, Lambda, Add
from tensorflow.keras.optimizers import Adam
from keras import backend as kb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
import torch
import gym
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt
import pandas as pd

In [2]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class AgentBuild:
    #Learning Parameters specifically tailored to PPO hyperparameters
    learning_rate = 0.01 #The learning rate of the optimizer
    gamma = 0.99 #Determines how important future rewards are to current state 0.99 is most common value
    plot_freq = 5 #The frequency with which the model will be plotted
    update_freq = 1 #The frequency with which the model will be updated
    num_epoch = 5 #The number of times the model will be trained using the entire dataset
    clip_range = 0.2 #Clipping range of the agent 0.1 - 0.3 most common clipping ranges
    lmbda = 0.9 #Used to reduce varience in training 0.9 - 1 is the most common range of values
    v_coef = 1 #Value function coefficient most common values 0.5 or 1
    e_coef = 0.01 #Entropy coefficient most common values range from 0 - 0.01
    
    #Memory Parameters
    mem_size = 500
    train_if_true = True 

In [4]:

class MlpPolicy(nn.Module):
    def __init__(self, action_size):
        super(MlpPolicy, self).__init__()
        self.action_size = action_size
        self.input_size = 56
        self.fc1 = nn.Linear(self.input_size, 24)
        self.fc2 = nn.Linear(24, 24)
        self.fc3_pi = nn.Linear(24, self.action_size)
        self.fc3_v = nn.Linear(24, 1)
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
        
    # Policy Function
    def pi(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3_pi(x)
        return self.softmax(x)
        
    # Value Function
    def val(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3_v(x)
        return x
        

In [21]:
class NASimAgent(AgentBuild):
    def __init__(self):
        self.env = gym.make('nasim:Tiny-v0')
        self.action_size = self.env.action_space.n
        if self.train_if_true:
            self.policy = MlpPolicy(action_size = self.action_size).to(device)
        self.opt = optim.Adam(self.policy.parameters(), lr = self.learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.opt, self.num_epoch, self.gamma)
        self.loss = 0
        self.memory = {
            'state' : [],
            'action' : [],
            'next_state' : [],
            'reward' : [],
            'action_prob' : [],
            'advantage' : [],
            'target' : torch.FloatTensor([]),
            'count' : 0
        }
        
    def update_network(self):
            x = 0
    
    # Training function for agent
    def train(self):
        episode = 0
        step = 0
        rewards = []
        avg_reward = []
        done = False
        
        # Starting a new episode
        while not done:
            starting_step = step
            episode += 1
            length_of_episode = 0
            
            #Setup initial state of enviornment
            state = self.env.reset()
            curr_state = state
            
            # Step in episode
            while not done:
                self.env.render_state()
                step += 1
                length_of_episode += 1
                episode_reward = 0
                
                # Selection of next action
                action_prob = self.policy.pi(torch.FloatTensor(curr_state).to(device))
                action = torch.distributions.Categorical(action_prob).sample().item()
                
                # Performing the selected action in the current state
                temp_state, reward, done, info = self.env.step(action)
                next_state = temp_state
                self.rem(curr_state, 
                                action, 
                                reward,
                                next_state,
                                action_prob[action].item())
                curr_state = next_state
                episode_reward += reward
                # Updating model based on episode number
                if episode % self.update_freq == 0:
                    for info in range(self.num_epoch):
                        self.update_network()
                # Updating plot based on episode number        
                if episode % plot_freq == 0:
                    plot(reward_history, avg_reward)
            self.env.close()
            
    # Function to plot the score against the episode number
    def plot(reward_history, avg_reward):
        df = pd.DataFrame({'x' : range(len(reward_history)),
                           'Reward' : reward_history,
                           'Average' : avg_reward})
        plt.style.use('ggplot')
        plt.plot(df['x'],
                 df['Reward'], 
                 marker='',
                 linewidth=0.7, alpha=0.9,
                 label='Reward')
        plt.title("NASim Score vs Number of Episodes Plot", fontsize=12)
        plt.xlabel("episode", fontsize=12)
        plt.ylabel("score", fontsize=12)
        plt.savefig('SimScores.png')
    
    #remember the state of agent
    def rem(self, state, action, reward, next_state, prob):
        if self.memory['count'] < self.mem_size:
            self.memory['count'] += 1
        else:
            self.memory['state'] = self.memory['state'][1:]
            self.memory['action'] = self.memory['action'][1:]
            self.memory['reward'] = self.memory['reward'][1:]
            self.memory['next_state'] = self.memory['next_state'][1:]
            self.memory['action_prob'] = self.memory['action_prob'][1:]
            self.memory['advantage'] = self.memory['advantage'][1:]

        self.memory['state'].append(state)
        self.memory['action'].append([action])
        self.memory['reward'].append([reward])
        self.memory['next_state'].append(next_state)
        self.memory['action_prob'].append(prob)


In [22]:
def run_simulation():
    A2CAgent = NASimAgent()
    A2CAgent.train()

Run Simulation


In [23]:
if __name__ == '__main__':
    run_simulation()

State:
+---------+-------------+-----------+------------+-------+-----------------+--------+-------+------+--------+
| Address | Compromised | Reachable | Discovered | Value | Discovery Value | Access | linux | ssh  | tomcat |
+---------+-------------+-----------+------------+-------+-----------------+--------+-------+------+--------+
|  (1, 0) |    False    |    True   |    True    |  0.0  |       0.0       |  0.0   |  True | True |  True  |
|  (2, 0) |    False    |   False   |   False    | 100.0 |       0.0       |  0.0   |  True | True |  True  |
|  (3, 0) |    False    |   False   |   False    | 100.0 |       0.0       |  0.0   |  True | True |  True  |
+---------+-------------+-----------+------------+-------+-----------------+--------+-------+------+--------+


  return self.softmax(x)


NameError: name 'plot_freq' is not defined