In [None]:
!pip install -e ../../gym_k8s_real

In [None]:
import gym
import random
import subprocess
import time
import numpy as np
from threading import Lock, Thread
import datetime
import gym_k8s_real

## Packages to build DQN

In [None]:
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

## Define DQN class

In [2]:
 class DQNAgent():
    """Deep Q-network agent with an online network and a soft-updated target network.

    Greedy action selection can be boosted by a heuristic action: the Q-value
    of the heuristic action is raised to the current maximum Q-value before
    the arg-max is taken.

    States handed to `generate_action`, `remember`, `replay` and `learning`
    are passed straight to `predict`, and `state[0]` is used as one training
    row, so they are expected to be arrays of shape (1, state_size).
    """

    def __init__(self, env, path, episodes, max_env_steps, win_threshold, epsilon_decay,
                 state_size=None, action_size=None, epsilon=1.0, epsilon_min=0.01,
                 gamma=1, alpha=.01, alpha_decay=.01, batch_size=16, prints=False,
                 tau=0.1, C=1):
        """Store the hyperparameters and build the online and target networks.

        Args:
            env: environment; `observation_space.n` / `action_space.n` are read
                when `state_size` / `action_size` are not given, and
                `action_space.sample()` is used for exploration.
            path: prefix prepended to the file name in `save_model`.
            episodes, win_threshold, prints: stored on the instance; no method
                of this class reads them.
            max_env_steps: written to `env._max_episode_steps`.
            epsilon, epsilon_decay, epsilon_min: exploration rate used and
                decayed by `replay`.
            gamma: discount factor of the Q-learning target.
            alpha, alpha_decay: learning rate and decay given to Adam.
            batch_size: number of transitions sampled by `learning`.
            tau: interpolation factor of the soft target update in
                `train_target` (new optional parameter).
            C: `learning` trains on every C-th call (new optional parameter,
                clamped to at least 1).
        """
        self.memory = deque(maxlen=100000)
        # learning() samples from `replay_buffer` while remember() appends to
        # `memory`; both names refer to the same deque.
        self.replay_buffer = self.memory
        self.env = env

        self.state_size = self.env.observation_space.n if state_size is None else state_size
        self.action_size = self.env.action_space.n if action_size is None else action_size

        self.episodes = episodes
        self.env._max_episode_steps = max_env_steps
        self.win_threshold = win_threshold
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.gamma = gamma
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self.batch_size = batch_size
        self.path = path                     # prefix of the location the model is saved to
        self.prints = prints                 # stored only; not read by any method here

        self.tau = tau
        self.C = max(1, int(C))              # guards the modulo in learning()
        self.t = 0                           # call counter used by learning()

        self.model = self._build_model()
        # The target network starts as an exact copy of the online network.
        self.target_model = self._build_model()
        self.target_model.set_weights(self.model.get_weights())

    def _build_model(self):
        """Return a compiled network: 24 tanh -> 48 tanh -> action_size linear, MSE loss, Adam."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='tanh'))
        model.add(Dense(48, activation='tanh'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.alpha, decay=self.alpha_decay))
        return model

    def generate_action(self, state, eps, hAction=None):
        """Pick an action epsilon-greedily, boosting the heuristic action.

        With probability `eps` a random action is sampled from the environment.
        Otherwise the target network's Q-values are used; when `hAction` is
        given, its Q-value is raised by `max(Q) - Q[hAction]` first. Ties are
        resolved in favour of the lowest action index.
        """
        if np.random.random() < eps:
            return self.env.action_space.sample()

        # One forward pass is enough; work on a float copy so the boost does
        # not touch the network output.
        q_values = np.array(self.target_model.predict(state, verbose=0)[0], dtype=float)
        if hAction is not None:
            q_values[hAction] += np.amax(q_values) - q_values[hAction]
        return int(np.argmax(q_values))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        """Fit the online network on a random minibatch and decay epsilon.

        Targets are bootstrapped from the online network itself. Nothing is
        trained when no transition can be sampled.
        """
        minibatch = random.sample(
            self.memory, max(0, min(len(self.memory), batch_size)))
        if not minibatch:
            return

        x_batch, y_batch = [], []
        for state, action, reward, next_state, done in minibatch:
            y_target = self.model.predict(state, verbose=0)
            if done:
                y_target[0][action] = reward
            else:
                next_q = self.model.predict(next_state, verbose=0)[0]
                y_target[0][action] = reward + self.gamma * np.max(next_q)
            x_batch.append(state[0])
            y_batch.append(y_target[0])

        self.model.fit(np.array(x_batch), np.array(y_batch), batch_size=len(x_batch), verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def hValue(self, state, hAction):
        """Return `max(Q) - Q[hAction]` for `state`, using the target network."""
        q_values = self.target_model.predict(state, verbose=0)[0]
        return np.amax(q_values) - q_values[hAction]

    def learning(self, hAction=None):
        """Run one training round on every C-th call.

        `hAction` is accepted for interface compatibility and is not used.
        Training is skipped while the buffer holds fewer than `batch_size`
        transitions. After fitting the online network the target network is
        soft-updated.
        """
        self.t = (self.t + 1) % self.C
        if self.t != 0:
            return
        if len(self.replay_buffer) < self.batch_size:
            return

        states, targets_forecast = [], []
        samples = random.sample(self.replay_buffer, self.batch_size)

        for state, action, reward, new_state, done in samples:
            if done:
                target = reward
            else:
                # Bootstrap from the target network, not the online one.
                q_new_state = np.amax(self.target_model.predict(new_state, verbose=0)[0])
                target = reward + self.gamma * q_new_state

            # Only the taken action's output is moved towards the target.
            target_forecast = self.model.predict(state, verbose=0)
            target_forecast[0][action] = target

            states.append(state[0])
            targets_forecast.append(target_forecast[0])

        self.model.fit(np.array(states), np.array(targets_forecast), epochs=1, verbose=0)
        self.train_target()

    def train_target(self):
        """Soft update: target <- tau * online + (1 - tau) * target, per weight array."""
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            self.tau * online + (1 - self.tau) * target
            for online, target in zip(weights, target_weights)
        ]
        self.target_model.set_weights(blended)

    def save_model(self, name='DQN_Model'):
        """Save the online network to `self.path + name`."""
        self.model.save(self.path + name)
    
    

## Info about the Kubernetes environment we deploy

In [2]:
# Configuration shared by the cells below.

# Timestep duration in minutes
# We wait these many minutes for our actions to be enforced
timestep_duration = 1.5
# Name of the application handed to the gym environment and written to the history CSV
app_name = 'firewall'
# Resource requests; not referenced by the code visible in this notebook
memory_req = '128Mi'
cpu_req = '80m'
# Latency limit: a step counts as an SLA violation when the observed latency exceeds it
# (units are not stated here -- TODO confirm)
sla_latency = 2.6
# Endpoint associated with the SLA measurement -- presumably where latency is read from; verify
sla_host = 'http://145.100.135.89:6088/'
# latency metric
sla_metric_name = 'latency'
# Gym environment id, in "<module>:<env-id>" form
gym_env = 'gym_k8s_real:k8s-env-discrete-state-discrete-action-v2'
# Q-table file name and initial value (tabular Q-learning settings)
q_table_file = 'Q-env-discrete-state-discrete-action-data.npy'
q_table_init_value = 12.5
# Number of training epochs (outer loop of train_agent)
total_epochs = 100
# Argument passed to train_agent in the last cell
num_of_services = 1
# Maximum number of environment steps per epoch (inner loop of train_agent)
steps_per_epoch = 15

## Create historical states csv file if it doesn't exist

In [4]:
# The training loop appends its transitions to this path, so the header row
# must be written to the same file.
history_csv = 'k8s_historical_states_discrete.csv'
try:
    # Mode 'x' creates the file only when it does not exist yet and raises
    # FileExistsError otherwise, so an existing history is never truncated.
    with open(history_csv, 'x') as f:
        f.write('current_app_name,timestep,state,action,next_state,reward,'
                'done,number_of_pods,cpu_util,latency_violation,latency,hpa_threshold,info\n')
    print('File not present. Created successfully!')
except FileExistsError:
    print('File already present.')

File not present. Created successfully!


# Train the agent

## Agent training
This function trains our agent:

In [None]:
def _to_network_input(state, state_size):
    """Return `state` as a float array of shape (1, state_size or len(state)).

    A scalar state is one-hot encoded over `state_size` entries; anything else
    is flattened into a single row.
    NOTE(review): assumes a scalar state is the integer index of a discrete
    observation space -- confirm against the environment.
    """
    arr = np.asarray(state)
    if arr.ndim == 0:
        one_hot = np.zeros((1, state_size), dtype=float)
        one_hot[0, int(arr)] = 1.0
        return one_hot
    return arr.astype(float).reshape(1, -1)


def train_agent(num_service):
    """Train a DQNAgent on the Kubernetes gym environment.

    Runs `total_epochs` epochs of at most `steps_per_epoch` steps each. Every
    step is appended to 'k8s_historical_states_discrete.csv', stored in the
    agent's replay memory and followed by one call to `agent.learning`. The
    model is saved after every epoch.

    Args:
        num_service: not used by this function; kept so existing calls work.
    """
    # Hyperparameters
    learning_rate = 0.01
    learning_rate_decay = 0.01
    gamma = 0.9
    epsilon_init = 0.97
    epsilon_min = 0.2

    current_app_name = app_name

    # NOTE(review): the keyword names below mirror the configuration cell
    # (sla_latency / sla_host / sla_metric_name); the names this call used
    # before are not defined anywhere in the notebook. Confirm them against
    # the environment's constructor.
    env = gym.make(
        gym_env,
        timestep_duration=timestep_duration,
        app_name=current_app_name,
        sla_latency=sla_latency,
        sla_host=sla_host,
        sla_metric_name=sla_metric_name
    )

    # episodes / max_env_steps come from the notebook configuration; the agent
    # only stores win_threshold, so None is passed. Exploration is scheduled
    # in the loop below, epsilon_decay is passed for completeness.
    agent = DQNAgent(env, "", total_epochs, steps_per_epoch, None, epsilon_init,
                     state_size=None, action_size=None, epsilon=1.0, epsilon_min=epsilon_min,
                     gamma=gamma, alpha=learning_rate, alpha_decay=learning_rate_decay,
                     batch_size=16, prints=False)

    for epoch in range(total_epochs):
        # reset() of this environment also returns the heuristic action for
        # the initial state.
        state, _, hAction = env.reset()
        done = False

        for step in range(steps_per_epoch):
            if done:
                break

            current_timestep = epoch * steps_per_epoch + step

            # Epsilon keeps getting smaller and stops when it reaches epsilon_min
            epsilon = max(pow(epsilon_init, current_timestep), epsilon_min)

            net_state = _to_network_input(state, agent.state_size)
            action = agent.generate_action(net_state, epsilon, hAction)

            # step() also returns the heuristic action for the next state.
            real_ob, reward, done, next_state, hAction = env.step(action)

            # NOTE(review): fixed +2h offset on the local clock -- presumably
            # a timezone adjustment; confirm.
            now = datetime.datetime.now() + datetime.timedelta(hours=2)
            info = {
                'datetime': now.strftime('%d/%m/%Y %H:%M:%S')
            }

            #To-do: change real_ob
            (pod_cpu_util,
             cpu_threshold,
             number_of_pods,
             latency) = real_ob

            # Latency violation becomes 1 if the SLA was violated
            # otherwise it's 0
            latency_violation = int(latency > sla_latency)

            # Save historical tuple (raw, un-encoded states)
            with open('k8s_historical_states_discrete.csv', 'a') as f:
                f.write(
                    '{},{},{},{},'.format(current_app_name, current_timestep, state, action) +
                    '{},{},{},{},'.format(next_state, reward, done, number_of_pods) +
                    '{},{},{},{},{}'.format(pod_cpu_util, latency_violation, latency, cpu_threshold, info) +
                    '\n'
                )

            # Store the transition in the form the networks consume
            agent.remember(net_state, action, reward,
                           _to_network_input(next_state, agent.state_size), done)

            #Try one round of training
            agent.learning(hAction)

            #To-do: Update H-value

            #Update state
            state = next_state

        agent.save_model()
        print('One epoch of training finished.\n')

In [5]:
def decode(i):
    """Split an encoded integer into its mixed-radix digits.

    The digits are peeled off with bases 7, 5 and 5 (least significant
    first); the remaining quotient is the leading component. Returns a
    reverse iterator yielding the components most-significant first.
    """
    remainder, base7_digit = divmod(i, 7)
    remainder, first_base5_digit = divmod(remainder, 5)
    leading, second_base5_digit = divmod(remainder, 5)
    return reversed([base7_digit, first_base5_digit, second_base5_digit, leading])

In [None]:
# Start training; needs the configuration, DQNAgent and train_agent cells to have been run first.
train_agent(num_of_services)