# SGAI models (DQN)

This notebook is based on the PyTorch tutorial [here](https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html), ported to TensorFlow/Keras. It is intended to both create and train models for Courtney2-Outbreak.

### Training Environments

In [1]:
import sys
import numpy
sys.path.append("./")  # make sure that it is able to import Board

from Board import Board
from constants import *
from Player import ZombiePlayer, GovernmentPlayer

In [13]:
class ZombieEnvironment:
    """
    Gym-style environment in which the learning agent controls one zombie.

    The environment wraps a ``Board`` created for the "Zombie" role and a
    ``GovernmentPlayer`` opponent. ``reset`` returns the initial observation
    and ``step`` returns the usual ``(observation, reward, done, info)`` tuple.
    """

    ACTION_SPACE = tuple(range(8))
    ACTION_MAPPINGS = {
        0: "moveUp",
        1: "moveDown",
        2: "moveLeft",
        3: "moveRight",
        4: "biteUp",
        5: "biteDown",
        6: "biteLeft",
        7: "biteRight",
    }
    SIZE = (6, 6)

    # Reward handed out when the board rejects the chosen action.
    INVALID_ACTION_REWARD = -100
    # Neutral placeholder reward for accepted actions (previously ``None``,
    # which cannot be stored in a numeric reward batch).
    VALID_ACTION_REWARD = 0

    # (d_index0, d_index1) applied to the agent position to find the bite target.
    _BITE_OFFSETS = {
        "biteUp": (0, -1),
        "biteDown": (0, 1),
        "biteLeft": (-1, 0),
        "biteRight": (1, 0),
    }

    def __init__(self) -> None:
        self.reset()

    def reset(self):
        """
        Start a new episode.

        Builds a fresh board populated with a single zombie, creates a new
        government opponent, clears the ``done`` flag and returns the
        initial observation.
        """
        self.board = Board(ZombieEnvironment.SIZE, "Zombie")
        self.board.populate(num_zombies=1)
        self.enemyPlayer = GovernmentPlayer()
        self.done = False

        # coordinates of the first zombie
        self.agentPosition = self.board.coordsOf(True)

        return self._get_obs()

    def step(self, action):
        """
        Apply the agent's action, let the opponent choose a move and update
        the board.

        Parameters
        ----------
        action
            Index into ``ACTION_MAPPINGS``. May be a plain ``int``, a NumPy
            scalar, or any single-element array-like (e.g. a size-1 array).

        Returns
        -------
        tuple
            ``(observation, reward, done, info)``.

        Raises
        ------
        ValueError
            If ``action`` holds more than one element.
        KeyError
            If the action index is not in ``ACTION_MAPPINGS``.
        """
        action_array = numpy.asarray(action)
        if action_array.size != 1:
            raise ValueError(
                f"action must hold exactly one element, got {action_array.size}"
            )
        action_name = ZombieEnvironment.ACTION_MAPPINGS[
            int(action_array.reshape(-1)[0])
        ]

        if "move" in action_name:
            valid, new_pos = self.board.actionToFunction[action_name](
                self.agentPosition
            )
            if valid:
                self.agentPosition = new_pos
        else:  # bite variation
            offset_0, offset_1 = ZombieEnvironment._BITE_OFFSETS[action_name]
            dest_coord = list(self.agentPosition)
            dest_coord[0] += offset_0
            dest_coord[1] += offset_1
            valid, _ = self.board.actionToFunction["bite"](dest_coord)

        # currently, if the computer chooses an invalid move, this doesn't
        # end the environment. This can be changed.
        # NOTE(review): the returned move is not applied to the board here;
        # confirm whether get_move() applies it itself.
        _action, _coord = self.enemyPlayer.get_move(self.board)
        if not _action:
            # the opponent returned no action, so the episode ends
            self.done = True
        self.board.update()

        return (
            self._get_obs(),
            self._get_reward(action_name, valid),
            self._get_done(),
            self._get_info(),
        )

    def _get_info(self):
        """Return the (currently empty) diagnostics dictionary."""
        return {}

    def _get_done(self):
        """Return whether the current episode has finished."""
        return self.done

    def _get_reward(self, action_name: str, was_valid: bool):
        """
        Return the reward for the action that was just attempted.

        Invalid actions are penalised with ``INVALID_ACTION_REWARD``; every
        valid action currently earns ``VALID_ACTION_REWARD`` regardless of
        ``action_name``.
        """
        if not was_valid:
            return ZombieEnvironment.INVALID_ACTION_REWARD
        return ZombieEnvironment.VALID_ACTION_REWARD

    def _get_obs(self):
        """Return the board state from ``Board.get_board()`` as a NumPy array."""
        return numpy.array(self.board.get_board())

    def render(self):
        """Draw the current board with pygame."""
        # imported lazily so pygame is only needed when rendering
        import PygameFunctions as PF
        import pygame
        PF.run(self.board)
        pygame.display.update()

    def init_render(self):
        """Create the pygame screen for the current board."""
        import PygameFunctions as PF
        import pygame
        PF.initScreen(self.board)
        pygame.display.update()

    def close(self):
        """Shut pygame down."""
        import pygame
        pygame.quit()



In [14]:
import numpy
import time

# Smoke test: reset the environment, render it, take one step and render again.
# Build the action explicitly; numpy.ndarray(...) returns uninitialised memory
# and ndarray.itemset() was removed in NumPy 2.0.
arr = numpy.array([5], dtype=numpy.int64)  # index 5 maps to "biteDown"
env = ZombieEnvironment()

print(env.reset())
try:
    env.init_render()
    env.render()
    time.sleep(0.2)
    print(env.step(arr))
    env.render()
finally:
    # always shut pygame down, even if rendering or stepping fails
    env.close()

[0 0 0 1 0 3 0 1 0 1 1 1 0 0 0 0 1 2 1 0 1 1 0 3 1 0 0 0 0 3 0 0 0 0 0 3]
possible actions are ['moveUp', 'moveDown', 'moveLeft', 'moveRight', 'heal']
possible actions are ['moveUp', 'moveDown', 'moveRight', 'heal']
possible actions are ['moveUp', 'moveDown', 'moveRight']
choosing to go with moveDown at (5, 2)
(array([0, 0, 0, 1, 0, 3, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 2, 1, 0, 1, 1,
       0, 3, 1, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3]), -100, False, {})


### Make models

In [4]:
import tensorflow as tf
import keras.layers as layers
import keras.models as models
import keras
import numpy as np

# One model output per entry in the zombie environment's action space.
ZOMBIE_OUTPUT_SIZE = len(ZombieEnvironment.ACTION_SPACE)
# The model consumes a flat vector with one entry per board cell
# (ROWS and COLUMNS are presumably provided by `from constants import *`).
INPUT_SHAPE = (ROWS * COLUMNS,)
# Device string passed to tf.device(...) wherever models and ops are created.
DEVICE = "CPU"


In [5]:
# Log the device each TensorFlow op runs on. This is very verbose (one line
# per executed op), so it is mainly useful while checking device placement.
tf.debugging.set_log_device_placement(True)


In [6]:
# Smoke test: run a trivial op on the configured device so the placement log
# confirms where ops execute. The result itself is discarded.
with tf.device(DEVICE):
    tf.random.normal((200, 3))


Executing op RandomStandardNormal in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AddV2 in device /job:localhost/replica:0/task:0/device:CPU:0


In [7]:
def make_zombie_model():
    """
    Build the fully connected network used by the zombie agent.

    The network takes an input of shape ``INPUT_SHAPE``, passes it through
    two ``Dense`` + ``LeakyReLU`` hidden layers and ends in a linear
    ``Dense`` layer with ``ZOMBIE_OUTPUT_SIZE`` units, i.e. one predicted
    Q-value per action in the zombie action space.
    """
    hidden_units = (64, 128)  # hidden layer widths; the values are arbitrary

    network = models.Sequential()
    network.add(layers.InputLayer(INPUT_SHAPE))
    network.add(layers.Flatten())
    for units in hidden_units:
        network.add(layers.Dense(units))
        network.add(layers.LeakyReLU())
    network.add(layers.Dense(ZOMBIE_OUTPUT_SIZE))
    return network


In [8]:
with tf.device(DEVICE):
    # policy network: the one that is trained and used to pick actions
    zombie_policy = make_zombie_model()
    # target network: provides the bootstrap targets during training
    zombie_target = make_zombie_model()
    # Start the target network as an exact copy of the policy network;
    # otherwise the first targets come from unrelated random weights.
    zombie_target.set_weights(zombie_policy.get_weights())


Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RandomUniform in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AddV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Fill in device /job:localho

In [9]:
# Sanity check: show the input shape the policy network expects, followed by
# the per-layer summary (output shapes and parameter counts).
print(zombie_policy.input_shape)
zombie_policy.summary()


(None, 36)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 36)                0         
                                                                 
 dense (Dense)               (None, 64)                2368      
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 128)               8320      
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 5)                 645       
                                                                 
Total params: 11,333
Trainable params: 11,333

In [10]:
# Sanity check: push one random input through the policy network and confirm
# the output shape. The input size is taken from INPUT_SHAPE rather than a
# hard-coded number so it tracks the board dimensions.
with tf.device(DEVICE):
    temp = zombie_policy(tf.random.normal((1, *INPUT_SHAPE)), training=False)
print(temp.shape)


Executing op RandomStandardNormal in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AddV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Reshape in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MatMul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BiasAdd in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op LeakyRelu in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MatMul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BiasAdd in device /job:localhost/replica

### DQN utilities

In [11]:
from collections import namedtuple
from queue import deque
import random


In [12]:
# Lightweight record type for one step of experience, used during training.
Transition = namedtuple("Transition", ["state", "action", "next_state", "reward"])


class ReplayMemory:
    """Bounded buffer of ``Transition`` records; the oldest entries drop out first."""

    def __init__(self, capacity):
        # a deque with maxlen discards its oldest element once it is full
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a ``Transition`` from ``args`` and store it."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions chosen at random without replacement."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)


### Optimizers and Loss

In [14]:
with tf.device(DEVICE):
    # mean squared error between predicted and expected Q-values
    loss = keras.losses.MeanSquaredError()
    # Adam optimizer used to update the policy network
    optimizer = keras.optimizers.Adam(learning_rate=0.004)


### Training loop

In [13]:
import math

# Number of transitions sampled from replay memory for each optimisation step.
BATCH_SIZE = 128
# Discount factor applied to next-state values when building Q-value targets.
GAMMA = 0.999
# Epsilon-greedy exploration: the random-action threshold starts at EPS_START
# and decays exponentially towards EPS_END; EPS_DECAY sets how many action
# selections the decay is spread over (larger means slower decay).
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
# The target network is refreshed from the policy network every this many episodes.
TARGET_UPDATE = 10


In [None]:
# Number of actions selected so far; drives the epsilon decay below.
# Re-running this cell restarts the exploration schedule.
steps_done = 0


def select_zombie_action(state):
    """
    Pick an action for the zombie agent with an epsilon-greedy policy.

    With probability ``eps_threshold`` (decaying exponentially from
    ``EPS_START`` towards ``EPS_END`` as ``steps_done`` grows) a uniformly
    random action is returned; otherwise the action with the largest
    predicted Q-value from ``zombie_policy`` is returned.

    Parameters
    ----------
    state
        Observation of the board; it is cast to ``float32`` and reshaped to
        a single-row batch before being fed to the network.

    Returns
    -------
    tf.Tensor
        ``int64`` tensor of shape ``(1, 1)`` holding the chosen action index.
    """
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(
        -1.0 * steps_done / EPS_DECAY
    )
    steps_done += 1
    if sample > eps_threshold:
        # the model expects a (batch, features) float input
        state_batch = tf.reshape(
            tf.cast(tf.convert_to_tensor(state), tf.float32), (1, -1)
        )
        q_values = zombie_policy(state_batch, training=False)
        # argmax over the action axis picks the action with the largest
        # expected reward
        best_action = tf.argmax(q_values, axis=1, output_type=tf.int64)
        return tf.reshape(best_action, (1, 1))
    return tf.constant([[random.randrange(ZOMBIE_OUTPUT_SIZE)]], dtype=tf.int64)


In [15]:
from typing import List


def train_on_batch(_batch: List[Transition]):
    """
    Run one DQN optimisation step on a batch of transitions.

    For every transition the target is ``reward + GAMMA * max_a Q_target(next_state, a)``,
    where the second term is zero for terminal transitions (``next_state is None``).
    The policy network is then updated to move ``Q_policy(state, action)``
    towards that target using the module-level ``loss`` and ``optimizer``.

    Parameters
    ----------
    _batch
        Transitions sampled from replay memory. Actions and rewards may be
        Python scalars, NumPy values or single-element tensors.
    """
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*_batch))
    batch_size = len(_batch)

    # mark the transitions whose next state is not terminal
    non_final_mask = np.array(
        [next_state is not None for next_state in batch.next_state], dtype=bool
    )
    non_final_next_states = [
        np.asarray(next_state, dtype=np.float32).reshape(-1)
        for next_state in batch.next_state
        if next_state is not None
    ]

    states = np.stack(
        [np.asarray(state, dtype=np.float32).reshape(-1) for state in batch.state]
    )
    actions = np.array(
        [np.asarray(action).reshape(-1)[0] for action in batch.action],
        dtype=np.int64,
    )
    rewards = np.array(
        [np.asarray(reward).reshape(-1)[0] for reward in batch.reward],
        dtype=np.float32,
    )

    with tf.device(DEVICE):
        # V(s') from the target network; stays 0 for terminal transitions
        next_state_values = np.zeros(batch_size, dtype=np.float32)
        if non_final_next_states:
            next_q_values = zombie_target(
                tf.constant(np.stack(non_final_next_states)), training=False
            )
            next_state_values[non_final_mask] = tf.reduce_max(
                next_q_values, axis=1
            ).numpy()

        # Compute the expected Q values
        expected_state_action_values = tf.constant(
            rewards + GAMMA * next_state_values, dtype=tf.float32
        )

        state_batch = tf.constant(states)
        action_batch = tf.constant(actions)

        # The forward pass and the loss must both happen inside the tape,
        # otherwise no gradient flows back to the policy network.
        with tf.GradientTape() as policy_tape:
            q_values = zombie_policy(state_batch, training=True)
            # keep only the Q-value of the action that was actually taken
            action_mask = tf.one_hot(
                action_batch, q_values.shape[-1], dtype=q_values.dtype
            )
            state_action_values = tf.reduce_sum(q_values * action_mask, axis=1)

            # compute loss (mean squared error); Keras order is (y_true, y_pred)
            _loss = loss(expected_state_action_values, state_action_values)

        # Optimize the model
        policy_gradient = policy_tape.gradient(
            _loss, zombie_policy.trainable_variables
        )
        optimizer.apply_gradients(
            zip(policy_gradient, zombie_policy.trainable_variables)
        )


In [None]:
# Maximum number of transitions kept in replay memory.
BUFFER_CAPACITY = 10000
# ReplayMemory requires its capacity; older transitions are dropped once full.
memory = ReplayMemory(BUFFER_CAPACITY)


def train(epochs):
    """
    Train the zombie policy network for ``epochs`` episodes.

    Each episode resets the module-level ``env``, then repeatedly selects an
    action, steps the environment, stores the transition in ``memory`` and,
    once at least ``BATCH_SIZE`` transitions are stored, runs one
    optimisation step. Every ``TARGET_UPDATE`` episodes the target network
    is overwritten with the policy network's weights.

    Parameters
    ----------
    epochs
        Number of episodes to run.
    """
    for i_episode in range(epochs):
        # Initialize the environment and state
        state = env.reset()
        done = False
        while not done:
            # Select and perform an action
            action = select_zombie_action(state)
            # hand the environment a NumPy scalar, which supports ``.item()``
            action_index = np.asarray(action).reshape(-1)[0]
            observation, reward, done, _ = env.step(action_index)
            # treat a missing reward (None) as 0 so it fits a numeric batch
            reward = tf.constant(
                [0.0 if reward is None else reward], dtype=tf.float32
            )

            # Observe new state; terminal transitions have no next state
            next_state = None if done else observation

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            if len(memory) >= BATCH_SIZE:
                train_on_batch(memory.sample(BATCH_SIZE))

        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            zombie_target.set_weights(zombie_policy.get_weights())
