# SGAI models (DQN)

This notebook is based on the PyTorch DQN tutorial [here](https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html), ported to TensorFlow/Keras. It is intended to both create and train models for Courtney2-Outbreak.

### Training Environments

In [1]:
import sys
import numpy as np
import tensorflow as tf
sys.path.append("./")  # make sure that it is able to import Board

from Board import Board
from constants import *
from Player import ZombiePlayer, GovernmentPlayer

In [2]:
# Device type that the rest of the notebook passes to tf.device(...).
DEVICE = "GPU"
#tf.debugging.set_log_device_placement(True)
devices = tf.config.list_physical_devices(DEVICE)
print(devices)
# Enable memory growth on every detected device. Iterating (instead of
# indexing devices[0]) avoids an IndexError when no such device is present
# and also covers machines with more than one device.
for _physical_device in devices:
    tf.config.experimental.set_memory_growth(_physical_device, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [22]:
class ZombieEnvironment:
    """
    Gym-style environment in which the agent controls one zombie on a Board.

    Each call to `step` applies the agent's action and, when that action was
    valid, lets a GovernmentPlayer answer with a move of its own before the
    board is updated. Observations are the board's `get_board()` output
    wrapped in a numpy array.
    """

    # integer ids the agent can choose from
    ACTION_SPACE = tuple(range(8))
    # id -> action name; names containing "move" are looked up directly in
    # board.actionToFunction, everything else is treated as a bite
    ACTION_MAPPINGS = dict(
        enumerate(
            (
                "moveUp",
                "moveDown",
                "moveLeft",
                "moveRight",
                "biteUp",
                "biteDown",
                "biteLeft",
                "biteRight",
            )
        )
    )
    SIZE = (6, 6)

    def __init__(self) -> None:
        self.reset()

    def reset(self):
        """Create a fresh board holding one zombie and return the first observation."""
        self.board = Board(ZombieEnvironment.SIZE, "Zombie")
        self.board.populate(num_zombies=1)
        self.enemyPlayer = GovernmentPlayer()
        self.done = False

        # position of the first zombie, as returned by board.indexOf(True)
        self.agentPosition = self.board.indexOf(True)
        self.max_number_of_zombies = 1

        return self._get_obs()

    def step(self, action:int):
        """
        Apply `action` (a key of ACTION_MAPPINGS) and let the enemy respond.

        Returns the tuple (observation, reward, done, info).
        """
        action_name = ZombieEnvironment.ACTION_MAPPINGS[action]

        if "move" in action_name:
            mover = self.board.actionToFunction[action_name]
            valid, new_pos = mover(self.board.toCoord(self.agentPosition))
            if valid:
                self.agentPosition = new_pos
        else:  # bite variation
            # offsets applied to (coord[0], coord[1]); any name not listed
            # here falls back to the "left" offset, i.e. coord[0] - 1
            bite_offsets = {
                "biteUp": (0, -1),
                "biteDown": (0, 1),
                "biteRight": (1, 0),
            }
            shift_0, shift_1 = bite_offsets.get(action_name, (-1, 0))
            dest_coord = list(self.board.toCoord(self.agentPosition))
            dest_coord[0] += shift_0
            dest_coord[1] += shift_1
            valid, _ = self.board.actionToFunction["bite"](dest_coord)

        won = None
        if valid:
            # An invalid agent action does not end the episode; the enemy
            # simply does not get a turn. This can be changed.
            enemy_action, enemy_coord = self.enemyPlayer.get_move(self.board)
            if enemy_action:
                self.board.actionToFunction[enemy_action](enemy_coord)
            else:
                # the enemy has no action to play: the agent wins
                self.done = True
                won = True
            self.board.update()

        agent_state = self.board.States[self.agentPosition]
        if not agent_state.person.isZombie:  # zombie was cured
            self.done = True
            won = False

        self.max_number_of_zombies = max(
            self.board.num_zombies(), self.max_number_of_zombies
        )

        observation = self._get_obs()
        reward = self._get_reward(action_name, valid, won)
        return (observation, reward, self._get_done(), self._get_info())

    def _get_info(self):
        """Extra diagnostics for the caller; currently always empty."""
        return {}

    def _get_done(self):
        """Whether the episode has ended."""
        return self.done

    def _get_reward(self, action_name: str, was_valid: bool, won:bool):
        """
        Reward for the step just taken.

        -100 for an invalid action or a loss, 100 for a win, 15 for a valid
        bite and 1 for a valid move.
        """
        if not was_valid or won is False:
            return -100
        if won is True:
            return 100
        return 15 if "bite" in action_name else 1

    def _get_obs(self):
        """Current board contents as a numpy array."""
        return np.array(self.board.get_board())

    def render(self):
        """Draw the current board through PygameFunctions and refresh the display."""
        import PygameFunctions as PF
        import pygame
        PF.run(self.board)
        pygame.display.update()

    def init_render(self):
        """Initialise the pygame screen for the current board."""
        import PygameFunctions as PF
        import pygame
        PF.initScreen(self.board)
        pygame.display.update()

    def close(self):
        """Shut pygame down."""
        import pygame
        pygame.quit()



In [4]:
# Smoke test: build an environment, render it, take one step, render again.
import time
arr = tf.constant([1])  # action id 1 ("moveDown"), wrapped in a tensor like the model output
env = ZombieEnvironment()

print(env.reset())
try:
    env.init_render()
    env.render()
    time.sleep(0.2)
    print(env.step(arr.numpy()[0]))
    env.render()
finally:
    # always shut pygame down, even if rendering or stepping raised
    env.close()

[2 3 1 3 3 3 1 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
pygame 2.1.0 (SDL 2.0.16, Python 3.10.0)
Hello from the pygame community. https://www.pygame.org/contribute.html
(array([2, 3, 1, 3, 3, 3, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), -100, False, {})


### Make models

In [5]:
import keras.layers as layers
import keras.models as models
import keras

# One network output per action the zombie agent can take.
ZOMBIE_OUTPUT_SIZE = len(ZombieEnvironment.ACTION_SPACE)
# Flat observation vector with one entry per board cell.
# NOTE(review): ROWS and COLUMNS presumably come from `from constants import *`;
# confirm they agree with ZombieEnvironment.SIZE.
INPUT_SHAPE = (ROWS * COLUMNS,)


In [6]:
# Smoke test: run a small op inside the DEVICE scope to make sure that
# placing work on the configured device does not raise.
with tf.device(DEVICE):
    tf.random.normal((200, 3))


In [7]:
def make_zombie_model():
    """
    Build the (uncompiled) Q-network used for the zombie agent.

    The network takes a flat observation of shape INPUT_SHAPE and produces
    ZOMBIE_OUTPUT_SIZE outputs, i.e. one predicted Q value per action for
    the given state.
    """
    hidden_widths = (64, 128)  # layer sizes are arbitrary

    stack = [layers.InputLayer(INPUT_SHAPE), layers.Flatten()]
    for width in hidden_widths:
        # linear layer followed by a LeakyReLU non-linearity
        stack.append(layers.Dense(width))
        stack.append(layers.LeakyReLU())
    # linear output head: raw Q values, no activation
    stack.append(layers.Dense(ZOMBIE_OUTPUT_SIZE))

    return models.Sequential(stack)


In [8]:
with tf.device(DEVICE):
    # policy network: trained on every optimisation step
    zombie_policy = make_zombie_model()
    # target network: used to compute the bootstrap targets
    zombie_target = make_zombie_model()
    # Both models are initialised independently at random, so copy the policy
    # weights into the target to make them start out identical.
    zombie_target.set_weights(zombie_policy.get_weights())


In [9]:
# Sanity check: show the expected input shape and the layer-by-layer summary.
print(zombie_policy.input_shape)
zombie_policy.summary()


(None, 36)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 36)                0         
                                                                 
 dense (Dense)               (None, 64)                2368      
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 128)               8320      
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 8)                 1032      
                                                                 
Total params: 11,720
Trainable params: 11,720

In [10]:
# Sanity check: push one random 36-element input through the policy network
# and print the output shape (one row of values per input row).
with tf.device(DEVICE):
    temp = zombie_policy(tf.random.normal((1, 36)), training=False)
print(temp.shape)


(1, 8)


### DQN utilities

In [11]:
from collections import namedtuple
from queue import deque
import random


In [12]:
# Lightweight record describing one environment step; used during training.
Transition = namedtuple("Transition", ["state", "action", "next_state", "reward"])


class ReplayMemory(object):
    """Bounded buffer of Transition records; the oldest entries drop out first."""

    def __init__(self, capacity):
        # a deque with maxlen discards from the left once it is full
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        stored = self.memory
        return len(stored)


### Optimizers and Loss

In [13]:
# Optimizer and loss used by train_on_batch.
with tf.device(DEVICE):
    # 0.004 is passed as Adam's first positional argument (the learning rate)
    optimizer = keras.optimizers.Adam(0.004)
    # mean squared error between predicted and target Q values
    loss = keras.losses.MeanSquaredError()


### Training loop

In [14]:
import math

BATCH_SIZE = 128  # number of transitions sampled from replay memory per optimisation step
GAMMA = 0.999  # discount factor applied to the next-state value
EPSILON = 0.05  # exploration rate: probability of picking a random action
TARGET_UPDATE = 10  # the target network is synced when i_episode % TARGET_UPDATE == 0


In [15]:
def select_zombie_action(state, epsilon=None):
    """
    Pick an action for the zombie agent with an epsilon-greedy policy.

    Parameters
    ----------
    state:
        Input accepted by `zombie_policy`; the training loop passes a batch
        containing a single observation.
    epsilon:
        Probability of choosing a uniformly random action. Defaults to the
        module-level EPSILON when omitted, so existing callers are unaffected.

    Returns
    -------
    An int32 tensor of shape (1,) holding the chosen action id.
    """
    if epsilon is None:
        epsilon = EPSILON

    if random.random() > epsilon:
        # Exploit: take the action with the largest predicted Q value.
        # np.argmax works on the flattened array and returns the first
        # maximum, which is what the previous hand-rolled search did.
        q_values = zombie_policy(state, training=False).numpy()
        best_action = int(np.argmax(q_values))
        return tf.constant([best_action], dtype=tf.int32)

    # Explore: uniformly random action id.
    return tf.constant([random.randrange(ZOMBIE_OUTPUT_SIZE)], dtype=tf.int32)


In [16]:
from typing import List


def train_on_batch(_batch: List[Transition]):
    """
    Run one DQN optimisation step on `zombie_policy` using a sampled batch.

    For every transition (s, a, s', r) the regression target is
        r + GAMMA * max_a' Q_target(s', a')
    with the bootstrap term set to 0 when s' is terminal (stored as None).
    The prediction is Q_policy(s, a) for the action that was actually taken.

    Parameters
    ----------
    _batch:
        Transitions sampled from replay memory. An empty batch is a no-op.
    """
    batch_size = len(_batch)
    if batch_size == 0:
        return

    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*_batch))

    # Mask of transitions whose next state is not terminal.
    non_final_mask = np.array(
        [state is not None for state in batch.next_state], dtype=bool
    )
    non_final_next_states = [
        state for state in batch.next_state if state is not None
    ]

    state_batch = tf.constant(np.asarray(batch.state, dtype=np.float32))
    action_batch = tf.constant(batch.action, dtype=tf.int32)
    reward_batch = tf.constant(batch.reward, dtype=tf.float32)

    # V(s') from the target network: the best Q value of EACH next state
    # (row-wise max), left at 0 for terminal states. Skipped entirely when
    # every sampled transition is terminal, since the model cannot be called
    # on an empty batch.
    next_state_values = np.zeros(batch_size, dtype=np.float32)
    if non_final_next_states:
        next_q_values = zombie_target(
            tf.constant(np.asarray(non_final_next_states, dtype=np.float32)),
            training=False,
        ).numpy()
        next_state_values[non_final_mask] = next_q_values.max(axis=1)

    # Expected Q values; plain constants, so no gradient flows into the target.
    expected_state_action_values = reward_batch + GAMMA * tf.constant(next_state_values)
    expected_state_action_values = tf.expand_dims(expected_state_action_values, 1)

    with tf.GradientTape() as policy_tape:
        q_values = zombie_policy(state_batch, training=True)
        # Select Q(s, a) for the action that was taken in each transition
        # (the equivalent of torch's gather(1, action_batch)).
        taken_action_mask = tf.one_hot(
            action_batch, ZOMBIE_OUTPUT_SIZE, dtype=q_values.dtype
        )
        state_action_values = tf.reduce_sum(
            q_values * taken_action_mask, axis=1, keepdims=True
        )

        # compute loss (mean squared error); both tensors are (batch, 1)
        _loss = loss(expected_state_action_values, state_action_values)

    # Optimize the model
    policy_gradient = policy_tape.gradient(_loss, zombie_policy.trainable_variables)
    optimizer.apply_gradients(zip(policy_gradient, zombie_policy.trainable_variables))


In [17]:
# Maximum number of transitions kept in replay memory; older ones are dropped.
BUFFER_CAPACITY = 10000
# Shared replay buffer that train() fills and samples from.
memory = ReplayMemory(BUFFER_CAPACITY)


def train(epochs, max_timesteps=200):
    """
    Train `zombie_policy` with DQN on a rendered ZombieEnvironment.

    Parameters
    ----------
    epochs:
        Number of episodes to play.
    max_timesteps:
        Upper bound on the number of steps in a single episode; the episode
        is cut off once this many steps have been taken.
    """
    env = ZombieEnvironment()
    env.init_render()
    try:
        for i_episode in range(epochs):
            # Initialize the environment and state
            prev_obs = env.reset()
            done = False
            timesteps = 0
            while not done:
                timesteps += 1
                env.render()
                # Select and perform an action
                action = select_zombie_action(tf.constant([prev_obs]))
                action = action.numpy()[0]  # "flatten" the tensor and take the item
                new_obs, reward, done, _ = env.step(action)

                # Terminal transitions are stored with next_state=None
                next_state = None if done else new_obs

                # Store the transition in memory
                memory.push(prev_obs, action, next_state, reward)

                # Move to the next state
                prev_obs = next_state

                # Perform one step of the optimization (on the policy network)
                if len(memory) >= BATCH_SIZE:
                    train_on_batch(memory.sample(BATCH_SIZE))

                # Cut the episode off once the step budget is used up
                # (`continue` here would just keep looping forever).
                if timesteps >= max_timesteps:
                    break

            print("next episode")
            # Update the target network, copying all weights and biases in DQN
            if i_episode % TARGET_UPDATE == 0:
                # keep writing the checkpoint to disk as before ...
                zombie_policy.save_weights("zombie_policy_weights")
                # ... but sync the target network in memory rather than
                # reading the file that was just written
                zombie_target.set_weights(zombie_policy.get_weights())
    finally:
        # always release pygame, even if training is interrupted or fails
        env.close()


### Start Training!

In [23]:
# Train for 50 episodes with max_timesteps=300, inside the DEVICE scope.
with tf.device(DEVICE):
    train(50, 300)

### Metrics

In [None]:
# metric evaluate how well its doing
