# N-Step Forest Fire DQN

## Set Up

### Colab Set Up

In [1]:
# ------------- Colab Set Up ------------
# Mounts Google Drive and makes the run folder the current directory, so the
# relative file names used later in the notebook resolve inside that folder.
import os

from google.colab import drive

# Folder of this run inside the mounted Drive
WORKING_DIRECTORY = '/gdrive/My Drive/CARL/RUNS2/06_21_9x9'

# Mounting Drive at root
drive.mount('/gdrive')
os.chdir(WORKING_DIRECTORY)
# Optional dependency, uncomment only if needed:
# ! pip install seaborn

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


### Libraries

In [2]:
# ------------- Libraries ------------

import datetime
import os
import pickle
import random
import shutil

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from helicopter import EnvMakerForestFire
from lib import dqn_model
from lib.helpers import Agent, ReplayMemoryNSteps, get_epsilon, observations_to_tensors

# Seed every random number generator in use so that runs are repeatable.
# Python's own `random` module is seeded too, in case any imported helper
# draws from it (NOTE(review): not verifiable from this notebook alone).
SEED = 125750
random.seed(SEED)
np.random.seed(SEED)
# The trailing `;` keeps the returned torch Generator out of the cell output
torch.manual_seed(SEED);

  import pandas.util.testing as tm


<torch._C.Generator at 0x7f3c5ac41a90>

## Algorithm Parameters

### Running Params

In [3]:
# ------------- Running Params -------------

# Number of training iterations performed by this particular job
JOB_ITERS = 1_600_000

# Compute device: the first CUDA GPU when one is available, otherwise the CPU.
# (Assign torch.device('cpu') here instead to force CPU execution.)
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")
print(f'Using {DEVICE}')

Using cuda:0


### Logging Params

In [4]:
# ------------- Logging Params ------------

# Evaluation: every EVAL_EPOCHS epochs, EVAL_ITERS evaluation steps are played
# and the mean rewards are printed and appended to EVAL_FILE
EVAL_EPOCHS = 10_000
EVAL_ITERS = 1000
EVAL_FILE = 'dqn_eval.csv'

# Period (in epochs) of the temporary backups kept in case of a failure
LOG_TMP = 10_000
# Period (in epochs) of the rows appended to the training csv
LOG_CSV = 10

# Logging files
STATE_FILE = 'dqn_state.txt'
TRAINING_FILE = 'dqn_training.csv'
ENV_FILE = 'dqn_env.pickle'

# Network weights: final file plus the two rotating temporary backups
WEIGHTS_FILE = 'dqn_model_weights.pytorch'
TMP_WEIGHTS_OLD = 'dqn_model_weights_tmp_old.pytorch'
TMP_WEIGHTS_NEW = 'dqn_model_weights_tmp_new.pytorch'

# Replay memory: final file plus the two rotating temporary backups
MEMORY_FILE = 'dqn_buffer.pickle'
TMP_MEMORY_OLD = 'dqn_buffer_tmp_old.pickle'
TMP_MEMORY_NEW = 'dqn_buffer_tmp_new.pickle'

### Env Params

In [5]:
# ------------- Env Params ------------
# Period (in epochs) at which the training agent is reset
RESET_ENV = EVAL_EPOCHS

# The training and the evaluation environments are built from the same settings
_ENV_KWARGS = dict(
    observation_mode='channels3',
    n_row=9, n_col=9,
    p_fire=0.009, p_tree=0.300,
    moves_before_updating=5,
    reward_fire=-1.00, reward_tree=0.60, reward_empty=-0.30, reward_hit=2.00,
    reward_type='both',
)

D_ENV = {}
for env_type in ('train', 'eval'):
    env = EnvMakerForestFire(**_ENV_KWARGS)
    D_ENV[env_type] = env

ENV = D_ENV['train']
ENV_EVAL = D_ENV['eval']

# Number of actions
N_ACTIONS = len(ENV.movement_actions)

### DQN Params

In [6]:
# ------------- DQN Params ------------

# N-steps unrolling of the Bellman Equation
STEPS = 3
# Discount
GAMMA = 0.99

# Min possible epsilon on epsilon-greedy policy, exponential decay
MIN_EPSILON = 0.10
# Exploration epochs, passed to get_epsilon together with MIN_EPSILON
EXPLORATION_EPOCHS = 1_000_000
# When True, steps played before EXPLORATION_EPOCHS use the 'heuristic' policy
USE_HEURISTIC = False

# Synchronize target network each x steps
SYNC_TARGET = 10_000

# Replay Memory Size
REPLAY_SIZE = 800_000
# Start learning at this size of the replay memory
REPLAY_START_SIZE = 2000

### Network Params

In [7]:
# ------------- Network Params ------------

# Optimisation hyper-parameters
LEARNING_RATE = 2e-4
BATCH_SIZE = 16

# Shape helper used when instantiating the network
def set_global_shape(env):
    """Return the shape of the grid observed right after resetting `env`.

    `env.reset()` must return a 3-item observation whose first item exposes
    `.shape`; the other two items are ignored. Note that calling this
    function resets the environment.
    """
    observation = env.reset()
    grid, _position, _moves = observation
    return grid.shape
# Build the online network from the observed grid shape, then move it to DEVICE
SHAPE = set_global_shape(ENV)
NET = dqn_model.DQN(SHAPE, N_ACTIONS)
NET = NET.to(DEVICE)
print(NET)

DQN(
  (conv): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): GELU()
    (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): GELU()
    (4): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): GELU()
  )
  (fc): Sequential(
    (0): Linear(in_features=2595, out_features=256, bias=True)
    (1): GELU()
    (2): Linear(in_features=256, out_features=9, bias=True)
  )
)


## Resuming Execution and Loss

### Resuming Execution

In [8]:
# ------------- Resuming Execution ------------
# Leaves `net`, `buffer`, `agent` and `RESUMED_EPOCH` ready for the training
# loop. A run is resumed when STATE_FILE already exists in the working
# directory; otherwise a fresh run is initialised and STATE_FILE is created.

params_to_log=\
f"""# FOREST FIRE DQN
# {datetime.datetime.now().strftime('%Y/%B/%d %H:%M:%S')}
#
# ------------- Running Params -------------
# JOB_ITERS = {JOB_ITERS}
# DEVICE = {str(DEVICE)}
#
# ------------- Logging Params ------------
# EVAL_EPOCHS = {EVAL_EPOCHS}
# EVAL_ITERS = {EVAL_ITERS}
# STATE_FILE = {STATE_FILE}
# TRAINING_FILE = {TRAINING_FILE}
# WEIGHTS_FILE = {WEIGHTS_FILE}
# EVAL_FILE = {EVAL_FILE}
#
# ------------- DQN Params ------------
# * N-steps unrolling of the Bellman Equation
# STEPS = {STEPS}
# * Discount
# GAMMA = {GAMMA}
# * Min possible epsilon on epsilon-greedy policy, exponetial decay
# MIN_EPSILON = {MIN_EPSILON}
# EXPLORATION_EPOCHS = {EXPLORATION_EPOCHS}
# USE_HEURISTIC = {USE_HEURISTIC}
# * Synchronize target network each x steps
# SYNC_TARGET = {SYNC_TARGET}
# * Replay Memory Size
# REPLAY_SIZE = {REPLAY_SIZE}
# * Start learning at this size of the replay memory
# REPLAY_START_SIZE = {REPLAY_START_SIZE}
#
# ------------- Network Params ------------
# BATCH_SIZE = {BATCH_SIZE}
# LEARNING_RATE = {LEARNING_RATE}
#
"""


def comment_string(string, comment='# ', separator='\n'):
    """Split `string` on `separator` and prefix every piece with `comment`.

    Returns the list of prefixed pieces (the separators are not included).
    """
    return [comment + piece for piece in string.split(separator)]


def _state_timestamp():
    """Current date/time in the comma-separated layout of the STATE_FILE rows."""
    return datetime.datetime.now().strftime('%Y,%m,%d,%H,%M,%S')


def _ensure_csv_header(path, header):
    """Create `path` with `header` as first line unless the file already exists."""
    if not os.path.isfile(path):
        with open(path, 'a') as csv_file:
            csv_file.write(header)


def _load_saved_weights(network):
    """Load WEIGHTS_FILE into `network` when that file exists; return `network`."""
    if os.path.isfile(WEIGHTS_FILE):
        network.load_state_dict(torch.load(WEIGHTS_FILE, map_location=DEVICE))
    return network


def _new_prefilled_agent(network):
    """Create an empty replay memory with its agent and pre-fill the memory.

    REPLAY_START_SIZE steps are played with epsilon = 1.00 before returning.

    Returns:
        The `(memory, agent)` pair.
    """
    memory = ReplayMemoryNSteps(REPLAY_SIZE, steps=STEPS)
    filled_agent = Agent(ENV, memory)
    for _ in range(REPLAY_START_SIZE):
        filled_agent.play_step(network, epsilon = 1.00, device=DEVICE)
    print(f'\nFilled Replay Memory')
    return memory, filled_agent


if os.path.isfile(STATE_FILE):
    # ---- Resume a previous run ----
    # Loading the pretrained net
    net = _load_saved_weights(NET)

    # Training File
    _ensure_csv_header(TRAINING_FILE, 'epoch,reward,loss,epsilon\n')

    # Loading memory replay
    if os.path.isfile(MEMORY_FILE):
        # NOTE: unpickling runs arbitrary code stored in the file; only resume
        # from a buffer file written by this notebook.
        with open(MEMORY_FILE, 'rb') as file:
            buffer = pickle.load(file)
        agent = Agent(ENV, buffer)
    else:
        buffer, agent = _new_prefilled_agent(net)

    # The first field of the last (non-blank) row is the last logged epoch
    with open(STATE_FILE, 'r') as file:
        state_rows = [row for row in file if row.strip()]
    if not state_rows:
        raise ValueError(f'{STATE_FILE} is empty, cannot find the epoch to resume from')
    RESUMED_EPOCH = int(state_rows[-1].split(',')[0]) + 1
    with open(STATE_FILE, 'a') as file:
        file.write(''.join((f'{RESUMED_EPOCH},START,', _state_timestamp(), '\n')))
else:
    # ---- Fresh run ----
    # Both reprs are computed before STATE_FILE is created, so a failure here
    # cannot leave an empty state file behind.
    env_params = repr(ENV.init_kw_params)
    net_arch = repr(NET)

    # Initializing State File: commented parameters, then the csv header + first row
    with open(STATE_FILE, 'w') as file:
        file.write(params_to_log)
        file.write('# ------------- Environment Params ------------\n')
        file.write(',\n'.join(comment_string(env_params, separator=',')) + '\n')
        file.write('#\n# ------------- Network Arch ------------\n')
        file.write('\n'.join(comment_string(net_arch)) + '\n')
        file.write('epoch,bound,year,month,day,hour,minute,second\n')
        file.write(''.join(('0,START,', _state_timestamp(), '\n')))

    # Training File (the header is not repeated if the file is already there)
    _ensure_csv_header(TRAINING_FILE, 'epoch,reward,loss,epsilon\n')

    # Load Weights if available
    net = _load_saved_weights(NET)

    # Initializing Memory and Agent, then filling the replay memory
    buffer, agent = _new_prefilled_agent(net)

    RESUMED_EPOCH = 0

_ensure_csv_header(EVAL_FILE, 'epoch,mean_eval,mean_train,loss,epsilon\n')


Filled Replay Memory


### Loss

In [9]:
# ------------- Helper Functions ------------

def calc_loss(batch, net, tgt_net, gamma, device='cpu'):
    """Mean squared error between Q(s, a) of the taken actions and their N-step targets.

    Args:
        batch: tuple `(states, actions, rewards, dones, next_states)`.
            `rewards` holds, per transition, the sequence of rewards passed on
            to `calc_batch_targets`.
        net: online network, called as `net(grids, positions, moves)`.
        tgt_net: target network with the same call signature; no gradient is
            propagated into it.
        gamma: discount factor.
        device: device on which the tensors are created.

    Returns:
        Scalar loss tensor, differentiable w.r.t. the parameters of `net`.
    """
    states, actions, rewards, dones, next_states = batch

    grids, positions, moves = observations_to_tensors(states, device=device)
    grids_next, positions_next, moves_next = observations_to_tensors(next_states, device=device)

    # Stored actions are shifted down by one to index the network outputs
    # (presumably the environment numbers its actions from 1 - TODO confirm)
    actions_v = torch.tensor(actions).to(device) - 1
    done_mask = torch.BoolTensor(dones).to(device)

    # Quality of the taken actions
    state_action_values = net(grids, positions, moves).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The target side is a constant for the optimiser, so it is computed
    # without recording an autograd graph at all.
    with torch.no_grad():
        # Next state max Q(s,a)
        next_state_values = tgt_net(grids_next, positions_next, moves_next).max(1)[0]
        # The value of ended episode is 0
        next_state_values[done_mask] = 0.0

    # Calculate target
    targets_batch = calc_batch_targets(rewards, gamma, next_state_values, device)
    return nn.MSELoss()(state_action_values, targets_batch)

# N Steps Bellman Equation
def calc_batch_targets(rewards_batch, gamma, next_state_values, device='cpu'):
    """N-step Bellman targets for a batch of transitions.

    For a transition with rewards r_0 ... r_{n-1} the target is

        r_0 + gamma * r_1 + ... + gamma**(n-1) * r_{n-1} + gamma**n * V_next

    Args:
        rewards_batch: one sequence of step rewards per transition. Sequences
            may have different lengths; an empty one yields just `V_next`.
        gamma: discount factor.
        next_state_values: one bootstrap value `V_next` per transition
            (tensor or sequence, same order as `rewards_batch`).
        device: device of the returned tensor.

    Returns:
        1-D float32 tensor with one target per transition. It carries no
        gradient information.
    """
    n_step_returns = []
    bootstrap_discounts = []
    for rewards_steps in rewards_batch:
        # The step count is tracked explicitly, so the bootstrap exponent is
        # right even when this transition has no rewards at all.
        n_steps = 0
        n_step_return = 0.0
        for reward in rewards_steps:
            n_step_return += gamma**n_steps * reward
            n_steps += 1
        n_step_returns.append(float(n_step_return))
        bootstrap_discounts.append(gamma**n_steps)

    # One vectorised expression instead of converting tensor elements one by one
    returns_t = torch.tensor(n_step_returns, dtype=torch.float32, device=device)
    discounts_t = torch.tensor(bootstrap_discounts, dtype=torch.float32, device=device)
    next_values = torch.as_tensor(next_state_values, dtype=torch.float32, device=device).detach()
    return returns_t + discounts_t * next_values

## Main program

### Train Loop

In [10]:
# ------------- Main Loop -------------
# Plays one environment step and performs one optimisation step per iteration.
# Periodically it logs to csv, evaluates the network, synchronises the target
# network and stores temporary backups of the weights and the replay memory.

def _rotate_and_save(save_fn, old_path, new_path):
    """Store a fresh snapshot while keeping the previous one as a fallback.

    After the call `new_path` holds the snapshot written by `save_fn(new_path)`
    and `old_path` holds the snapshot that was in `new_path` before the call.
    When there was none, `old_path` is kept as it was, or created as a copy of
    the fresh snapshot if it did not exist either.
    """
    if os.path.isfile(new_path):
        # os.replace overwrites `old_path` directly: the previous backup is
        # never deleted before its replacement is in place.
        os.replace(new_path, old_path)
    save_fn(new_path)
    if not os.path.isfile(old_path):
        shutil.copyfile(new_path, old_path)


def _save_weights(path):
    """Write the current weights of `net` to `path`."""
    torch.save(net.state_dict(), path)


def _save_buffer(path):
    """Pickle the replay memory of the training agent to `path`."""
    with open(path, 'wb') as buffer_file:
        pickle.dump(agent.exp_buffer, buffer_file)


def _mean_eval_reward(eval_agent, network, n_steps):
    """Reset `eval_agent`, play `n_steps` steps with `network`, return the mean reward."""
    eval_agent._reset()
    eval_rewards = [eval_agent.play_step(network, device=DEVICE) for _ in range(n_steps)]
    return np.array(eval_rewards).mean()


# Agent for Evaluation
AGENT_EVAL = Agent(ENV_EVAL, ReplayMemoryNSteps(42, steps=STEPS))

# Initializations
tgt_net = dqn_model.DQN(SHAPE, N_ACTIONS).to(DEVICE)
tgt_net.load_state_dict(net.state_dict())

optimizer = optim.Adam(net.parameters(), lr = LEARNING_RATE)

rewards_x_steps = []
epoch = RESUMED_EPOCH
print('STARTING TO LEARN NOW', end='\n\n')
with open(TRAINING_FILE, 'a') as training_log:
    for job_iter in range(JOB_ITERS):
        # Play a step
        epsilon = get_epsilon(epoch, exploration_epochs=EXPLORATION_EPOCHS, min_epsilon=MIN_EPSILON)
        if USE_HEURISTIC and epoch < EXPLORATION_EPOCHS:
            reward = agent.play_step(net, epsilon, device=DEVICE, policy='heuristic')
        else:
            reward = agent.play_step(net, epsilon, device=DEVICE)
        rewards_x_steps.append(reward)

        # Network Optimization
        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, gamma=GAMMA, device=DEVICE)
        loss_t.backward()
        optimizer.step()

        # Log to CSV
        if epoch % LOG_CSV == 0:
            log_loss = np.round(loss_t.item(), 4)
            log_epsilon = np.round(epsilon, 4)
            log = str(epoch), str(reward), str(log_loss), str(log_epsilon)
            training_log.write(','.join(log) + '\n')

        if epoch % RESET_ENV == 0:
            agent._reset()

        # The periodic tasks below are keyed on the number of completed epochs
        epoch += 1

        if epoch % EVAL_EPOCHS == 0:
            eval_mean_reward = _mean_eval_reward(AGENT_EVAL, net, EVAL_ITERS)
            print(f'EPOCH: {epoch}')
            print('Mean reward per step (Evaluation): ' + str(np.round(eval_mean_reward,4)))

            # Mean training reward since the previous evaluation
            mean_reward = np.array(rewards_x_steps).mean()
            print('Mean reward per step (Training): ' + str(np.round(mean_reward,4)) + '\n')
            rewards_x_steps = []

            with open(EVAL_FILE,'a') as eval_file:
                log_loss = np.round(loss_t.item(), 4)
                log_epsilon = np.round(epsilon, 4)
                eval_log = str(epoch), str(np.round(eval_mean_reward,4)), str(np.round(mean_reward,4)), str(log_loss), str(log_epsilon)
                eval_file.write(','.join(eval_log) + '\n')

        # Synchronize net and target net
        if epoch % SYNC_TARGET == 0:
            tgt_net.load_state_dict(net.state_dict())

        # Logging temporary files in case of a failure
        if epoch % LOG_TMP == 0:
            # Push the buffered csv rows to disk together with the backups
            training_log.flush()
            # Temporary Weights
            _rotate_and_save(_save_weights, TMP_WEIGHTS_OLD, TMP_WEIGHTS_NEW)
            # Temporary memory buffer
            _rotate_and_save(_save_buffer, TMP_MEMORY_OLD, TMP_MEMORY_NEW)


STARTING TO LEARN NOW

EPOCH: 10000
Mean reward per step (Evaluation): -9.5315
Mean reward per step (Training): -9.7181

EPOCH: 20000
Mean reward per step (Evaluation): -8.7795
Mean reward per step (Training): -9.7425

EPOCH: 30000
Mean reward per step (Evaluation): -9.4073
Mean reward per step (Training): -9.7139

EPOCH: 40000
Mean reward per step (Evaluation): -8.4529
Mean reward per step (Training): -10.1263

EPOCH: 50000
Mean reward per step (Evaluation): -7.8312
Mean reward per step (Training): -9.7725

EPOCH: 60000
Mean reward per step (Evaluation): -7.6179
Mean reward per step (Training): -9.9647

EPOCH: 70000
Mean reward per step (Evaluation): -8.9433
Mean reward per step (Training): -9.8793

EPOCH: 80000
Mean reward per step (Evaluation): -10.8728
Mean reward per step (Training): -9.6016

EPOCH: 90000
Mean reward per step (Evaluation): -7.615
Mean reward per step (Training): -9.4254

EPOCH: 100000
Mean reward per step (Evaluation): -4.905
Mean reward per step (Training): -10.0

### Log Final Results

In [11]:
# ------------- Final Loggings -------------
# Persists what a later job needs in order to resume: weights, the END row of
# the state file, the replay memory and the training environment.

# Historic Weights: a dated copy of the learned weights
history_stamp = datetime.datetime.now().strftime('%Y_%B_%d_%H-%M-%S')
torch.save(net.state_dict(), history_stamp + '_' + WEIGHTS_FILE)
print(f'Writing learned weights to {WEIGHTS_FILE}')
# Last Weights: the file loaded when a run is resumed
torch.save(net.state_dict(), WEIGHTS_FILE)

# Log final Job Epoch (`epoch` is already one past the last trained epoch)
with open(STATE_FILE, 'a') as state_log:
    end_stamp = datetime.datetime.now().strftime('%Y,%m,%d,%H,%M,%S')
    state_log.write(f'{epoch-1},END,' + end_stamp + '\n')

# Log memory buffer and Env
for pickle_path, pickled_object in ((MEMORY_FILE, agent.exp_buffer), (ENV_FILE, ENV)):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(pickled_object, pickle_file)

print(f'Finished {JOB_ITERS} iterations')
print('JOB TERMINATED')

Writing learned weights to dqn_model_weights.pytorch
Finished 1600000 iterations
JOB TERMINATED
