## Imports

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models, transforms
import gc

import random
from tqdm import tqdm
import numpy as np
import cv2
import matplotlib.pyplot as plt

from simulation import Simulator, coordinate

## Simulator Setup

In [9]:
def reset_sim():
    
    # Units are pixels for resolution, degrees for fov, degrees for angle, and pixels for height.
    cameraSettings = {
        "resolution": (1920, 1080),
        "fov": {"diagonal": random.uniform(74, 80)}, # realsense diagonal fov is 77 degrees IIRC
        "angle": {"roll": random.uniform(-5, 5), "pitch": random.uniform(10, 20), "yaw": random.uniform(-5, 5)}, # don't go too crazy with these, my code should be good up to like... 45 degrees probably? But the math gets unstable
        "height": random.uniform(58, 74) # 8 pixels/inch - represents how high up the camera is relative to the road
    }

    mapParameters = {
    "loops": 1,
    "size": (6, 6),
    "expansions": 5,
    "complications": 4
    }

    # Can also pass car parameters for max/min speed, etc
    carParameters = {
        "wheelbase": random.uniform(5.5, 7.5), # inches, influences how quickly the steering will turn the car.  Larger = slower
        "maxSteering": 30.0, # degrees, extreme (+ and -) values of steering
        "steeringOffset": random.uniform(-.5, .5), # degrees, since the car is rarely perfectly aligned
        "minVelocity": 0.0, # pixels/second, slower than this doesn't move at all.
        "maxVelocity": 480.0, # pixels/second, 8 pixels/inch, so if the car can move 5 fps that gives us 480 pixels/s top speed
    }

    sim = Simulator(cameraSettings=cameraSettings)
    
    # startLocation = (random.randint(0, 5), random.randint(0, 5), 0, random.randint(0, 2))
    # random seed for consistent maps
    # can also pass a start location if you know the code: (y tile index, x tile index, position index, direction index)
    # - position index is from 0-(number of connections the tile has - 1), so a straight is 0 or 1, a t is 0, 1, or 2.
    # - direction index is 0 or 1 for normal or reversed.
    sim.start(mapSeed='real', mapParameters=mapParameters, carParameters=carParameters, startPoint=(0,4,0,0))
    where, facing = sim.RealSense.parent.ackermann.pose()
    initial_img = sim.RealSense.camera.getImage(where, facing)
    return sim, initial_img

From here, the API for using the simulation is as follows. Steps the entire simulation, returns image, reward from sim.getReward() and a done bool (and we can change what 'done' means. Currently its if reward is negative):

```python
frame, reward, done = sim.step(steer, speed, display=False) 
```

In order to reset the simulation, you just need to reconstruct the sim object and start it, using the reset_sim() function above.

# DQN

## Part 1

Deep Q-Network (https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) is a Q-learning algorithm that learns values for state-action pairs.

Actions are sampled according to an $\epsilon-greedy$ policy to help with exploration of the state space. Every time an action is sampled, the agent chooses a random action with $\epsilon$ probability. Otherwise, the agent selects the action with the highest Q-value for a state. $\epsilon$ decays over time according to $\epsilon \gets \epsilon * epsilon\_decay$.

Tuples of state, action, reward, next_state, and terminal $(s,a,r,s',d)$ are collected during training. Every $learn\_frequency$ steps $sample\_size$ tuples are sampled and made into 5 tensors tensors of states, actions, rewarads, next_states, and terminals.

The loss for a batch of size N is given below.

$Loss=\frac{1}{N}\sum \bigg(Q(s,a) - (r + \gamma \underset{a'\sim A}{max} \hat{Q}(s',a')(1-d))\bigg)^2 $

Loss is calculated and used to update the Q-Network. The target network $\hat{Q}$ begins as a copy of the Q network but is not updated by the optimizer. Every $target\_update$ steps, the target network is updated with the parameters of the Q-Network. This process is a type of bootstrapping.

In [10]:
# Q-Value Network
class QNetwork(nn.Module):
  def __init__(self, feature_size=20, action_size=7):
    super().__init__()
    
    hidden_size = 50
    
    self.resnet18 = models.resnet18(weights='ResNet18_Weights.DEFAULT')
    self.resnet18.fc = nn.Linear(in_features=self.resnet18.fc.in_features, out_features=feature_size)

    self.label = nn.Linear(feature_size, action_size)


    # self.lstm = nn.LSTM(feature_size, hidden_size, num_layers=1, batch_first=True)
    # self.label = nn.Linear(hidden_size, action_size)

    # self.prev_hidden_state = torch.zeros(self.lstm.num_layers, hidden_size).cuda()
    # self.prev_cell_state =  torch.zeros(self.lstm.num_layers, hidden_size).cuda()
    
  def forward(self, img_batch):
    """Estimate q-values given image

      Args:
          img batch (4d tensor): size (batch_size, height, width, channel)

      Returns:
          q-values (tensor): estimated q-values, size (batch x action_size)
    """
    img_batch = transforms.functional.convert_image_dtype(img_batch).permute([0, 3, 1, 2])
    features = self.resnet18(img_batch)

    # output, (final_hidden_state, final_cell_state) = self.lstm(features, (self.prev_hidden_state, self.prev_cell_state))

    # self.prev_hidden_state = final_hidden_state.detach()
    # self.prev_cell_state = final_cell_state.detach()

    # values = self.label(output)
    values = self.label(features)
    return values

In [11]:
def get_action_dqn(network, state, epsilon, epsilon_decay):
  """Select action according to e-greedy policy and decay epsilon

    Args:
        network (QNetwork): Q-Network
        state (np-array): current state, size (state_size)
        epsilon (float): probability of choosing a random action
        epsilon_decay (float): amount by which to decay epsilon

    Returns:
        action (int): chosen action [0, action_size)
        epsilon (float): decayed epsilon
  """
  
  if random.uniform(0., 1.) < epsilon:
    action = random.randint(0,6) #randint 0-6 corresponding to [-30,-20,-10,0,10,20,30] degrees
  else:
    with torch.no_grad():
      state_tensor = torch.Tensor(state).float().cuda()
      action = int(np.argmax(network(state_tensor.unsqueeze(0)).cpu()))
      # del state_tensor
  return action, epsilon*epsilon_decay


def prepare_batch(frame_buffer, action_idx_buffer, next_frame_buffer, reward_buffer, done_buffer, batch_size):
  """Randomly sample batch from memory
     Prepare cuda tensors

    Args:
        memory (list): state, action, next_state, reward, done tuples
        batch_size (int): amount of memory to sample into a batch

    Returns:
        state (tensor): float cuda tensor of size (batch_size x state_size)
        action (tensor): long tensor of size (batch_size)
        next_state (tensor): float cuda tensor of size (batch_size x state_size)
        reward (tensor): float cuda tensor of size (batch_size)
        done (tensor): float cuda tensor of size (batch_size)
  """

  idx_array = np.random.randint(0, len(frame_buffer), batch_size)


  frame = np.array([frame_buffer[i] for i in idx_array])
  action = np.array([action_idx_buffer[i] for i in idx_array])
  next_frame = np.array([next_frame_buffer[i] for i in idx_array])
  reward = np.array([reward_buffer[i] for i in idx_array])
  done = np.array([done_buffer[i] for i in idx_array])

  # frame = np.array(frame_buffer[idx_array])
  # action = np.array(action_idx_buffer[idx_array])
  # next_frame = np.array(next_frame_buffer[idx_array])
  # reward = np.array(reward_buffer[idx_array])
  # done = np.array(done_buffer[idx_array])

  return torch.FloatTensor(frame).cuda(), torch.FloatTensor(action).cuda(),torch.FloatTensor(next_frame).cuda(), \
          torch.FloatTensor(reward).cuda(), torch.FloatTensor(done).cuda()
  
def learn_dqn(trajectory, optim, q_network, target_network, gamma, episode, target_update):
  """Update Q-Network according to DQN Loss function
     Update Target Network every target_update global steps

    Args:
        batch (tuple): tuple of state, action, next_state, reward, and done tensors
        optim (Adam): Q-Network optimizer
        q_network (QNetwork): Q-Network
        target_network (QNetwork): Target Q-Network
        gamma (float): discount factor
        episode (int): total steps taken in environment
        target_update (int): frequency of target network update
  """
  optim.zero_grad()
  total_loss = torch.tensor([0.0], requires_grad=True).cuda()

  state = trajectory[0]
  action =trajectory[1]
  next_state = trajectory[2]
  reward = trajectory[3] 
  done = trajectory[4]

  # Sequentially loop through the episode trajectory
  action = torch.unsqueeze(action,dim=0).long()

  Q = torch.gather(q_network(state), 1, action)
  Q_hat = torch.max(target_network(next_state), dim=1)[0]
  b = (reward + gamma*Q_hat*(1 - done))

  Q = torch.squeeze(Q)
  
  total_loss = F.mse_loss(Q, b)

  total_loss.backward()
  optim.step()

  if episode % target_update == 0:
    target_network.load_state_dict(q_network.state_dict())


### Main

In [12]:
def dqn_main():
  gc.collect()
  torch.cuda.empty_cache()

  # Hyper parameters
  episodes = 10000
  target_update = 250
  save_frequency = 250  
  MAX_EPISODE_LENGTH = 200
  SAVE_REWARD_THRESHOLD = 20
  LEARN_FREQUENCY = 25
  START_TRAINING = 200
  BATCH_SIZE = 32

  CAR_SPEED = 1.5

  lr = 1e-3

  gamma = 0.99
  epsilon = 1
  epsilon_decay = .999
  img_size = (128, 72)

  # Init networks
  q_network = QNetwork().cuda()
  target_network = QNetwork().cuda()
  target_network.load_state_dict(q_network.state_dict()) #copy q_network into target_network

  # Init optimizer
  optim = torch.optim.Adam(q_network.parameters(), lr=lr)

  # Begin main loop
  results_dqn = []
  global_step = 0
  prev_avg_reward = 0.0
  avg_reward = 20.0
  loop = tqdm(total=episodes, position=0, leave=False)
  action_space = [-30,-20,-10,0,10,20,30]

  # Init episode replay buffer
  frame_buffer = []
  action_idx_buffer = []
  next_frame_buffer = []
  reward_buffer = []
  done_buffer = []

  for episode in range(1, episodes):

    # Reset environment
    sim, frame = reset_sim()
    done = False
    cum_reward = 0  # Track cumulative reward per episode

    frame = cv2.resize(frame, img_size)

    # Begin episode
    while not done and cum_reward < MAX_EPISODE_LENGTH:  # End after 200 steps 
      # Select e-greedy action
      action_idx, epsilon = get_action_dqn(q_network, frame, epsilon, epsilon_decay)

      # Take step
      next_frame, reward, done = sim.step(steer=action_space[action_idx], speed=CAR_SPEED, display=False)

      # Store step in replay bufferimg
      next_frame = cv2.resize(next_frame, img_size)

      frame_buffer.append(next_frame)
      action_idx_buffer.append(action_idx)
      next_frame_buffer.append(next_frame)
      reward_buffer.append(reward)
      done_buffer.append(done)

      cum_reward += reward
      frame = next_frame  # Set current frame
      global_step += 1

      if (global_step > START_TRAINING) and (global_step % LEARN_FREQUENCY == 0):
        batch = prepare_batch(frame_buffer, action_idx_buffer, next_frame_buffer, reward_buffer, done_buffer, BATCH_SIZE)  # Train the network after episode ended
        learn_dqn(batch, optim, q_network, target_network, gamma, episode, target_update)     # Train
        del batch

    avg_reward += cum_reward

    # Save the network if save_frequency steps has passed and it is better than the previous avg_reward
    if (episode % save_frequency == 0):
      avg_reward /= save_frequency
      if (avg_reward > prev_avg_reward + SAVE_REWARD_THRESHOLD):
        print(f"{save_frequency} episodes completed. Average Reward: {avg_reward} > Previous Average: {prev_avg_reward}")
        torch.save(q_network, f'./rl_models/model{episode}.pt')
      prev_avg_reward = avg_reward
      avg_reward = 0.0

    # Print results at end of episode
    r = torch.cuda.memory_reserved(0)
    a = torch.cuda.memory_allocated(0)
    results_dqn.append(cum_reward)
    loop.update(1)
    loop.set_description('Episodes: {} Reward: {} Allocated Memory: {} Reserved Memory {}'.format(episode, cum_reward, a / 1e9, r / 1e9))
  
  return q_network, results_dqn, prev_avg_reward

In [13]:
trained_model, results_dqn, avg_reward = dqn_main()

plt.plot(results_dqn)
plt.show()

print(f'Final Average Reward: {avg_reward}')

Episodes: 5 Reward: 50.0 Allocated Memory: 0.182882304 Reserved Memory 0.226492416:   0%|          | 5/10000 [00:01<48:38,  3.42it/s]  

TypeError: only integer scalar arrays can be converted to a scalar index