Create environment and agents
---

In [None]:
%matplotlib inline
%run env.py
%run rl-helpers.py

# Environment view wrapped around the DeliveryDrones simulation. Available views:
#   (Q-table) CompassQTable, CompassChargeQTable, LidarCompassQTable, LidarCompassChargeQTable
#   (Grid)    WindowedGridView
env = WindowedGridView(DeliveryDrones(), radius=3)

# Agent selection: only the last variant is active. The other two are kept inside
# string literals so they are not executed; remove the quotes to switch to one of them.
"""Q-learning agent
my_agent = QLearningAgent(env, gamma=0.99, alpha=0.1, epsilon_start=1, epsilon_decay=0.99, epsilon_end=0.01)
"""

"""DQN with dense Q-network
my_agent = DQNAgent(
    env, DenseQNetworkFactory(env, hidden_layers=[256, 256]),
    gamma=0.95, epsilon_start=0.5, epsilon_decay=0.8, epsilon_end=0.01, memory_size=10000, batch_size=64, target_update_interval=5)
"""

"""DQN with conv. Q-network"""
# Six 3x3 convolution specs (stride 1, padding 1): three with 32 output channels,
# then three with 64. Only the channel count varies between layers.
conv_layers = [
    {'out_channels': channels, 'kernel_size': 3, 'stride': 1, 'padding': 1}
    for channels in (32, 32, 32, 64, 64, 64)
]
q_network_factory = ConvQNetworkFactory(env, conv_layers=conv_layers, dense_layers=[256])
my_agent = DQNAgent(
    env, q_network_factory,
    gamma=0.95, epsilon_start=1, epsilon_decay=0.99, epsilon_end=0.01,
    memory_size=10000, batch_size=64, target_update_interval=5)

Training
---

In [None]:
# Setup custom environment parameters for training.
# The drone-count key is spelled 'n_drones' (plural), the same key that the
# Benchmarking section passes to DeliveryDrones(env_params=...). update() accepts
# any key without complaint, so a misspelled key would not raise an error.
env.env_params.update({'n_drones': 3, 'pickup_reward': 1, 'discharge': 2})

# Reset environment with those parameters
env.reset()

# Setup random opponents: one RandomAgent per drone, then replace the agent
# registered under index 0 with the agent being trained
agents = {drone.index: RandomAgent(env) for drone in env.drones}
agents[0] = my_agent

# Create trainer (fixed seed)
trainer = MultiAgentTrainer(env, agents, reset_agents=True, seed=0)

In [None]:
# Turn greedy action selection off while training
my_agent.is_greedy = False

# Ten rounds of trainer.train(2000); per the original note, train() calls
# env.reset(), so every round runs on a new grid
for _ in range(10):
    trainer.train(2000)

    # Shrink the starting epsilon by 1% and restart epsilon from that value
    decayed_start = my_agent.epsilon_start * 0.99
    my_agent.epsilon_start = decayed_start
    my_agent.epsilon = decayed_start

# Rolling rewards from the trainer's log, restricted to indices 0..4
plot_rolling_rewards(trainer.rewards_log, subset=range(0, 5))

Inspect agents
---

In [None]:
from IPython.display import display

# Agent-specific inspection
if isinstance(my_agent, QLearningAgent):
    # Tabular agent: print the Q-table's shape and display a sample of 10 entries
    qtable = my_agent.get_qtable()
    print('Q-table:', qtable.shape)
    display(qtable.sample(10))
elif isinstance(my_agent, DQNAgent):
    # DQN agent: inspect the replay memory, then print the Q-network
    my_agent.inspect_memory()

    print('Q-network:', my_agent.qnetwork, sep='\n')
    print()

# Plot the epsilon values recorded by the agent
plt.plot(my_agent.epsilons)
plt.xlabel('Episodes')
plt.ylabel('Epsilon')
plt.title('Epsilon decay')
plt.show()

# Greedy evaluation: one 1000-step test run per seed 0..9, each followed by a
# cumulative-rewards plot restricted to indices 0..4
my_agent.is_greedy = True
for test_seed in range(10):
    rewards_log = test_agents(env, agents, n_steps=1000, seed=test_seed)
    plot_cumulative_rewards(rewards_log, subset=range(0, 5))

Test agents
---

In [None]:
from IPython.display import clear_output
import time

# Evaluation only: our agent must pick actions greedily
my_agent.is_greedy = True

# Start from a fresh environment; details are printed for the first drone only
states = env.reset()
my_drone = env.drones[0]
rewards = None  # no rewards exist before the first step

# There is no exit condition: the loop runs until the kernel is interrupted
while True:
    # Replace the previous output with the current rendering
    clear_output(wait=True)
    print(env.render('ainsi'))

    # Ask each agent for an action based on its own state
    actions = {}
    for index, agent in agents.items():
        actions[index] = agent.act(states[index])

    # Status of the tracked drone: charge, state, chosen action, last reward.
    # The format_* helpers are optional on the environment, hence the hasattr checks.
    print('Drone:', my_drone.index, 'charge: {}%'.format(my_drone.charge))
    if hasattr(env, 'format_state'):
        current_state = states[my_drone.index]
        print('Current states:', env.format_state(current_state))
    if hasattr(env, 'format_action'):
        next_action = actions[my_drone.index]
        print('Next actions:', env.format_action(next_action))
    if rewards is not None:
        print('Last rewards:', rewards[my_drone.index])

    # Pause so the frame stays readable, then advance the environment one step
    time.sleep(1)
    states, rewards, dones, _ = env.step(actions)

Benchmarking
---

In [None]:
%run env.py
%run rl-helpers.py

# Build the simulation with 10+1 drones and wrap it in a WindowedGridView (radius 3)
base_env = DeliveryDrones(env_params={'n_drones': 10+1})
env = WindowedGridView(base_env, radius=3)
states = env.reset()

# Step the environment 10**6 times, sampling a fresh random action for every drone
for i in tqdm(range(10**6)):
    random_actions = {drone.index: env.action_space.sample() for drone in env.drones}
    states, rewards, dones, _ = env.step(random_actions)

Graphics
---

In [None]:
%matplotlib inline
from IPython.display import display
from PIL import Image
import numpy as np
import itertools

In [None]:
# Load RGBA image
# (assumes the PNG decodes to an (H, W, 4) uint8 array, as "RGBA" implies -- TODO confirm)
sprites_img = Image.open('16ShipCollection.png')
sprites_img_array = np.array(sprites_img)

# Make black background transparent.
# A pixel counts as background only when R, G and B are all exactly zero. The
# channels are compared one by one instead of being summed: adding uint8 values
# wraps around modulo 256, so a sum-based test would also match non-black pixels
# such as (128, 128, 0) and wrongly erase them.
black_pixels = (sprites_img_array[:, :, :3] == 0).all(axis=2)

# Zero the alpha channel (index 3) of every background pixel
sprites_img_array[black_pixels, 3] = 0

# Create tiles
def get_ships_tile(row, col):
    """Cut one 16x16 tile out of ``sprites_img_array`` and return it as a PIL image.

    Tiles are laid out on a regular pitch of tile size plus small padding,
    starting at pixel (42, 28). Columns come in groups of five, and every
    group is shifted right by one extra big padding.

    Args:
        row: Tile row index on the sheet.
        col: Tile column index on the sheet, counted across all groups.

    Returns:
        The selected region wrapped with ``Image.fromarray``.
    """
    tile_px, gap_px, group_gap_px = 16, 4, 10
    origin_y, origin_x = 42, 28
    pitch = tile_px + gap_px

    # Which group of five the column belongs to, and its position inside that group
    group, col_in_group = divmod(col, 5)

    top = origin_y + row*pitch
    left = origin_x + col_in_group*pitch + group*(5*pitch + group_gap_px)
    return Image.fromarray(sprites_img_array[top:top+tile_px, left:left+tile_px])

# Ground tiles, addressed by (row, col) on the sprite sheet
tiles = {}
tiles['packet'] = get_ships_tile(11, 9)
tiles['dropzone'] = get_ships_tile(11, 8)
tiles['station'] = get_ships_tile(18, 15)
tiles['skyscraper'] = get_ships_tile(18, 12)

# Drone tiles: one drone per (sheet row, column 0..4) pair, numbered in iteration order.
# NOTE(review): row 13 is listed twice, so two groups of five drones get identical
# sprites -- possibly one entry was meant to be a different row; confirm.
drone_rows = [1, 6, 15, 13, 7, 8, 16, 0, 2, 3, 4, 5, 9, 10, 13, 17]
drone_cols = [0, 1, 2, 3, 4]
for index, (row, col) in enumerate(itertools.product(drone_rows, drone_cols)):
    label = 'drone_{}'.format(index)
    base_tile = get_ships_tile(row, col)

    tiles[label] = base_tile
    # Red variant, 10 columns to the right
    tiles[label + '_packet'] = get_ships_tile(row, col+10)
    # Yellow variant (15 columns to the right) drawn over the 'dropzone' tile
    tiles[label + '_charging'] = Image.alpha_composite(tiles['dropzone'], get_ships_tile(row, col+15))
    # Plain drone drawn over the 'dropzone' tile
    tiles[label + '_over_dropzone'] = Image.alpha_composite(tiles['dropzone'], base_tile)

# Create empty frame.
# Height is derived from the number of grid rows and width from the number of
# columns, so the frame matches the loops below even for a non-square grid.
render_padding, tiles_size = 8, 16
n_rows, n_cols = env.shape[0], env.shape[1]
frame_height = tiles_size * n_rows + render_padding * (n_rows + 1)
frame_width = tiles_size * n_cols + render_padding * (n_cols + 1)
empty_frame = np.full(shape=(frame_height, frame_width, 4), fill_value=0, dtype=np.uint8)
empty_frame[:, :, 3] = 255 # Remove background transparency

# Render frame
frame = Image.fromarray(empty_frame.copy())
for i in range(n_rows):
    for j in range(n_cols):
        # Check tile
        ground = env.ground[i, j]
        air = env.air[i, j]

        if (air is None) and (ground is None):
            continue # Nothing to draw

        # Reset for every cell so a tile chosen for a previous cell is never reused
        tile = None

        if air is None:
            if isinstance(ground, Packet):
                tile = tiles['packet']
            elif isinstance(ground, Dropzone):
                tile = tiles['dropzone']
            elif isinstance(ground, Station):
                tile = tiles['station']
            elif isinstance(ground, Skyscraper):
                tile = tiles['skyscraper']
        else:
            # If air is not None, then it's a drone
            drone = air

            if drone.packet is not None:
                tile = tiles['drone_{}_packet'.format(drone.index)]
            elif isinstance(ground, Station):
                tile = tiles['drone_{}_charging'.format(drone.index)]
            elif isinstance(ground, Dropzone):
                tile = tiles['drone_{}_over_dropzone'.format(drone.index)]
            else:
                # Empty ground, or a ground object with no dedicated overlay: plain drone
                tile = tiles['drone_{}'.format(drone.index)]

        if tile is None:
            continue # Ground object of an unknown type: nothing to draw

        # Paste tile on frame (x follows the column, y follows the row);
        # the tile doubles as its own mask so transparent pixels are not pasted
        tile_x = j*tiles_size + (j+1)*render_padding
        tile_y = i*tiles_size + (i+1)*render_padding
        frame.paste(tile, (tile_x, tile_y), mask=tile)

# Rescale frame (factor 1 keeps the original size)
def rescale(old_size):
    """Scale a pixel dimension by the display factor and truncate to int."""
    return int(old_size * 1)

# Last expression of the cell: the resized copy is what the notebook displays
frame.resize(size=(rescale(frame.size[0]), rescale(frame.size[1])), resample=Image.NEAREST)

In [None]:
# Print the environment's own render output. 'ainsi' is the same mode string the
# simulation loop passes to env.render (presumably a text mode -- confirm in env.py).
print(env.render(mode='ainsi'))

```
-- combinations --
with_packet = red (col+10)
drone_charging = overlay + yellow (col+15)
drone_charging_with_packet = nothing
drone_over_dropzone = overlay

-- Nice to have: events --
just_delivered = drone + (.., ..)
just_crashed = (.., ..) / drone + (.., ..) # debris?
```