# DQN with Ensembling

As a personal curiosity, I'd like to try to squeeze as much performance as possible from the initial DQN application.

I intend to do so using ensembling: train multiple agents with the same video and different subjects and then use **majority-voting** at test time, to see if we can get any performance improvements, even with an extremely simple approach!

In [1]:
import tianshou as ts 
from tianshou.utils import TensorboardLogger

import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

import numpy as np

import os
from datetime import datetime

In [2]:
from utils_preprocess import compute_frame_features, compute_foa_features

from env_base import BaseEnvironment
from env_base_test import BaseTestEnvironment

  from pkg_resources import resource_stream, resource_exists


## Data and environment initialisation

In [17]:
vid_filename = "012"
mat_filename = vid_filename + ".mat"

n_subjects = 10 # pick the first ten subjects (out of 39)
subjects = [n for n in range(n_subjects)]

In [18]:
patch_bounding_boxes_per_frame, patch_centres_per_frame, speaker_info_per_frame = compute_frame_features(
    vid_filename
)

foa_centres_per_frame_per_subject, patch_weights_per_frame = compute_foa_features(
    mat_filename, patch_centres_per_frame
)

foa_centres_per_frame_subjects = [
    [frame[target_subject] for frame in foa_centres_per_frame_per_subject] 
    for target_subject in subjects
    ]

In [19]:
markov_envs = []
for subject in subjects:
    markov_env = BaseEnvironment(
        1,
        patch_bounding_boxes_per_frame,
        patch_centres_per_frame,
        speaker_info_per_frame,
        foa_centres_per_frame_subjects[subject],
        patch_weights_per_frame,
        frame_width=320, # from data_utils.py
        frame_height=180,
    )
    markov_envs.append(markov_env)

In [20]:
all_train_envs = []
all_test_envs = []
for markov_env in markov_envs:
    num_train_envs = 5
    num_test_envs = 10

    train_envs = ts.env.DummyVectorEnv([lambda: markov_env for _ in range(num_train_envs)])
    all_train_envs.append(train_envs)

    test_envs = ts.env.DummyVectorEnv([lambda: markov_env for _ in range(num_test_envs)])
    all_test_envs.append(test_envs)

## DQN

First, let's construct the network.

The biggest headache comes from the observations: they're quite complex. So, we build multiple networks, each processing a part of an observation and combining their outputs in the end!

In [21]:
class Net(nn.Module):
    def __init__(self, observation_space, action_shape):
        super().__init__()

        self.num_patches = observation_space['patch_centres'].shape[0]

        # network for patch_centres
        self.patch_centres_net = nn.Sequential(
            nn.Linear(np.prod(observation_space['patch_centres'].shape), 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, 64),
            nn.ReLU(inplace=True)
        )

        # network for patch_bounding_boxes
        self.patch_bboxes_net = nn.Sequential(
            nn.Linear(np.prod(observation_space['patch_bounding_boxes'].shape), 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, 64),
            nn.ReLU(inplace=True)
        )

        # network for speaker_info
        self.speaker_info_net = nn.Sequential(
            nn.Linear(np.prod(observation_space['speaker_info'].shape), 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, 32),
            nn.ReLU(inplace=True)
        )

        # combining the outputs of all networks
        self.combined_net = nn.Sequential(
            nn.Linear(64 + 64 + 32, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, np.prod(action_shape))
        )

    def forward(self, obs, state=None, info={}):
        patch_centres = torch.tensor(obs['patch_centres'], dtype=torch.float32)
        patch_bboxes = torch.tensor(obs['patch_bounding_boxes'], dtype=torch.float32)
        speaker_info = torch.tensor(obs['speaker_info'], dtype=torch.float32)

        patch_centres = patch_centres.view(patch_centres.size(0), -1)
        patch_bboxes = patch_bboxes.view(patch_bboxes.size(0), -1)
        speaker_info = speaker_info.view(speaker_info.size(0), -1)

        # pass through respective networks
        patch_centres_out = self.patch_centres_net(patch_centres)
        patch_bboxes_out = self.patch_bboxes_net(patch_bboxes)
        speaker_info_out = self.speaker_info_net(speaker_info)

        # combine outputs
        combined = torch.cat([patch_centres_out, patch_bboxes_out, speaker_info_out], dim=1)

        logits = self.combined_net(combined)

        return logits, state

In [22]:
nets = []
optims = []
for markov_env in markov_envs:
    state_shape = markov_env.observation_space
    action_shape = markov_env.action_space.n

    net = Net(state_shape, action_shape)
    nets.append(net)

    optim = torch.optim.Adam(net.parameters(), lr=1e-3)
    optims.append(optim)

## Setting up DQN

First, we need to set up the policy, which is readily done in Tianshou.

In [23]:
policies = []
for net, optim in zip(nets, optims):
    policy = ts.policy.DQNPolicy(
        model=net, 
        optim=optim, 
        discount_factor=0.99,
        estimation_step=1,
        target_update_freq=50
    )
    policies.append(policy)

In [24]:
train_collectors = []
test_collectors = []
for policy, train_env, test_envs in zip(policies, all_train_envs, all_test_envs):
    train_collector = ts.data.Collector(policy, train_envs, ts.data.VectorReplayBuffer(1000, num_train_envs))
    train_collectors.append(train_collector)

    test_collector = ts.data.Collector(policy, test_envs)
    test_collectors.append(test_collector)

## Training

In [25]:
num_epochs = 2
num_steps_per_epoch = 1000
step_per_collect = 10
episode_per_test = 5
batch_size = 30 # one second of data (videos are at 30FPS)

timestamp = datetime.now().strftime("%d%m%Y-%H%M%S")
log_path = os.path.join("logs", "dqn", "base", f"video_{vid_filename}", f"ensemble_{n_subjects}", timestamp)
writer = SummaryWriter(log_path)
logger = TensorboardLogger(writer)

In [26]:
results = []
for subject, (policy, train_collector, test_collector) in enumerate(zip(policies, train_collectors, test_collectors)):
    print(f"\n[+] Training agent on subject {subject}...\n")
    result = ts.trainer.offpolicy_trainer(
        policy, 
        train_collector, 
        test_collector,
        max_epoch=num_epochs,
        step_per_epoch=num_steps_per_epoch,
        step_per_collect=step_per_collect,
        episode_per_test=episode_per_test,
        batch_size=batch_size,
        logger=logger,
    )
    results.append(result)


[+] Training agent on subject 0...



Epoch #1: 1001it [00:01, 504.21it/s, env_step=1000, len=79, loss=0.189, n/ep=0, n/st=10, rew=15.57]                          


Epoch #1: test_reward: 66.027011 ± 35.927479, best_reward: 66.027011 ± 36.117924 in #0


Epoch #2: 1001it [00:01, 507.31it/s, env_step=2000, len=79, loss=0.263, n/ep=0, n/st=10, rew=15.65]                          


Epoch #2: test_reward: 66.027011 ± 35.078453, best_reward: 66.027011 ± 36.117924 in #0

[+] Training agent on subject 1...



Epoch #1: 1001it [00:01, 509.47it/s, env_step=1000, len=79, loss=0.179, n/ep=0, n/st=10, rew=7.31]                          


Epoch #1: test_reward: 67.202328 ± 34.142491, best_reward: 80.680116 ± 46.689854 in #0


Epoch #2: 1001it [00:01, 511.11it/s, env_step=2000, len=79, loss=0.308, n/ep=0, n/st=10, rew=13.44]                          


Epoch #2: test_reward: 91.112050 ± 46.631109, best_reward: 91.112050 ± 46.631109 in #2

[+] Training agent on subject 2...



Epoch #1: 1001it [00:02, 482.47it/s, env_step=1000, len=79, loss=2.125, n/ep=0, n/st=10, rew=109.81]                          


Epoch #1: test_reward: 264.982980 ± 135.818823, best_reward: 264.982980 ± 135.818823 in #1


Epoch #2: 1001it [00:02, 470.37it/s, env_step=2000, len=79, loss=1.869, n/ep=0, n/st=10, rew=107.02]                          


Epoch #2: test_reward: 264.982980 ± 133.215542, best_reward: 264.982980 ± 135.818823 in #1

[+] Training agent on subject 3...



Epoch #1: 1001it [00:02, 470.88it/s, env_step=1000, len=79, loss=1.735, n/ep=0, n/st=10, rew=110.59]                          


Epoch #1: test_reward: 166.800943 ± 84.879772, best_reward: 166.800943 ± 84.879772 in #1


Epoch #2: 1001it [00:02, 443.33it/s, env_step=2000, len=79, loss=2.969, n/ep=0, n/st=10, rew=84.28]                          


Epoch #2: test_reward: 166.800943 ± 91.898487, best_reward: 166.800943 ± 84.879772 in #1

[+] Training agent on subject 4...



Epoch #1: 1001it [00:02, 475.94it/s, env_step=1000, len=79, loss=0.302, n/ep=0, n/st=10, rew=13.83]                          


Epoch #1: test_reward: 92.854483 ± 49.966889, best_reward: 92.854483 ± 46.689806 in #0


Epoch #2: 1001it [00:02, 479.65it/s, env_step=2000, len=79, loss=0.422, n/ep=0, n/st=10, rew=3.61]                          


Epoch #2: test_reward: 92.854483 ± 48.302384, best_reward: 92.854483 ± 46.689806 in #0

[+] Training agent on subject 5...



Epoch #1: 1001it [00:02, 459.23it/s, env_step=1000, len=79, loss=0.185, n/ep=0, n/st=10, rew=7.78]                          


Epoch #1: test_reward: 54.517526 ± 30.803902, best_reward: 54.517526 ± 30.803902 in #1


Epoch #2: 1001it [00:02, 472.45it/s, env_step=2000, len=79, loss=0.257, n/ep=0, n/st=10, rew=4.40]                          


Epoch #2: test_reward: 53.448926 ± 28.471738, best_reward: 54.517526 ± 30.803902 in #1

[+] Training agent on subject 6...



Epoch #1: 1001it [00:02, 468.86it/s, env_step=1000, len=79, loss=0.194, n/ep=0, n/st=10, rew=7.51]                          


Epoch #1: test_reward: 51.825853 ± 23.482397, best_reward: 51.825853 ± 23.482397 in #1


Epoch #2: 1001it [00:02, 464.77it/s, env_step=2000, len=79, loss=0.308, n/ep=0, n/st=10, rew=8.34]                          


Epoch #2: test_reward: 59.927268 ± 33.369349, best_reward: 59.927268 ± 33.369349 in #2

[+] Training agent on subject 7...



Epoch #1: 1001it [00:02, 468.19it/s, env_step=1000, len=79, loss=1.482, n/ep=0, n/st=10, rew=111.06]                          


Epoch #1: test_reward: 219.137024 ± 111.328033, best_reward: 219.137024 ± 115.344070 in #0


Epoch #2: 1001it [00:02, 474.81it/s, env_step=2000, len=79, loss=4.837, n/ep=0, n/st=10, rew=107.48]                          


Epoch #2: test_reward: 219.137024 ± 107.479633, best_reward: 219.137024 ± 115.344070 in #0

[+] Training agent on subject 8...



Epoch #1: 1001it [00:02, 461.69it/s, env_step=1000, len=79, loss=2.215, n/ep=0, n/st=10, rew=110.48]                          


Epoch #1: test_reward: 286.436860 ± 147.444238, best_reward: 286.436860 ± 147.444238 in #1


Epoch #2: 1001it [00:02, 465.69it/s, env_step=2000, len=79, loss=6.717, n/ep=0, n/st=10, rew=108.06]                          


Epoch #2: test_reward: 24.787851 ± 11.405171, best_reward: 286.436860 ± 147.444238 in #1

[+] Training agent on subject 9...



Epoch #1: 1001it [00:02, 467.00it/s, env_step=1000, len=79, loss=1.405, n/ep=0, n/st=10, rew=112.19]                          


Epoch #1: test_reward: 313.585106 ± 156.962648, best_reward: 313.585106 ± 156.962648 in #1


Epoch #2: 1001it [00:02, 468.95it/s, env_step=2000, len=79, loss=3.238, n/ep=0, n/st=10, rew=95.56]                           


Epoch #2: test_reward: 313.585106 ± 160.558441, best_reward: 313.585106 ± 156.962648 in #1


In [27]:
results

[{'duration': '4.36s',
  'train_time/model': '3.71s',
  'test_step': 3561,
  'test_episode': 15,
  'test_time': '0.39s',
  'test_speed': '9109.82 step/s',
  'best_reward': 66.02701078146637,
  'best_result': '66.03 ± 36.12',
  'train_step': 2000,
  'train_episode': 5,
  'train_time/collector': '0.26s',
  'train_speed': '504.26 step/s'},
 {'duration': '4.29s',
  'train_time/model': '3.68s',
  'test_step': 3561,
  'test_episode': 15,
  'test_time': '0.36s',
  'test_speed': '9952.51 step/s',
  'best_reward': 91.11205016798922,
  'best_result': '91.11 ± 46.63',
  'train_step': 2000,
  'train_episode': 5,
  'train_time/collector': '0.25s',
  'train_speed': '509.19 step/s'},
 {'duration': '4.59s',
  'train_time/model': '3.94s',
  'test_step': 3561,
  'test_episode': 15,
  'test_time': '0.38s',
  'test_speed': '9316.81 step/s',
  'best_reward': 264.98298045633055,
  'best_result': '264.98 ± 135.82',
  'train_step': 2000,
  'train_episode': 5,
  'train_time/collector': '0.26s',
  'train_speed'

## Testing

In [28]:
test_markov_envs = []
for subject in subjects:
    markov_env = BaseTestEnvironment(
        1,
        patch_bounding_boxes_per_frame,
        patch_centres_per_frame,
        speaker_info_per_frame,
        foa_centres_per_frame_subjects[subject],
        patch_weights_per_frame,
        frame_width=320, # from data_utils.py
        frame_height=180,
    )
    test_markov_envs.append(markov_env)

In [29]:
testing_envs = []
for test_markov_env in test_markov_envs:
    num_test_envs = 1 # need set it to 1 (else it doesn't get to the end)

    test_envs = ts.env.DummyVectorEnv([lambda: test_markov_env for _ in range(num_test_envs)])
    testing_envs.append(test_envs)

In [30]:
#! notice that the policy is unchanged
for policy, test_envs in zip(policies, testing_envs):
    policy.eval()
    policy.set_eps(0.05)

    collector = ts.data.Collector(policy, test_envs)
    # should be the same values 3 times (if not, there's a problem)
    print(collector.collect(n_episode=3))

{'n/ep': 10, 'n/st': 1192, 'rews': array([12., 10., 10.,  8.,  8.,  7., 33., 32., 72., 75.]), 'lens': array([ 40,  40,  40,  40,  40,  40, 139, 139, 337, 337]), 'idxs': array([4, 5, 6, 7, 8, 9, 2, 3, 0, 1]), 'rew': 26.7, 'len': 119.2, 'rew_std': 25.127872970070506, 'len_std': 115.45284751793695}
{'n/ep': 10, 'n/st': 1192, 'rews': array([ 12.,  12.,  12.,  12.,  12.,  12.,  39.,  41., 102.,  97.]), 'lens': array([ 40,  40,  40,  40,  40,  40, 139, 139, 337, 337]), 'idxs': array([4, 5, 6, 7, 8, 9, 2, 3, 0, 1]), 'rew': 35.1, 'len': 119.2, 'rew_std': 33.99838231445726, 'len_std': 115.45284751793695}
{'n/ep': 10, 'n/st': 1192, 'rews': array([ 28.,  28.,  28.,  28.,  27.,  28.,  96.,  99., 241., 243.]), 'lens': array([ 40,  40,  40,  40,  40,  40, 139, 139, 337, 337]), 'idxs': array([4, 5, 6, 7, 8, 9, 2, 3, 0, 1]), 'rew': 84.6, 'len': 119.2, 'rew_std': 83.20120191439544, 'len_std': 115.45284751793695}
{'n/ep': 10, 'n/st': 1192, 'rews': array([ 19.,  20.,  19.,  17.,  16.,  15.,  63.,  61., 1