In [4]:
%load_ext autoreload
%autoreload 2

In [42]:
import d3rlpy
from d3rlpy.dataset import MDPDataset
from d3rlpy.metrics.scorer import average_value_estimation_scorer, td_error_scorer, evaluate_on_environment
from d3rlpy.models.q_functions import QRQFunctionFactory
from d3rlpy.models.encoders import VectorEncoderFactory
from sklearn.model_selection import train_test_split

#libraries for PSO optimization - to discuss with tutor
import pyswarms as ps
from pyswarms.utils.functions import single_obj as fx

import random

import seaborn as sns
import pandas as pd
import itertools
import numpy as np
import warnings

warnings.filterwarnings("ignore")

import os

os.chdir("..")

In [6]:
USE_GPU = False
EPOCHS = 3

#TODO - try with Mujoco environments. Currently Mujoco not working on my mac.
dataset, env = d3rlpy.datasets.get_cartpole()

#we setup a custom encoder. By tuning the parameters we could search for a better model (maybe implement random search)
encoder = VectorEncoderFactory(hidden_units=[300, 400], use_batch_norm = True, activation='tanh', dropout_rate=0.3, use_dense=True)

agent = d3rlpy.algos.DiscreteCQL(q_func_factory='qr', encoder_factory=encoder, use_gpu = USE_GPU)

train_episodes, test_episodes = train_test_split(dataset)

#training returns list of result tuples (epoch, metrics) per epoch.
data = agent.fit(
    dataset = train_episodes,
    eval_episodes = test_episodes,
    n_epochs = EPOCHS,
    scorers = {
        'td_error': td_error_scorer,   #Returns average TD error (how Q function overfits to training set, the smaller the better).
        'value_scale': average_value_estimation_scorer, # If too large, the Q functions overestimate action-values
        'environment': evaluate_on_environment(env)
    }
)

Downloading cartpole.pkl into d3rlpy_data/cartpole_replay_v1.1.0.h5...
2022-05-30 15:19.53 [debug    ] RoundIterator is selected.
2022-05-30 15:19.53 [info     ] Directory is created at d3rlpy_logs/DiscreteCQL_20220530151953
2022-05-30 15:19.53 [debug    ] Building models...
2022-05-30 15:19.53 [debug    ] Models have been built.
2022-05-30 15:19.53 [info     ] Parameters are saved to d3rlpy_logs/DiscreteCQL_20220530151953/params.json params={'action_scaler': None, 'alpha': 1.0, 'batch_size': 32, 'encoder_factory': {'type': 'vector', 'params': {'hidden_units': [300, 400], 'activation': 'tanh', 'use_batch_norm': True, 'dropout_rate': 0.3, 'use_dense': True}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'qr', 'params': {'share_encoder': False, 'n_quantiles': 32}}, 'real_ratio': 1

Epoch 1/3:   0%|          | 0/2335 [00:00<?, ?it/s]

2022-05-30 15:20.17 [info     ] DiscreteCQL_20220530151953: epoch=1 step=2335 epoch=1 metrics={'time_sample_batch': 0.00011720708401861743, 'time_algorithm_update': 0.00864649993193992, 'loss': 7.219594479832455, 'time_step': 0.008873230180597408, 'td_error': 1.0362029242990671, 'value_scale': 0.87362166705126, 'environment': 200.0} step=2335
2022-05-30 15:20.17 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20220530151953/model_2335.pt


Epoch 2/3:   0%|          | 0/2335 [00:00<?, ?it/s]

2022-05-30 15:20.41 [info     ] DiscreteCQL_20220530151953: epoch=2 step=4670 epoch=2 metrics={'time_sample_batch': 0.00010366470451273296, 'time_algorithm_update': 0.008969639608640487, 'loss': 3.793903584041228, 'time_step': 0.009160214675315187, 'td_error': 1.0150854121632007, 'value_scale': 0.9647937832399531, 'environment': 200.0} step=4670
2022-05-30 15:20.41 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20220530151953/model_4670.pt


Epoch 3/3:   0%|          | 0/2335 [00:00<?, ?it/s]

2022-05-30 15:21.10 [info     ] DiscreteCQL_20220530151953: epoch=3 step=7005 epoch=3 metrics={'time_sample_batch': 0.00010660343047638266, 'time_algorithm_update': 0.009484074610969494, 'loss': 3.47637835130957, 'time_step': 0.009681531142371625, 'td_error': 1.0178133726227743, 'value_scale': 0.9812350331680056, 'environment': 200.0} step=7005
2022-05-30 15:21.10 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20220530151953/model_7005.pt


In [43]:
SIZE_AUG = 100
ITER = 100
eps = 0.1

"""
Tentative of adversarial state training implementation
Not being able at the moment to access the gradient of the value function, I considered a simplified version of the problem.
We perform a random search over the value function space, and we keep the worst observation to use for the new dataset.

We collect a sample of observations (size: SIZE_AUG), and we compute the state value, by using Montecarlo integration over the action space. 
    value = 1/N * [Q(state, action_1) + Q(state, action_2) + ... + Q(state, action_3)]
We iterate over all the observation: we perturb the observation state by adding gaussian noise and we compute the value function, as described above.
Finally we compare the obtained value with the worst value (initially set to the optimal state value)
We do this for ITER number of iterations.

"""

#fetch #SIZE_AUG observations (np array) from dataset
observations = dataset.observations

#find optimal Q-value for each observation (action is sampled from policy)
#in the continuous case we could sample N actions, and average (kind of Montecarlo approx over the actions)
values = 1/2 * (agent.predict_value(observations[:SIZE_AUG,:], np.zeros(SIZE_AUG) + agent.predict_value(observations[:SIZE_AUG,:], np.ones(SIZE_AUG))))

for  i, obs in enumerate(observations[:SIZE_AUG,:]):
    print(i)
    worst_value = values[i]
    for x in range(ITER):
        #sample a state in the epsilon ball, and an action.
        perturbed_state = (obs + np.random.normal(loc = 0, scale = eps, size=obs.shape)).reshape(1,-1)
        
        #compute the state value for the 
        perturbed_value = 1/2 * (agent.predict_value(perturbed_state, np.array([0])) + agent.predict_value(perturbed_state, np.array([1])))
        if perturbed_value < worst_value:
            worst_value = perturbed_value
            worst_obs = perturbed_state
    observations[i] = worst_obs


dataset_aug = MDPDataset(
    observations, 
    dataset.actions, 
    dataset.rewards, 
    dataset.terminals
)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [44]:
encoder_aug = VectorEncoderFactory(hidden_units=[300, 400], use_batch_norm = True, activation='tanh', dropout_rate=0.3, use_dense=True)

agent_aug = d3rlpy.algos.DiscreteCQL(q_func_factory='qr', encoder_factory=encoder_aug, use_gpu = USE_GPU)

train_episodes, test_episodes = train_test_split(dataset_aug)

#training returns list of result tuples (epoch, metrics) per epoch.
data = agent.fit(
    dataset = train_episodes,
    eval_episodes = test_episodes,
    n_epochs = EPOCHS,
    scorers = {
        'td_error': td_error_scorer,   #Returns average TD error (how Q function overfits to training set, the smaller the better).
        'value_scale': average_value_estimation_scorer, # If too large, the Q functions overestimate action-values
        'environment': evaluate_on_environment(env)
    }
)

2022-05-30 16:30.12 [debug    ] RoundIterator is selected.
2022-05-30 16:30.12 [info     ] Directory is created at d3rlpy_logs/DiscreteCQL_20220530163012
2022-05-30 16:30.12 [info     ] Parameters are saved to d3rlpy_logs/DiscreteCQL_20220530163012/params.json params={'action_scaler': None, 'alpha': 1.0, 'batch_size': 32, 'encoder_factory': {'type': 'vector', 'params': {'hidden_units': [300, 400], 'activation': 'tanh', 'use_batch_norm': True, 'dropout_rate': 0.3, 'use_dense': True}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'qr', 'params': {'share_encoder': False, 'n_quantiles': 32}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DiscreteCQL', 'observation_shape': (4,), 'action_size': 2}


Epoch 1/3:   0%|          | 0/2469 [00:00<?, ?it/s]

2022-05-30 16:30.37 [info     ] DiscreteCQL_20220530163012: epoch=1 step=2469 epoch=1 metrics={'time_sample_batch': 9.79771928876071e-05, 'time_algorithm_update': 0.008146137494025032, 'loss': 3.1916634771880856, 'time_step': 0.008330075931046935, 'td_error': 1.006930780789414, 'value_scale': 1.9722501374804455, 'environment': 200.0} step=2469
2022-05-30 16:30.37 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20220530163012/model_2469.pt


Epoch 2/3:   0%|          | 0/2469 [00:00<?, ?it/s]

2022-05-30 16:31.02 [info     ] DiscreteCQL_20220530163012: epoch=2 step=4938 epoch=2 metrics={'time_sample_batch': 0.00010064363769308764, 'time_algorithm_update': 0.00872278667355704, 'loss': 2.8635760621533985, 'time_step': 0.008908869815577664, 'td_error': 0.9993534448050594, 'value_scale': 1.954566751837541, 'environment': 200.0} step=4938
2022-05-30 16:31.02 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20220530163012/model_4938.pt


Epoch 3/3:   0%|          | 0/2469 [00:00<?, ?it/s]

2022-05-30 16:31.26 [info     ] DiscreteCQL_20220530163012: epoch=3 step=7407 epoch=3 metrics={'time_sample_batch': 9.559435900428991e-05, 'time_algorithm_update': 0.008054034146957429, 'loss': 2.7956504975315433, 'time_step': 0.00823249349520727, 'td_error': 0.9943116620080363, 'value_scale': 1.9583272648381285, 'environment': 200.0} step=7407
2022-05-30 16:31.26 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20220530163012/model_7407.pt
