In [1]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)


In [37]:
class CommNet:
    
    def __init__(self, sess, N, J, embedding_size = 128, lr = 1e-3, training_mode = 'supervised', alpha = 0.03):
        
        self.N = N
        self.J = J
        self.embedding_size = embedding_size
        
        self.build_controler()
        
        self.training_mode = training_mode
        
        if training_mode == 'supervised':
            self.build_supervised()
            with tf.variable_scope('Supervised_optimizer'):
                self.train_op = tf.train.AdamOptimizer(lr).minimize(self.supervised_loss)
                
        elif training_mode == 'reinforce':
            self.alpha = 0.03
            self.build_reinforce()
            with tf.variable_scope('Reinforce_optimizer'):
                self.train_op =  tf.train.RMSPropOptimizer(lr).minimize(self.reinforce_loss)
            
        else:
            raise(ValueError("Unknown training mode: %s" % training_mode))
        
        print("All variables")
        for var in tf.global_variables():
            print(var)
            
        
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        
    def encode(self, inputs):
        
        with tf.variable_scope('Encoder'):
        
            identity_embeddings = tf.get_variable("identity_embeddings",
                                             [self.N, self.embedding_size])
            
            self.embedded_identities = tf.nn.embedding_lookup(identity_embeddings, inputs)
        
            
        return tf.unstack(self.embedded_identities, axis = 1)
    
    def build_f(self, name, h, c, h0 = None):
        
        with tf.variable_scope(name, reuse = tf.AUTO_REUSE):
            
            if h0 is not None:
                
                b1 = tf.get_variable('b1', shape = (1, self.embedding_size))
                W1 = tf.get_variable('W1', shape = (3 * self.embedding_size,
                                                  self.embedding_size))
                
                W2 = tf.get_variable('W2', shape = (self.embedding_size,
                                                  self.embedding_size))
                
                concat = tf.concat([h, c, h0], axis = 1)
            
            else:
                b1 = tf.get_variable('b1', shape = (1, self.embedding_size))
                
                W1 = tf.get_variable('W1', shape = (2 * self.embedding_size,
                                                  self.embedding_size))
                
                W2 = tf.get_variable('W2', shape = (self.embedding_size,
                                                  self.embedding_size))
                
                concat = tf.concat([h, c], axis = 1)
            
            b2 = tf.get_variable('b2', shape = (1, self.embedding_size))
            
            dense1 =tf.nn.relu(tf.einsum("ij,jk->ik", concat, W1) + b1)
            dense2 = tf.nn.relu(tf.einsum("ij,jk->ik", dense1, W2) + b2)
            
            return dense2
        
    def decode(self, h):
        
        with tf.variable_scope('Decoder', reuse = tf.AUTO_REUSE):
            
            W = tf.get_variable('W', shape = (self.embedding_size,
                                                  self.J))
            
            b = tf.get_variable('b', shape = (1, self.J))
            
            policy_logit = tf.einsum("ij,jk->ik", h, W) + b
        
            return policy_logit
    
    
    def communicate(self, h_seq):
        
        return tf.add_n(h_seq) / (self.J - 1)
    
    def sample_actions(self, policy_logit):
        
        action = tf.multinomial(policy_logit, num_samples = 1)
        
        return action
    
        
    def build_controler(self):
        
        self.inputs = tf.placeholder(tf.int32, shape = (None, self.J))
        
        h0_seq = self.encode(self.inputs)
        c0_seq = [self.communicate([h0_seq[j] for j in range(self.J) if j != i]) for i in range(self.J)]
        
        h1_seq = [self.build_f("Comm_step_1", h0_seq[j], c0_seq[j], None) for j in range(self.J)]
        c1_seq = [self.communicate([h1_seq[j] for j in range(self.J) if j != i]) for i in range(self.J)]
        
        h2_seq = [self.build_f("Comm_step_2", h1_seq[j], c1_seq[j], h0_seq[j]) for j in range(self.J)]
        
        self.layers = {'h0_seq': h0_seq, 'c0_seq': c0_seq, 'h1_seq': h1_seq, 'c1_seq':c1_seq, 'h2_seq': h2_seq}
        
        
        self.policy_logit_seq = [self.decode(h2) for h2 in h2_seq]
        
        self.proba_seq = [tf.nn.softmax(policy_logit, axis = 1) for policy_logit in self.policy_logit_seq]
        
        self.action_seq = [self.sample_actions(policy_logit) for policy_logit in self.policy_logit_seq]
        
        self.one_hot_action_seq = [tf.one_hot(action, depth = self.J) for action in self.action_seq]
        
        
        
    def build_supervised(self):
        
        assert self.training_mode == 'supervised', 'Wrong training mode'
        
        self.targets = tf.placeholder(tf.int32, shape = (None, self.J))
        unstacked_targets = tf.unstack(self.targets, axis = 1)
        
        supervised_loss_seq = [tf.nn.sparse_softmax_cross_entropy_with_logits(labels=unstacked_targets[j],
                                                                                   logits=self.policy_logit_seq[j])
                                    for j in range(self.J)]
        
        self.supervised_loss = tf.reduce_mean(supervised_loss_seq)
        
        
    def supervised_train(self, X, y, val_X, val_y, env, batch_size = 32, epochs = 1):
        
        assert self.training_mode == 'supervised', 'Wrong training mode'
        
        n = X.shape[0]
        
        val_n = val_X.shape[0]
        
        data_inds = np.array(range(n))
        for ep in range(1, epochs + 1):
            np.random.shuffle(data_inds)
            supervised_loss_sum = 0
            reward_sum = 0
            for i in tqdm(range(0, n, batch_size), "Epoch: %d" % ep):
                inds_batch = data_inds[i:i+batch_size]
                X_batch = X[inds_batch]
                y_batch = y[inds_batch]
                _, supervised_loss, one_hot_action_seq = sess.run([self.train_op, self.supervised_loss, self.one_hot_action_seq], feed_dict={self.inputs: X_batch, self.targets: y_batch})
                supervised_loss_sum += supervised_loss * batch_size
                reward_sum += env.get_reward(one_hot_action_seq)
            
            print("loss = %f" % (supervised_loss_sum / n))
            print("reward = %f" % (reward_sum / n))
            print()
            
            val_supervised_loss, val_one_hot_action_seq = sess.run([self.supervised_loss, self.one_hot_action_seq], feed_dict={self.inputs: val_X, self.targets: val_y})
            print('val loss = %f' % (val_supervised_loss))
            print('val reward = %f' % (env.get_reward(val_one_hot_action_seq) / val_n))
            

    def build_reinforce(self):
        
        assert self.training_mode == 'reinforce', 'Wrong training mode'
        
        
        self.state_q_val_seq = tf.placeholder(tf.float32, shape = (None, self.J, self.J))
        
        self.reward_sum_values = tf.placeholder(tf.float32, shape = (None,))
        self.advantage_values = tf.placeholder(tf.float32, shape = (None,))
  
        
        self.action_taken = tf.placeholder(tf.int32, shape = (None, self.J))
        unstacked_action_taken = tf.unstack(self.action_taken, axis = 1)
        one_hot_action_taken_seq = [tf.one_hot(action, depth = self.J, dtype = tf.int32) for action in unstacked_action_taken]
        
        self.neg_log_p_seq = [tf.nn.sparse_softmax_cross_entropy_with_logits(labels=unstacked_action_taken[j],
                                                    logits=self.policy_logit_seq[j]) for j in range(self.J)]
        
        neg_log_p_sums = tf.reduce_sum(self.neg_log_p_seq, axis = 0)
        
        
        stacked_proba_seq = tf.stack(self.proba_seq, axis = 1)
        
        baseline = tf.einsum('ijk, ijk-> ij', self.state_q_val_seq, stacked_proba_seq)
        baseline = tf.reduce_mean(baseline, axis = 1)
        
        #surrogate loss (- dtheta)
        advantage = self.reward_sum_values - baseline
    
        self.reinforce_loss =  tf.multiply(neg_log_p_sums, self.advantage_values)
        self.reinforce_loss += self.alpha * tf.square(advantage)
        self.reinforce_loss = tf.reduce_sum(self.reinforce_loss, axis = 0)
        
        
    def take_action(self, state):
        
        assert self.training_mode == 'reinforce', 'Wrong training mode'
        
        action_seq, proba_seq = self.sess.run([self.action_seq, self.proba_seq], {self.inputs: [state]})
        
        return [a[0,0] for a in action_seq], np.array(proba_seq)
    
    def reinforce_train(self, env, n_episodes, T):
        
        assert self.training_mode == 'reinforce', 'Wrong training mode'
        
        
        history = {'reward' : [],  'loss': []}
        
        q_reward_sum = np.zeros((self.N, self.J))
        q_state_action_count = np.zeros((self.N, self.J))
        q_val = np.zeros((self.N, self.J))
    
        
        for _ in tqdm(range(n_episodes), "REINFORCE"):
            
            # todo: change code to avoid this seq_seq name (sequence of sequence)
            state_seq, action_seq, reward_seq, proba_seq = policy_rollout(T, env, commNet)
            episode_len = reward_seq.shape[0]
            
            history['reward'].append(np.mean(reward_seq))
        
            state_q_val_seq = np.array([q_val[state] for state in state_seq])
            
            baseline = np.einsum('ijk, ijk-> ij', state_q_val_seq, proba_seq)
            
            baseline = baseline.mean(axis = 1)
            
            reward_sum_values = np.array([reward_seq[t:].sum() for t in range(episode_len)])
            advantage_values = np.array([reward_sum_values[t] - baseline[t] for t in range(episode_len)])          
            
            
            feed_dict = {}
            feed_dict[self.inputs] = state_seq
            feed_dict[self.state_q_val_seq] = state_q_val_seq
            feed_dict[self.reward_sum_values] = reward_sum_values
            feed_dict[self.advantage_values] = advantage_values
            feed_dict[self.action_taken] = action_seq
            
            _, loss = self.sess.run([self.train_op, self.reinforce_loss], feed_dict = feed_dict)
            
            history['loss'].append(loss)  
            
            # udpate_q_val
            for i in range(episode_len):
                
                state = state_seq[i]
                action = action_seq[i]
                
                cummul_reward = reward_seq[i:].sum()
                q_state_action_count[state, action] += 1
                q_reward_sum[state, action] += cummul_reward
            
            q_val = q_reward_sum/np.maximum(1,q_state_action_count)
            
            
        return history, q_val
            
            
            
            
            
            
            
        

In [38]:
class LeverEnv:
    
    def __init__(self, N, J):
        
        self.J = J
        self.N = N
        
    def reset(self):
        
        state = np.random.choice(self.N, size = self.J, replace = False)
        
        terminal_state = False
        
        return state, terminal_state
    
    def get_reward(self, one_hot_action_seq):        
        
        reward = np.sum(np.sum(one_hot_action_seq, axis = 0) > 0) /self.J
        
        return reward
        
    def step(self, state, action):
        
        next_state = np.random.choice(self.N, size = self.J, replace = False)
        
        one_hot_action_seq = np.zeros((self.J, self.J))
        one_hot_action_seq[range(self.J), action] = 1
        reward = self.get_reward(one_hot_action_seq)
        
        terminal_state = False
        
        return next_state, reward, terminal_state
        

In [59]:
# data generation for supervised learning
def generate_data(n, N, J):
    
    X = np.empty((n, J), dtype = int)
    y= np.empty((n,J), dtype = int)
    
    for i in range(n):
        
        X[i] = np.random.choice(N, size = J, replace = False)
        y[i] = np.argsort(X[i])
        
    return X, y

In [60]:
# episode generation for reinforcement learning
def policy_rollout(T, env, agent):
    
    state_seq = []
    action_seq = []
    reward_seq = []
    proba_seq = []
    
    
    state, terminal_state = env.reset()
    
    t = 0
    
    while not terminal_state and t < T:
        t +=1
        
        state_seq.append(state)
        action, proba = agent.take_action(state)
        
        state, reward, terminal_state = env.step(state, action)
        
        
        action_seq.append(action)
        reward_seq.append(reward)
        proba_seq.append(proba)
        
    return np.array(state_seq), np.array(action_seq), np.array(reward_seq), np.squeeze(np.array(proba_seq))

In [61]:
N = 500
J = 3
batch_size = 32
n = batch_size * 1000

In [62]:
X, y = generate_data(n, N, J)
val_X, val_y = generate_data(500, N, J)

In [63]:
tf.reset_default_graph()
with tf.Session() as sess:
    commNet = CommNet(sess, N, J, lr = 1e-3, embedding_size= 128, training_mode = 'supervised', alpha = 0.1)
    env = LeverEnv(N, J)
    commNet.supervised_train(X, y, val_X, val_y, env, batch_size = batch_size, epochs = 30)
    
    #history, q_val = commNet.reinforce_train(env, n_episodes = 5000, T = 7)
    
    rv = sess.run([commNet.embedded_identities, commNet.layers, commNet.policy_logit_seq, commNet.supervised_loss, commNet.action_seq, commNet.one_hot_action_seq], feed_dict={commNet.inputs: X_val[0:1], commNet.targets: y_val[0:1]})


All variables
<tf.Variable 'Encoder/identity_embeddings:0' shape=(500, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_1/b1:0' shape=(1, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_1/W1:0' shape=(256, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_1/W2:0' shape=(128, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_1/b2:0' shape=(1, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_2/b1:0' shape=(1, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_2/W1:0' shape=(384, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_2/W2:0' shape=(128, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_2/b2:0' shape=(1, 128) dtype=float32_ref>
<tf.Variable 'Decoder/W:0' shape=(128, 3) dtype=float32_ref>
<tf.Variable 'Decoder/b:0' shape=(1, 3) dtype=float32_ref>
<tf.Variable 'Supervised_optimizer/beta1_power:0' shape=() dtype=float32_ref>
<tf.Variable 'Supervised_optimizer/beta2_power:0' shape=() dtype=float32_ref>
<tf.Variable 'Supervised_optimizer/Encoder/identity_embeddings/Adam:0' shape=(500, 128) dtype

Epoch: 1: 100%|███████████████████████████████████████████████████████████████████| 1000/1000 [00:08<00:00, 118.48it/s]


loss = 0.958269
reward = 0.740688

val loss = 0.951322
val reward = 0.750667


Epoch: 2: 100%|███████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 128.28it/s]


loss = 0.912195
reward = 0.753563

val loss = 0.950508
val reward = 0.764667


Epoch: 3: 100%|███████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 132.23it/s]


loss = 0.898543
reward = 0.757719

val loss = 0.940379
val reward = 0.753333


Epoch: 4: 100%|███████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 131.61it/s]


loss = 0.886771
reward = 0.762479

val loss = 0.938874
val reward = 0.749333


Epoch: 5: 100%|███████████████████████████████████████████████████████████████████| 1000/1000 [00:08<00:00, 124.28it/s]


loss = 0.874831
reward = 0.764229

val loss = 0.955544
val reward = 0.771333


Epoch: 6: 100%|███████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 126.54it/s]


loss = 0.857208
reward = 0.769406

val loss = 0.965873
val reward = 0.755333


Epoch: 7: 100%|███████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 130.69it/s]


loss = 0.836705
reward = 0.775125

val loss = 0.973045
val reward = 0.800000


Epoch: 8: 100%|███████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 106.70it/s]


loss = 0.810875
reward = 0.779698

val loss = 0.991745
val reward = 0.783333


Epoch: 9: 100%|███████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 125.64it/s]


loss = 0.777414
reward = 0.785937

val loss = 1.034859
val reward = 0.772667


Epoch: 10: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 128.36it/s]


loss = 0.739056
reward = 0.797583

val loss = 1.062836
val reward = 0.784667


Epoch: 11: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 132.42it/s]


loss = 0.694408
reward = 0.805375

val loss = 1.124752
val reward = 0.791333


Epoch: 12: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 131.90it/s]


loss = 0.644262
reward = 0.817562

val loss = 1.237067
val reward = 0.822667


Epoch: 13: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 132.81it/s]


loss = 0.587624
reward = 0.828833

val loss = 1.340490
val reward = 0.820667


Epoch: 14: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 132.44it/s]


loss = 0.535959
reward = 0.841521

val loss = 1.451961
val reward = 0.818000


Epoch: 15: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 131.64it/s]


loss = 0.476522
reward = 0.853667

val loss = 1.519025
val reward = 0.820667


Epoch: 16: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 132.23it/s]


loss = 0.427613
reward = 0.866781

val loss = 1.635605
val reward = 0.839333


Epoch: 17: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 134.67it/s]


loss = 0.377175
reward = 0.877719

val loss = 1.851481
val reward = 0.857333


Epoch: 18: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 129.89it/s]


loss = 0.334701
reward = 0.889979

val loss = 2.052011
val reward = 0.858000


Epoch: 19: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 131.82it/s]


loss = 0.298881
reward = 0.899573

val loss = 2.231171
val reward = 0.882000


Epoch: 20: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 132.10it/s]


loss = 0.262009
reward = 0.909385

val loss = 2.388979
val reward = 0.879333


Epoch: 21: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 134.59it/s]


loss = 0.237004
reward = 0.917635

val loss = 2.419396
val reward = 0.865333


Epoch: 22: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 131.21it/s]


loss = 0.217600
reward = 0.923271

val loss = 2.615762
val reward = 0.888000


Epoch: 23: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 128.59it/s]


loss = 0.196278
reward = 0.930198

val loss = 2.704153
val reward = 0.894000


Epoch: 24: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 127.26it/s]


loss = 0.178634
reward = 0.936260

val loss = 2.772824
val reward = 0.895333


Epoch: 25: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 126.24it/s]


loss = 0.163033
reward = 0.940198

val loss = 2.858088
val reward = 0.901333


Epoch: 26: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 126.56it/s]


loss = 0.151979
reward = 0.944604

val loss = 2.993100
val reward = 0.900000


Epoch: 27: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 122.32it/s]


loss = 0.142891
reward = 0.948979

val loss = 2.958428
val reward = 0.888000


Epoch: 28: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 128.15it/s]


loss = 0.131046
reward = 0.951573

val loss = 3.236578
val reward = 0.905333


Epoch: 29: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:08<00:00, 122.38it/s]


loss = 0.128227
reward = 0.953313

val loss = 3.284424
val reward = 0.894667


Epoch: 30: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 125.94it/s]


loss = 0.116195
reward = 0.956073

val loss = 3.403419
val reward = 0.898000


In [64]:
rv[0]

array([[[-5.10748150e-03, -3.20997648e-02, -1.04814671e-01,
         -2.27938145e-01, -5.02550155e-02, -4.41683643e-02,
         -4.76616696e-02,  7.95302913e-02,  6.21396936e-02,
         -2.30499320e-02, -1.11377783e-01, -1.75294772e-01,
          9.23990756e-02,  1.61471993e-01, -1.95223987e-01,
         -5.54309152e-02,  2.82358646e-01,  6.98827058e-02,
         -7.15444088e-02,  2.04189450e-01, -5.23562372e-01,
         -1.07234751e-03,  4.69769925e-01,  2.57480323e-01,
          1.97885931e-01,  1.92171976e-01, -7.03487992e-02,
          1.46886647e-01, -2.13827774e-01,  7.34612020e-03,
         -3.16562876e-02, -4.47391905e-03,  1.97891429e-01,
         -3.43837708e-01,  2.40249798e-01,  1.69338614e-01,
          2.11372390e-01,  8.56575519e-02, -5.10145128e-01,
          2.55431861e-01,  2.52566487e-01, -7.67974257e-02,
         -8.91311169e-02,  8.75707865e-02,  3.53021664e-03,
         -1.47335947e-01,  3.47888350e-01,  2.61123657e-01,
         -5.71802892e-02,  2.04764120e-0

In [65]:
for k,v in rv[1].items():
    print()
    print(k)
    print(v)


h0_seq
[array([[-5.10748150e-03, -3.20997648e-02, -1.04814671e-01,
        -2.27938145e-01, -5.02550155e-02, -4.41683643e-02,
        -4.76616696e-02,  7.95302913e-02,  6.21396936e-02,
        -2.30499320e-02, -1.11377783e-01, -1.75294772e-01,
         9.23990756e-02,  1.61471993e-01, -1.95223987e-01,
        -5.54309152e-02,  2.82358646e-01,  6.98827058e-02,
        -7.15444088e-02,  2.04189450e-01, -5.23562372e-01,
        -1.07234751e-03,  4.69769925e-01,  2.57480323e-01,
         1.97885931e-01,  1.92171976e-01, -7.03487992e-02,
         1.46886647e-01, -2.13827774e-01,  7.34612020e-03,
        -3.16562876e-02, -4.47391905e-03,  1.97891429e-01,
        -3.43837708e-01,  2.40249798e-01,  1.69338614e-01,
         2.11372390e-01,  8.56575519e-02, -5.10145128e-01,
         2.55431861e-01,  2.52566487e-01, -7.67974257e-02,
        -8.91311169e-02,  8.75707865e-02,  3.53021664e-03,
        -1.47335947e-01,  3.47888350e-01,  2.61123657e-01,
        -5.71802892e-02,  2.04764120e-02,  1.14

        -0.0366369 , -0.01662268,  0.04092863]], dtype=float32)]

h1_seq
[array([[0.18914533, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.1113388 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.00630035,
        0.        , 0.        , 0.        , 0.65145993, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.1571116 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.12746245, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.18780066, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.00480404, 0.        

        0.        , 0.28901595, 0.        ]], dtype=float32)]

h2_seq
[array([[0.00729291, 0.18396285, 0.        , 0.        , 0.        ,
        0.        , 0.48683316, 0.27852523, 0.        , 1.2282084 ,
        0.41794506, 0.        , 0.06683498, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.24257477, 0.        ,
        0.        , 0.        , 0.5217738 , 0.        , 0.        ,
        0.        , 0.        , 0.47582847, 0.        , 0.67290944,
        0.3613212 , 0.        , 0.        , 0.        , 0.        ,
        0.35419118, 0.        , 0.        , 0.77474916, 0.2862769 ,
        0.        , 0.7201159 , 0.8947872 , 0.        , 1.5880848 ,
        0.14781564, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.9220301 , 0.        ,
        0.32085794, 0.        , 0.        , 0.        , 2.1368413 ,
        0.        , 1.1573666 , 2.2099125 , 1.0045053 , 1.8304641 ,
        0.8906483 , 0.09131521, 0.        , 0

In [66]:
print(X[0:1])
print(y[0:1])
print(rv[2])

[[260 252 333]]
[[1 0 2]]
[array([[-3.571101 ,  2.017776 ,  1.3937404]], dtype=float32), array([[ 5.146556 , -3.2826242, -2.0817955]], dtype=float32), array([[-4.4597406,  2.2730567,  2.2016068]], dtype=float32)]


In [67]:
for x in rv[2]:
    for xx in x:
        print(np.exp(xx)/np.sum(np.exp(x)))

[0.00242883 0.6495543  0.34801686]
[9.9905670e-01 2.1819446e-04 7.2503177e-04]
[0.00061649 0.5175357  0.48184788]


In [68]:
rv[3]

0.38751313

In [69]:
rv[4]

[array([[1]], dtype=int64),
 array([[0]], dtype=int64),
 array([[2]], dtype=int64)]

In [70]:
rv[5]

[array([[[0., 1., 0.]]], dtype=float32),
 array([[[1., 0., 0.]]], dtype=float32),
 array([[[0., 0., 1.]]], dtype=float32)]

In [71]:
plt.plot(history['reward'], '*')
plt.show()
plt.plot(history['loss'], '*')
plt.show()

NameError: name 'history' is not defined

In [72]:
q_val.mean(axis = 1)

NameError: name 'q_val' is not defined