In [1]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)


In [293]:
class CommNet:
    
    def __init__(self, sess, N, J, embedding_size = 128, lr = 1e-3, training_mode = 'supervised'):
        
        self.N = N
        self.J = J
        self.embedding_size = embedding_size
        
        self.build_controler()
        
        if training_mode == 'supervised':
            self.build_supervised()
            with tf.variable_scope('Supervised_optimizer'):
                self.train_op = tf.train.AdamOptimizer(lr).minimize(self.supervised_loss)
                
        elif training_mode == 'reinforce':
            self.build_reinforce()
            with tf.variable_scope('Reinforce_optimizer'):
                self.train_op =  tf.train.AdamOptimizer(lr).minimize(self.reinforce_loss)
            
        else:
            raise(ValueError("Unknown training mode: %s" % training_mode))
        
        print("All variables")
        for var in tf.global_variables():
            print(var)
            
        
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        
    def encode(self, inputs):
        
        with tf.variable_scope('Encoder'):
        
            identity_embeddings = tf.get_variable("identity_embeddings",
                                             [self.N, self.embedding_size])
            
            embedded_identities = tf.nn.embedding_lookup(identity_embeddings, inputs)
            
        return tf.unstack(embedded_identities, axis = 1)
    
    def build_f(self, name, h, c, h0 = None):
        
        with tf.variable_scope(name, reuse = tf.AUTO_REUSE):
            
            if h0 is not None:
                
                W1 = tf.get_variable('W1', shape = (3 * self.embedding_size,
                                                  self.embedding_size))
                
                concat = tf.concat([h, c, h0], axis = 1)
            
            else:
                W1 = tf.get_variable('W1', shape = (2 * self.embedding_size,
                                                  self.embedding_size))
                
                concat = tf.concat([h, c], axis = 1)
            
            W2 = tf.get_variable('W2', shape = (self.embedding_size,
                                                  self.embedding_size))
            
            dense1 =  tf.nn.relu(tf.einsum("ij,jk->ik", concat, W1))
            dense2 = tf.nn.relu(tf.einsum("ij,jk->ik", dense1, W2))
            
            return dense2
        
    def decode(self, h):
        
        with tf.variable_scope('Decoder', reuse = tf.AUTO_REUSE):
            
            W = tf.get_variable('W', shape = (self.embedding_size,
                                                  self.J))
            
            policy_logits = tf.einsum("ij,jk->ik", h, W)
        
            return policy_logits
    
    
    def communicate(self, h_seq):
        
        return tf.add_n(h_seq) / (self.J - 1)
    
    def sample_actions(self, policy_logit):
        
        
        action = tf.multinomial(policy_logit, num_samples = 1)
        
        return action      
    
        
    def build_controler(self):
        
        self.inputs = tf.placeholder(tf.int32, shape = (None, self.J))
        
        h0_seq = self.encode(self.inputs)
        c0_seq = [self.communicate([h0_seq[j] for j in range(self.J) if j != i]) for i in range(self.J)]
        
        h1_seq = [self.build_f("Comm_step_1", h0_seq[j], c0_seq[j], None) for j in range(self.J)]
        c1_seq = [self.communicate([h1_seq[j] for j in range(self.J) if j != i]) for i in range(self.J)]
        
        h2_seq = [self.build_f("Comm_step_2", h1_seq[j], c1_seq[j], h0_seq[j]) for j in range(self.J)]
        
        
        self.policy_logit_seq = [self.decode(h2) for h2 in h2_seq]
        
        self.action_seq = [self.sample_actions(policy_logit) for policy_logit in self.policy_logit_seq]
        
        self.one_hot_action_seq = [tf.one_hot(tf.reshape(action, [-1]), depth = self.J) for action in self.action_seq]
        
    def build_supervised(self):
        
        self.targets = tf.placeholder(tf.int32, shape = (None, self.J))
        unstacked_targets = tf.unstack(self.targets, axis = 1)
        
        supervised_loss_seq = [tf.nn.sparse_softmax_cross_entropy_with_logits(labels=unstacked_targets[j],
                                                                                   logits=self.policy_logit_seq[j])
                                    for j in range(self.J)]
        
        self.supervised_loss = tf.reduce_sum(tf.add_n(supervised_loss_seq))
        
        
        
    def supervised_train(self, X, y, val_X, val_y, env, batch_size = 32, epochs = 1):
        
        n = X.shape[0]
        
        val_n = val_X.shape[0]
        
        data_inds = np.array(range(n))
        for ep in range(1, epochs + 1):
            np.random.shuffle(data_inds)
            supervised_loss_sum = 0
            reward_sum = 0
            for i in tqdm(range(0, n, batch_size), "Epoch: %d" % ep):
                inds_batch = data_inds[i:i+batch_size]
                X_batch = X[inds_batch]
                y_batch = y[inds_batch]
                _, supervised_loss, one_hot_action_seq = sess.run([self.train_op, self.supervised_loss, self.one_hot_action_seq], feed_dict={self.inputs: X_batch, self.targets: y_batch})
                supervised_loss_sum += supervised_loss
                reward_sum += env.get_reward(one_hot_action_seq)
            
            print("loss = %f" % (supervised_loss_sum / n))
            print("reward = %f" % (reward_sum / n))
            print()
            
            val_supervised_loss, val_one_hot_action_seq = sess.run([self.supervised_loss, self.one_hot_action_seq], feed_dict={self.inputs: val_X, self.targets: val_y})
            print('val loss = %f' % (val_supervised_loss / val_n))
            print('val reward = %f' % (env.get_reward(val_one_hot_action_seq) / val_n))
            

    def build_reinforce(self):
        
        self.log_p_seq = [tf.log(tf.nn.softmax(policy_logit, axis = 1)) for policy_logit in self.policy_logit_seq]
        
        self.advantage = tf.placeholder(tf.float32, shape = (None, self.J))
        unstacked_advantage = tf.unstack(self.advantage, axis = 1)
        
        self.one_hot_action_taken_seq = [tf.placeholder(tf.int32, shape = (None, self.J)) for j in range(self.J)]
        
        # selecting probabilites corresponding to taken action
        self.actions_taken_p_seq = [tf.dynamic_partition(self.log_p_seq[j], self.one_hot_action_taken_seq[j], num_partitions = 2)[1] for j in range(self.J)]
        
        #surrogate loss
        self.reinforce_loss =  - tf.add_n([tf.reduce_sum(tf.multiply(unstacked_advantage[j], self.actions_taken_p_seq[j]), axis = 0) for j in range(self.J)])
        

In [294]:
class LeverEnv:
    
    def __init__(self, N, J):
        
        self.J = J
        self.N = N
        
    def reset(self):
        
        state = np.sort(np.random.choice(self.N, size = self.J, replace = False))
        
        return state        
    
    def get_reward(self, one_hot_action_seq):
        
        reward = np.sum(np.sum(one_hot_action_seq, axis = 0) > 0) /self.J
        
        return reward
        
    def step(self, one_hot_action_seq):
        
        next_state = np.sort(np.random.choice(self.N, size = self.J, replace = False))
        
        reward = self.get_reward(one_hot_action_seq)
        
        return next_state, reward
        

In [291]:
leverEnv = LeverEnv(100, 3)
action = np.random.choice(3, 3)
print("action")
print(action)
print()
one_hot_action_seq = [np.zeros((1, 3)) for _ in range(3)]
for j in range(3):
    one_hot_action_seq[j][0, action[j]] = 1

print("one_hot_action_seq")
print(np.array(one_hot_action_seq))

leverEnv.step(one_hot_action_seq)

action
[2 1 0]

one_hot_action_seq
[[[0. 0. 1.]]

 [[0. 1. 0.]]

 [[1. 0. 0.]]]


(array([60, 64, 67]), 1.0)

In [295]:
def generate_data(n, N, J):
    
    X = np.empty((n, J), dtype = int)
    
    for i in range(n):
        
        X[i] = np.sort(np.random.choice(N, size = J, replace = False))
        
    y = np.tile([j for j in range(J)], (n,1))
    
    return X, y


In [296]:
N = 100
J = 5
batch_size = 32
n = batch_size * 1000

In [297]:
X, y = generate_data(n, N, J)
val_X, val_y = generate_data(1024, N, J)

In [199]:
actions_taken

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]])

In [206]:
one_hot_action_taken_seq = [np.zeros((2, J)) for j in range(J)]
print(one_hot_action_taken_seq)
for j in range(J):
    one_hot_action_taken_seq[j][np.arange(2), actions_taken[:,j]] = 1
print(one_hot_action_taken_seq)

[array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]]), array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]]), array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]]), array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]]), array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])]
[array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]]), array([[0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]]), array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]]), array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.]]), array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]])]


In [298]:
with tf.Graph().as_default(), tf.Session() as sess:
    
    commNet = CommNet(sess, N, J, lr = 1e-4, training_mode = 'supervised')
    env = LeverEnv(N, J)
    commNet.supervised_train(X, y, val_X, val_y, env, batch_size = batch_size, epochs = 10)
    
    '''advantage = np.tile(np.arange(5), (2, 1))
    action_taken = np.tile(np.arange(5), (2, 1))
    one_hot_action_taken_seq = [np.zeros((2, J)) for j in range(J)]
    for j in range(J):
        one_hot_action_taken_seq[j][np.arange(2), action_taken[:,j]] = 1
        
    feed_dict = {commNet.one_hot_action_taken_seq[j]: one_hot_action_taken_seq[j] for j in range(J)}
    feed_dict[commNet.inputs] = X
    feed_dict[commNet.advantage] = advantage
    
    run_rv = sess.run([commNet.policy_logit_seq, commNet.log_p_seq, commNet.actions_taken_p_seq, commNet.reinforce_loss], feed_dict=feed_dict)
    
    commNet.policy_rollout(10)'''

All variables
<tf.Variable 'Encoder/identity_embeddings:0' shape=(100, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_1/W1:0' shape=(256, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_1/W2:0' shape=(128, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_2/W1:0' shape=(384, 128) dtype=float32_ref>
<tf.Variable 'Comm_step_2/W2:0' shape=(128, 128) dtype=float32_ref>
<tf.Variable 'Decoder/W:0' shape=(128, 5) dtype=float32_ref>
<tf.Variable 'Supervised_optimizer/beta1_power:0' shape=() dtype=float32_ref>
<tf.Variable 'Supervised_optimizer/beta2_power:0' shape=() dtype=float32_ref>
<tf.Variable 'Supervised_optimizer/Encoder/identity_embeddings/Adam:0' shape=(100, 128) dtype=float32_ref>
<tf.Variable 'Supervised_optimizer/Encoder/identity_embeddings/Adam_1:0' shape=(100, 128) dtype=float32_ref>
<tf.Variable 'Supervised_optimizer/Comm_step_1/W1/Adam:0' shape=(256, 128) dtype=float32_ref>
<tf.Variable 'Supervised_optimizer/Comm_step_1/W1/Adam_1:0' shape=(256, 128) dtype=float32_ref>
<tf.Variab

Epoch: 1: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [00:12<00:00, 93.05it/s]


loss = 3.719260
reward = 0.745169

val loss = 1.970813
val reward = 0.806445


Epoch: 2: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [00:11<00:00, 87.32it/s]


loss = 1.592324
reward = 0.834406

val loss = 1.145800
val reward = 0.870117


Epoch: 3: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [00:14<00:00, 69.52it/s]


loss = 0.885830
reward = 0.898219

val loss = 0.629382
val reward = 0.925195


Epoch: 4: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [00:13<00:00, 73.01it/s]


loss = 0.491597
reward = 0.939119

val loss = 0.360993
val reward = 0.955859


Epoch: 5: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [00:13<00:00, 76.77it/s]


loss = 0.284166
reward = 0.963763

val loss = 0.229828
val reward = 0.974414


Epoch: 6:  56%|██████████████████████████████████████▋                              | 561/1000 [00:06<00:05, 87.65it/s]


KeyboardInterrupt: 

In [190]:
print(actions_taken)

[[0 1 2 3 4]
 [0 1 2 3 4]]


In [191]:
print(advantage)

[[0 1 2 3 4]
 [0 1 2 3 4]]


In [208]:
print(run_rv[-1])
for rv in run_rv:
    for x in rv:
        print(x)
    print()

32.02219
[[-0.01041567 -0.02827358  0.05667211  0.04968666 -0.07257875]
 [-0.04856674  0.03890125  0.00929112 -0.00708195 -0.03192247]]
[[-0.02840682  0.05733358  0.01751046 -0.05612933 -0.07831956]
 [-0.07803319 -0.0783726   0.01016312 -0.00312122 -0.09933674]]
[[ 0.02063134 -0.00668805  0.04678989 -0.04575974 -0.09298387]
 [-0.10152197 -0.04478582  0.10595441 -0.04406501 -0.08167294]]
[[-0.0830885  -0.03003656  0.08670028  0.01970931 -0.09213142]
 [ 0.00223263 -0.03788116  0.03150295  0.03245928 -0.03189498]]
[[-0.03944925 -0.05062477  0.05395028 -0.01178153 -0.06881702]
 [ 0.00098358 -0.04350624 -0.00304577 -0.09655475 -0.12675647]]

[[-1.6200542 -1.6379123 -1.5529665 -1.5599519 -1.6822174]
 [-1.6506014 -1.5631334 -1.5927435 -1.6091166 -1.6339571]]
[[-1.6214617 -1.5357214 -1.5755446 -1.6491843 -1.6713744]
 [-1.6387198 -1.6390591 -1.5505235 -1.5638077 -1.6600233]]
[[-1.5744157 -1.6017351 -1.5482572 -1.6408069 -1.688031 ]
 [-1.68048   -1.6237439 -1.4730036 -1.6230232 -1.6606311]]
[[-1

TypeError: 'numpy.float32' object is not iterable