In [2]:
import tensorflow as tf
import numpy as np
import random


class TurnOffLightGame:
    def __init__(self, sen_init, vocabulary, target_word, state_size):
        """
        Parameters
        ----------
        sen_init: list of string
        vocabulary: dictionary
            key: string
            value: int
        target_word: string
        state_size: int
        """
        self.sen = []
        self.vocabulary = vocabulary
        self.actions = {v: k for k, v in self.vocabulary.items()}
        self.actions[len(self.actions)] = "<L>"
        self.actions[len(self.actions)] = "<R>"
        self.state_size = state_size
        
        # set target that need to turn off
        assert target_word in vocabulary
        self.target_word = target_word
        
        # init self.sen
        self.sen.append(vocabulary["<s>"])
        for i, w in enumerate(sen_init):
            if w not in self.vocabulary:
                self.sen.append(vocabulary["<unk>"])
            else:
                self.sen.append(vocabulary[w])
        self.sen.append(vocabulary["</s>"])
        self.cursor = random.randint(1, len(self.sen) - 2)
    
    def is_done(self):
        cur_state = self.get_state()
        for i in range(self.state_size):
            if cur_state[i] == 1:
                return False
        return True

    def get_action_list(self):
        return self.actions
    
    def action_lookup(self, idx):
        return self.actions[idx]
    
    def get_state(self):
        """
        Returns
        -------
        cur_state: numpy, shape=(state_size,), dtype=float
            cur_state[0] == 1 if there are self.target in self.sen[:self.cursor]
            cur_state[1] == 1 if there are self.target in self.sen[self.cursor+1:]
            cur_state[2] == 1 if self.sen[self.curosr] == self.target_word
        """
        cur_state = np.zeros((self.state_size,), dtype=np.float32)
        flag = False
        for i in range(0, self.cursor):
            if self.sen[i] == self.vocabulary[self.target_word]:
                flag = True
        if flag:
            cur_state[0] = 1
        flag = False
        for i in range(self.cursor+1, len(self.sen)):
            if self.sen[i] == self.vocabulary[self.target_word]:
                flag = True
        if flag:
            cur_state[1] = 1
        if self.sen[self.cursor] == self.vocabulary[self.target_word]:
            cur_state[2] = 1
        return cur_state
    
    def apply_action(self, a):
        """
        Parameters
        ----------
        a: int
        Return
        ------
        reward: float
        """
        reward = 0
        action = self.actions[a]
        if action == "<L>":
            self.cursor = max(self.cursor - 1, 1)
        elif action == "<R>":
            self.cursor = min(self.cursor + 1, len(self.sen) - 2)
        else:
            if (self.sen[self.cursor] == self.vocabulary[self.target_word] and
                action != self.target_word):
                reward = 1
            self.sen[self.cursor] = action
        return reward
        
class DQN:
    def __init__(self, dim_in, dim_out, dim_hidden=32, gamma=0.5, l2_alpha=1):
        self.dim_in = dim_in
        self.dim_out = dim_out
        self.dim_hidden = dim_hidden
        self.gamma = gamma
        self.l2_alpha=l2_alpha
        
        ######################
        # Graph Construction #
        ######################
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.x_state = tf.placeholder(tf.float32, shape=(None, self.dim_in,), name="x_state")
            self.y_mask = tf.placeholder(tf.int32, shape=(None, self.dim_out,), name="y_mask")
            self.y_target = tf.placeholder(tf.float32, shape=(None, self.dim_out,), name="y_target")
        
            # hidden layer and output layer
            regularizer = tf.contrib.layers.l2_regularizer(self.l2_alpha)
            hidden_layer = tf.layers.dense(
                self.x_state, dim_hidden,
                kernel_regularizer=regularizer, bias_regularizer=regularizer,name="hidden_layer"
            )
            self.prediction = tf.layers.dense(
                hidden_layer, dim_out,
                kernel_regularizer=regularizer, bias_regularizer=regularizer, name="output_layer"
            )

            loss_l2 = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            self.loss = tf.reduce_sum(
                tf.multiply(tf.pow(self.y_target - self.prediction, 2), tf.cast(self.y_mask, tf.float32))
            ) + loss_l2
            
            # Calculate and clip gradients
            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            self.clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1)
            # Optimization
            optimizer = tf.train.AdamOptimizer()
            self.op_train = optimizer.apply_gradients(zip(self.clipped_gradients, params))

            # initializer
            self.init = tf.global_variables_initializer()
        
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
        self.sess = tf.Session(
            graph=self.graph,
            config=tf.ConfigProto(gpu_options=gpu_options)
        )           
        self.sess.run(self.init)
    
    def train(self, x_state, y_mask, y_target):
        """
        Parameters
        ----------
        x_state: numpy, shape=(n_batch, dim_in), dtype=float
        y_mask: numpy, shape=(n_batch, dim_out), dtype=int
        y_target: numpy, shape=(n_batch, dim_out), dtype=float
        Returns
        -------
        loss: numpy, shape=(n_batch, dim_out), dtype=float
        prediction: numpy, shape=(n_batch, dim_out), dtype=float
        """      
        _, loss, prediction = self.sess.run(
            [self.op_train, self.loss, self.prediction],
            feed_dict={
                self.x_state: x_state,
                self.y_mask: y_mask,
                self.y_target: y_target
            }
        )
        
        return loss, prediction
        
    def predict(self, x_state):
        """
        Parameters
        ----------
        x_state: numpy, shape=(n_batch, dim_in), dtype=float
        Returns
        -------
        prediction: numpy, shape=(n_batch, dim_out), dtype=float
        """
        prediction = self.sess.run(
            [self.prediction],
            feed_dict={
                self.x_state: x_state
            }
        )
        
        return prediction

In [6]:
import random

origin_vocabulary = ["0", "1"]
extend_vocabulary = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
for w in origin_vocabulary:
    extend_vocabulary[w] = len(extend_vocabulary)
    
def generate_batch_data(n_batch, n_max_length, target_word):
    """
    Parameters
    ----------
    n_batch: int
    n_max_length: int
    _word: string
    Returns
    -------
    sen: list of list
        sen[i]: list of string
    """
    batch_data = []
    for i in range(n_batch):
        sen = []
        #for j in range(random.randint(n_max_length // 2, n_max_length)):
        for j in range(random.randint(n_max_length, n_max_length)):
            sen.append("0")
        idx_target = random.randint(0, len(sen)-1)
        sen[idx_target] = target_word
        batch_data.append(sen)
    return batch_data  
    
n_episode = 100
n_max_step = 10000
n_batch_size = 100
n_max_length = 10
n_state_size = 3
dqn_dim_in = n_state_size
dqn_dim_out = len(extend_vocabulary) + 2
dqn_gamma = 0.5
epsilon = 0.9
target_word = "1"

memory_pool = []
n_min_memory_pool_size = 200
n_max_memory_pool_size = 10000
train_sen = generate_batch_data(n_episode, n_max_length-2, target_word)
test_sen = generate_batch_data(10, n_max_length-2, target_word)

dqn = DQN(dqn_dim_in, dqn_dim_out, dim_hidden=32, gamma=dqn_gamma, l2_alpha=.001)

# train on train_sen
for episode in range(n_episode):
    game = TurnOffLightGame(train_sen[episode], extend_vocabulary, target_word, n_state_size)
    for step in range(n_max_step):
        #print("=====training episode {} step {} len(memory) {}=====".format(episode, step, len(memory_pool)))
        if game.is_done():
            break
        cur_state = game.get_state()
        if random.uniform(0, 1) < epsilon:
            actions = game.get_action_list()
            a = random.randint(0, len(actions)-1)
        else:
            batch_state = np.zeros((1, n_state_size), dtype=np.float32)
            batch_state[0, :] = game.get_state()
            a = np.argmax(dqn.predict(batch_state)[0])
        r = game.apply_action(a)
        if r > 0:
            print("episode {} step {} cur_state {} action {} r {}".format(episode, step, cur_state, game.action_lookup(a), r))
        if game.is_done():
            memory_pool.append((cur_state, a, r, game.get_state()))
        else:
            batch_state = np.zeros((1, n_state_size), dtype=np.float32)
            batch_state[0, :] = game.get_state()
            memory_pool.append((cur_state, a, r + dqn_gamma*np.max(dqn.predict(batch_state)[0]), game.get_state()))
        
        if len(memory_pool) >= n_min_memory_pool_size:
            batch_x_state = np.zeros((n_batch_size, n_state_size), dtype=np.float32)
            batch_y_mask = np.zeros((n_batch_size, dqn_dim_out), dtype=np.int32)
            batch_y_target = np.zeros((n_batch_size, dqn_dim_out), dtype=np.float32)
        
            for i, idx in enumerate(random.sample(range(len(memory_pool)), n_batch_size)):
                s, a, r, new_s = memory_pool[idx]
                batch_x_state[i, :] = s
                batch_y_mask[i, a] = 1
                batch_y_target[i, a] = r
            loss, _ = dqn.train(batch_x_state, batch_y_mask, batch_y_target)
    print("=====training episode {} len(memory) {}=====".format(episode, len(memory_pool)))
    if len(memory_pool) > n_min_memory_pool_size:
        random.shuffle(memory_pool)
        memory_pool = memory_pool[-n_max_memory_pool_size-1:-1]
        print("loss", loss)

# evaulate on test_sen
for i in range(len(test_sen)):
    print("=======test_sen[{}]========".format(i))
    print(test_sen[i])
    print(extend_vocabulary)
    game = TurnOffLightGame(test_sen[i], extend_vocabulary, "1", n_state_size)
    for step in range(n_max_step):
        batch_state = np.zeros((1, n_state_size), dtype=np.float32)
        batch_state[0, :] = game.get_state()
        a = np.argmax(dqn.predict(batch_state)[0])
        print("step", step, "cursor", game.cursor, "action", game.action_lookup(a))
        game.apply_action(a)
        print(game.get_state())
        if game.is_done():
            break

episode 0 step 123 cur_state [ 0.  0.  1.] action </s> r 1
=====training episode 0 len(memory) 124=====
episode 1 step 296 cur_state [ 0.  0.  1.] action <unk> r 1
=====training episode 1 len(memory) 421=====
loss 1.71378
episode 2 step 34 cur_state [ 0.  0.  1.] action </s> r 1
=====training episode 2 len(memory) 455=====
loss 0.855086
episode 3 step 33 cur_state [ 0.  0.  1.] action <unk> r 1
=====training episode 3 len(memory) 488=====
loss 1.19058
episode 4 step 48 cur_state [ 0.  0.  1.] action <unk> r 1
=====training episode 4 len(memory) 536=====
loss 0.807829
episode 5 step 105 cur_state [ 0.  0.  1.] action </s> r 1
=====training episode 5 len(memory) 641=====
loss 0.644671
episode 6 step 10 cur_state [ 0.  0.  1.] action <unk> r 1
=====training episode 6 len(memory) 651=====
loss 0.978677
episode 7 step 34 cur_state [ 0.  0.  1.] action </s> r 1
=====training episode 7 len(memory) 685=====
loss 0.979478
episode 8 step 15 cur_state [ 0.  0.  1.] action <unk> r 1
=====training 

episode 75 step 65 cur_state [ 0.  0.  1.] action <pad> r 1
=====training episode 75 len(memory) 2367=====
loss 0.733967
=====training episode 76 len(memory) 2381=====
loss 0.927058
=====training episode 77 len(memory) 2474=====
loss 0.872087
episode 78 step 0 cur_state [ 0.  0.  1.] action 0 r 1
=====training episode 78 len(memory) 2474=====
loss 0.756117
episode 79 step 87 cur_state [ 0.  0.  1.] action <s> r 1
=====training episode 79 len(memory) 2561=====
loss 0.57316
episode 80 step 3 cur_state [ 0.  0.  1.] action </s> r 1
=====training episode 80 len(memory) 2564=====
loss 0.631563
episode 81 step 59 cur_state [ 0.  0.  1.] action </s> r 1
=====training episode 81 len(memory) 2623=====
loss 0.89631
episode 82 step 54 cur_state [ 0.  0.  1.] action </s> r 1
=====training episode 82 len(memory) 2677=====
loss 1.02605
episode 83 step 61 cur_state [ 0.  0.  1.] action </s> r 1
=====training episode 83 len(memory) 2738=====
loss 0.854731
episode 84 step 68 cur_state [ 0.  0.  1.] act

In [4]:
import tensorflow as tf

def Graph1():
    g1 = tf.Graph()
    with g1.as_default() as g:
        matrix1 = tf.constant([[3., 3.]])
        matrix2 = tf.constant([[2.],[2.]])
        product = tf.matmul( matrix1, matrix2, name = "product")

    with tf.Session( graph = g ) as sess:
        tf.initialize_all_variables().run()
        return product


def Graph2(incoming):
    i = incoming
    g2 = tf.Graph()
    with g2.as_default() as g:
        matrix1 = tf.constant([[4., 4.]])
        matrix2 = tf.constant([[5.],[5.]])
        product = tf.matmul( matrix1, matrix2, name = "product" )

    with tf.Session( graph = g ) as sess:
        tf.initialize_all_variables().run()
        print( product)
        print( i)
        print(i == i)
        print(i == product)

print (Graph1())

Graph2(Graph1())

Tensor("product:0", shape=(1, 1), dtype=float32)
Tensor("product:0", shape=(1, 1), dtype=float32)
Tensor("product:0", shape=(1, 1), dtype=float32)
True
False


In [2]:
for k, v in {"a": 3, "b": 5}.items():
    print(k, v)

a 3
b 5
