In [1]:
from modules import *
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
import Utils
import Evaluate
import pickle
import os
from tqdm import tqdm
import logging

# Expose only GPU 0 to TensorFlow.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

# Pickled inputs used for training / validation / evaluation.
# Alternative file set, kept for reference:
#   embedding_file = "./data/embeddings.pkl"
#   train_file     = "./data/train.gr.pkl"
#   val_file       = "./data/dev.gr.pkl"
#   evaluate_file  = "./data/test.gr.pkl"
embedding_file, train_file, val_file, evaluate_file = (
    "./data/embeddings_ko.pkl",
    "./data/train_ko.pkl",
    "./data/dev_ko.pkl",
    "./data/test_ko.pkl",
)

# Checkpoints and logs share one directory; results go to a separate one.
save_path = log_path = "./model/cpre/"
result_path = "./output/cpre/"

# Input geometry (tokens per sentence, utterances per context) and batch sizes.
max_sentence_len, max_num_utterance = 50, 11
batch_size, eval_batch_size = 50, 100

In [2]:
class ScriptWriter_cpre():
    """Graph-mode (``tf.compat.v1``) matching model over utterances, a narrative and a response.

    ``__init__`` creates the placeholders and the few variables that live outside
    ``build``; ``build`` wires the rest of the graph and exposes ``self.y_pred``
    (a sigmoid score per example); ``load`` restores the latest checkpoint into a
    new session.

    The helpers ``embedding``, ``multihead_attention`` and ``feedforward`` are not
    defined in this notebook; presumably they come from ``from modules import *``.
    """

    def __init__(self, eta=0.5):
        """Create placeholders, the embedding table and scalar hyper-parameters.

        Args:
            eta: weight of the response-selection loss; ``1 - eta`` weights the KL
                term. Only referenced by the (currently disabled) loss section at
                the end of ``build``.
        """
        self.max_num_utterance = max_num_utterance
        self.negative_samples = 1
        self.max_sentence_len = max_sentence_len
        self.word_embedding_size = 200
        self.hidden_units = 200
        # Vocabulary size; must match the first dimension of the embedding matrix fed
        # through `embedding_ph`. (Previous value, kept for reference: 43514.)
        self.total_words = 11883
        self.batch_size = batch_size
        self.eval_batch_size = eval_batch_size
        self.learning_rate_ph = tf.compat.v1.placeholder(tf.float32, shape=[], name='learning_rate')
        self.dropout_rate = 0
        self.num_heads = 1
        self.num_blocks = 3
        self.eta = eta
        # Trainable scalar (initialised to 0.5) that scales the narrative update in `build`.
        self.gamma = tf.compat.v1.get_variable('gamma', shape=1, dtype=tf.float32, trainable=True, initializer=tf.constant_initializer(0.5))

        # Input placeholders. Shapes use the instance attributes set above, which hold
        # the same values as the module-level constants.
        self.embedding_ph = tf.compat.v1.placeholder(tf.float32, shape=(self.total_words, self.word_embedding_size))
        self.utterance_ph = tf.compat.v1.placeholder(tf.int32, shape=(None, self.max_num_utterance, self.max_sentence_len))
        self.response_ph = tf.compat.v1.placeholder(tf.int32, shape=(None, self.max_sentence_len))
        self.gt_response_ph = tf.compat.v1.placeholder(tf.int32, shape=(None, self.max_sentence_len))
        self.y_true_ph = tf.compat.v1.placeholder(tf.int32, shape=(None,))
        self.narrative_ph = tf.compat.v1.placeholder(tf.int32, shape=(None, self.max_sentence_len))

        # Non-trainable embedding table; run `embedding_init` with `embedding_ph` fed to fill it.
        self.word_embeddings = tf.compat.v1.get_variable('word_embeddings_v', shape=(self.total_words, self.word_embedding_size), dtype=tf.float32, trainable=False)
        self.embedding_init = self.word_embeddings.assign(self.embedding_ph)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.is_training = True
        print("current eta: ", self.eta)

    def load(self, previous_modelpath):
        """Restore the latest checkpoint found in ``previous_modelpath``.

        The checkpoint is resolved *before* a session is created, so a missing
        checkpoint no longer leaves an orphaned session behind, and the session is
        closed again if restoring fails.

        Args:
            previous_modelpath: directory containing the checkpoint files.

        Returns:
            A new ``tf.compat.v1.Session`` with the variables restored. The caller
            owns the session and is responsible for closing it.

        Raises:
            ValueError: if no checkpoint can be found in ``previous_modelpath``.
        """
        latest_ckpt = tf.compat.v1.train.latest_checkpoint(previous_modelpath)
        if latest_ckpt is None:
            raise ValueError("No checkpoint found in: {}".format(previous_modelpath))

        sess = tf.compat.v1.Session()
        try:
            saver = tf.compat.v1.train.Saver()
            saver.restore(sess, latest_ckpt)
        except Exception:
            # Do not leak the session when the restore itself fails.
            sess.close()
            raise
        return sess

    def build(self):
        """Build the matching graph and set ``self.y_pred``.

        Stages, in order:
          1. Encode the response, the ground-truth response, the narrative and every
             utterance with the shared ``num_blocks_{i}`` self-attention blocks.
          2. For each utterance: cross-attend utterance<->response and
             utterance<->narrative, build the similarity tensors, then rescale the
             narrative representation using ``self.gamma``.
          3. Compute the tracking distributions ``p1`` / ``p2`` (only consumed by the
             disabled loss section).
          4. Cross-attend narrative<->response, aggregate the three similarity
             tensors with CNNs and project to one logit per example.

        Sets ``self.decay_factor`` (an empty list that is never filled here),
        ``self.rosim`` and ``self.y_pred``.

        NOTE(review): the loss / optimizer section at the end is wrapped in a string
        literal and therefore never executed, so ``self.loss``, ``self.train_op``,
        ``self.saver``, ``self.grads_and_vars`` and ``self.alpha_1`` are NOT defined
        after ``build()``. The ``evaluate`` function in this notebook reads
        ``model.loss``; confirm whether this section should be re-enabled.
        """
        print('utterance_ph.shape =', self.utterance_ph.shape)
        all_utterances = tf.unstack(self.utterance_ph, num=self.max_num_utterance, axis=1)
        print('len(all_utterances) =', len(all_utterances))
        print('len(all_utterances[0].shape =', all_utterances[0].shape)
        # `reuse` is None for the first utterance (variables get created) and True afterwards.
        reuse = None
        alpha_1 = None

        # ---- Response encoder: creates the shared "num_blocks_{i}" variables ----
        print('response_ph.shape =', self.response_ph.shape)
        response_embeddings = embedding(self.response_ph, initializer=self.word_embeddings)
        print('response_embeddings.shape =', response_embeddings.shape)
        Hr_stack = [response_embeddings]
        for i in range(self.num_blocks):
            with tf.compat.v1.variable_scope("num_blocks_{}".format(i)):
                response_embeddings, _ = multihead_attention(queries=response_embeddings, keys=response_embeddings, num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
                print(i, 'response_embeddings.shape =', response_embeddings.shape)
                response_embeddings = feedforward(response_embeddings, num_units=[self.hidden_units, self.hidden_units])
                print(i, 'response_embeddings.shape =', response_embeddings.shape)
                Hr_stack.append(response_embeddings)

        # ---- Ground-truth response encoder (reuses the encoder variables) ----
        gt_response_embeddings = embedding(self.gt_response_ph, initializer=self.word_embeddings)
        print('gt_response_embeddings.shape =', gt_response_embeddings.shape)
        Hgtr_stack = [gt_response_embeddings]
        for i in range(self.num_blocks):
            with tf.compat.v1.variable_scope("num_blocks_{}".format(i), reuse=True):
                gt_response_embeddings, _ = multihead_attention(queries=gt_response_embeddings, keys=gt_response_embeddings, num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
                gt_response_embeddings = feedforward(gt_response_embeddings, num_units=[self.hidden_units, self.hidden_units])
                Hgtr_stack.append(gt_response_embeddings)

        # ---- Narrative encoder (reuses the encoder variables) ----
        narrative_embeddings = embedding(self.narrative_ph, initializer=self.word_embeddings)
        print('narrative_embeddings.shape =', narrative_embeddings.shape)
        Hn_stack = [narrative_embeddings]
        for i in range(self.num_blocks):
            with tf.compat.v1.variable_scope("num_blocks_{}".format(i), reuse=True):
                narrative_embeddings, _ = multihead_attention(queries=narrative_embeddings, keys=narrative_embeddings, num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
                narrative_embeddings = feedforward(narrative_embeddings, num_units=[self.hidden_units, self.hidden_units])
                Hn_stack.append(narrative_embeddings)

        # ---- Per-utterance matching ----
        Mur, Mun = [], []
        self.decay_factor = []
        last_u_reps = []
        turn_id = 0
        for utterance in all_utterances:
            utterance_embeddings = embedding(utterance, initializer=self.word_embeddings)
            print('utterance_embeddings.shape =', utterance_embeddings.shape)
            Hu_stack = [utterance_embeddings]
            for i in range(self.num_blocks):
                with tf.compat.v1.variable_scope("num_blocks_{}".format(i), reuse=True):
                    utterance_embeddings, _ = multihead_attention(queries=utterance_embeddings, keys=utterance_embeddings, num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
                    utterance_embeddings = feedforward(utterance_embeddings, num_units=[self.hidden_units, self.hidden_units])
                    Hu_stack.append(utterance_embeddings)

            # Remember the representations of the final utterance for the tracking step below.
            if turn_id == self.max_num_utterance - 1:
                last_u_reps = Hu_stack

            # response-attends-utterance outputs
            r_a_u_stack = []
            # utterance-attends-response outputs
            u_a_r_stack = []

            for i in range(self.num_blocks + 1):
                with tf.compat.v1.variable_scope("utterance_attention_response_{}".format(i), reuse=reuse):
                    u_a_r, _ = multihead_attention(queries=Hu_stack[i], keys=Hr_stack[i], num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
                    u_a_r = feedforward(u_a_r, num_units=[self.hidden_units, self.hidden_units])
                    u_a_r_stack.append(u_a_r)
                with tf.compat.v1.variable_scope("response_attention_utterance_{}".format(i), reuse=reuse):
                    r_a_u, _ = multihead_attention(queries=Hr_stack[i], keys=Hu_stack[i], num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
                    r_a_u = feedforward(r_a_u, num_units=[self.hidden_units, self.hidden_units])
                    r_a_u_stack.append(r_a_u)

            # Append the self-attention stacks: u_a_r + Hu and r_a_u + Hr.
            u_a_r_stack.extend(Hu_stack)
            r_a_u_stack.extend(Hr_stack)

            # NOTE: these two scope names say "response" but the attention here is between
            # the narrative and the utterance. The same scopes are reused further down for
            # narrative<->response attention, so the weights are shared between both uses.
            # The names are part of the variable names, so renaming them would presumably
            # break restoring existing checkpoints.
            n_a_u_stack = []
            u_a_n_stack = []
            for i in range(self.num_blocks + 1):
                with tf.compat.v1.variable_scope("narrative_attention_response_{}".format(i), reuse=reuse):
                    n_a_u, _ = multihead_attention(queries=Hn_stack[i], keys=Hu_stack[i], num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
                    n_a_u = feedforward(n_a_u, num_units=[self.hidden_units, self.hidden_units])
                    n_a_u_stack.append(n_a_u)
                with tf.compat.v1.variable_scope("response_attention_narrative_{}".format(i), reuse=reuse):
                    u_a_n, alpha_1 = multihead_attention(queries=Hu_stack[i], keys=Hn_stack[i], num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
                    u_a_n = feedforward(u_a_n, num_units=[self.hidden_units, self.hidden_units])
                    u_a_n_stack.append(u_a_n)
            n_a_u_stack.extend(Hn_stack)
            u_a_n_stack.extend(Hu_stack)

            # Stack along a new last axis: 2 * (num_blocks + 1) layers each.
            u_a_r = tf.stack(u_a_r_stack, axis=-1)
            r_a_u = tf.stack(r_a_u_stack, axis=-1)
            u_a_n = tf.stack(u_a_n_stack, axis=-1)
            n_a_u = tf.stack(n_a_u_stack, axis=-1)

            print('final u_a_r.shape =', u_a_r.shape)

            # Compute the similarity matrices.
            with tf.compat.v1.variable_scope('similarity'):
                # sim shape [batch, max_sent_len, max_sent_len, 2 * (stack_num + 1)]
                # divide by sqrt(200) to prevent gradient explosion; the literal 200 equals
                # self.hidden_units as configured in __init__.
                # (open question from the author: what does "no rp" mean?)
                sim_ur = tf.einsum('biks,bjks->bijs', u_a_r, r_a_u) / tf.sqrt(200.0)  # for no rp and normal
                sim_un = tf.einsum('biks,bjks->bijs', u_a_n, n_a_u) / tf.sqrt(200.0)  # for no rp and normal
            print('sim_ur.shape =', sim_ur.shape)

            # NOTE(review): no `axis` is passed to l2_normalize, so each tensor is divided by
            # its global L2 norm over all elements rather than per embedding vector. The
            # l2_normalize experiment cell in this notebook shows that the resulting products
            # are not cosine similarities (axis=-2 would give cosines). Left unchanged here
            # because changing it would alter the behaviour of already-trained checkpoints.
            self_n = tf.nn.l2_normalize(tf.stack(Hn_stack, axis=-1))  # #for no rp
            self_u = tf.nn.l2_normalize(tf.stack(Hu_stack, axis=-1))  # #for no rp
            Hn_stack_tensor = tf.stack(Hn_stack, axis=-1)  # [batch, o_len, embedding_size, stack]
            print('Hn_stack_tensor.shape =', Hn_stack_tensor.shape)
            with tf.compat.v1.variable_scope('similarity'):
                self_sim = tf.einsum('biks,bjks->bijs', self_u, self_n)  # [batch, u_len, o_len, stack]
                self_sim = 1 - self.gamma * tf.reduce_sum(self_sim, axis=1)  # [batch, (1), o_len, stack]
                # Rescale the narrative representation; the updated stack is what the next
                # utterance (and the narrative<->response step after the loop) sees.
                Hn_stack = tf.einsum('bjkl,bjl->bjkl', Hn_stack_tensor, self_sim)
                Hn_stack = tf.unstack(Hn_stack, axis=-1, num=self.num_blocks + 1)

            Mur.append(sim_ur)
            Mun.append(sim_un)
            turn_id += 1
            if not reuse:
                reuse = True

        print('narrative updated final len(Hn_stack) =', len(Hn_stack), ', Hn_stack[0].shape =', Hn_stack[0].shape)

        # ---- Tracking distributions p1 / p2 (consumed only by the disabled KL loss) ----
        print('stack shape = ', tf.stack(Hn_stack, axis=2).shape)
        Hn_stack_for_tracking = tf.compat.v1.layers.dense(tf.stack(Hn_stack, axis=2), self.hidden_units)  # [batch, o_len, stack, embedding_size]
        print('Hn_stack_for_tracking.shape after dense =', Hn_stack_for_tracking.shape)
        Hn_stack_for_tracking = tf.transpose(Hn_stack_for_tracking, perm=[0, 1, 3, 2])  # [batch, o_len, embedding_size, stack]
        Hlastu_stack_for_tracking = tf.stack(last_u_reps, axis=-1)  # [batch, u_len, embedding_size, stack]
        Hr_stack_for_tracking = tf.stack(Hgtr_stack, axis=-1)  # [batch, r_len, embedding_size, stack]
        Hlastu = tf.transpose(Hlastu_stack_for_tracking, perm=[0, 2, 3, 1])
        Hlastu = tf.squeeze(tf.compat.v1.layers.dense(Hlastu, 1), axis=-1)  # [batch, embedding_size, stack]
        p1_tensor = tf.nn.softmax(tf.einsum('bnds,bds->bns', Hn_stack_for_tracking, Hlastu), axis=1)  # [batch, o_len, stack]
        Hlastur = tf.transpose(Hr_stack_for_tracking, perm=[0, 2, 3, 1])
        Hlastur = tf.squeeze(tf.compat.v1.layers.dense(Hlastur, 1), axis=-1)  # [batch, embedding_size, stack]
        p2_tensor = tf.nn.softmax(tf.einsum('bnds,bds->bns', Hn_stack_for_tracking, Hlastur), axis=1)  # [batch, o_len, stack]
        p1 = tf.unstack(p1_tensor, num=self.num_blocks + 1, axis=-1)
        p2 = tf.unstack(p2_tensor, num=self.num_blocks + 1, axis=-1)

        # ---- Narrative <-> response matching (reuses the scopes created in the loop) ----
        r_a_n_stack = []
        n_a_r_stack = []
        for i in range(self.num_blocks + 1):
            with tf.compat.v1.variable_scope("narrative_attention_response_{}".format(i), reuse=True):
                n_a_r, _ = multihead_attention(queries=Hn_stack[i], keys=Hr_stack[i], num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
                n_a_r = feedforward(n_a_r, num_units=[self.hidden_units, self.hidden_units])
                n_a_r_stack.append(n_a_r)
            with tf.compat.v1.variable_scope("response_attention_narrative_{}".format(i), reuse=True):
                r_a_n, _ = multihead_attention(queries=Hr_stack[i], keys=Hn_stack[i], num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
                r_a_n = feedforward(r_a_n, num_units=[self.hidden_units, self.hidden_units])
                r_a_n_stack.append(r_a_n)

        n_a_r_stack.extend(Hn_stack)
        r_a_n_stack.extend(Hr_stack)
        n_a_r = tf.stack(n_a_r_stack, axis=-1)
        r_a_n = tf.stack(r_a_n_stack, axis=-1)

        with tf.compat.v1.variable_scope('similarity'):
            Mrn = tf.einsum('biks,bjks->bijs', n_a_r, r_a_n) / tf.sqrt(200.0)
        self.rosim = Mrn
        # Stack the per-utterance similarity tensors along a new utterance axis (axis 1).
        Mur = tf.stack(Mur, axis=1)
        Mun = tf.stack(Mun, axis=1)
        print('Mur.shape =', Mur.shape)
        print('Mun.shape =', Mun.shape)

        # ---- CNN aggregation: Mur and Mun share conv1/conv2; Mrn has its own 2-D convs ----
        with tf.compat.v1.variable_scope('cnn_aggregation'):
            conv3d = tf.compat.v1.layers.conv3d(Mur, filters=32, kernel_size=[3, 3, 3], padding="SAME", activation=tf.nn.elu, kernel_initializer=tf.random_uniform_initializer(-0.01, 0.01), name="conv1")
            pool3d = tf.compat.v1.layers.max_pooling3d(conv3d, pool_size=[3, 3, 3], strides=[3, 3, 3], padding="SAME")
            conv3d2 = tf.compat.v1.layers.conv3d(pool3d, filters=32, kernel_size=[3, 3, 3], padding="SAME", activation=tf.nn.elu, kernel_initializer=tf.random_uniform_initializer(-0.01, 0.01), name="conv2")
            pool3d2 = tf.compat.v1.layers.max_pooling3d(conv3d2, pool_size=[3, 3, 3], strides=[3, 3, 3], padding="SAME")
            mur = tf.compat.v1.layers.flatten(pool3d2)
            print('mur conv3d =', conv3d)
            print('mur conv3d2 =', conv3d2)
        with tf.compat.v1.variable_scope('cnn_aggregation', reuse=True):
            conv3d = tf.compat.v1.layers.conv3d(Mun, filters=32, kernel_size=[3, 3, 3], padding="SAME", activation=tf.nn.elu, kernel_initializer=tf.random_uniform_initializer(-0.01, 0.01), name="conv1")
            pool3d = tf.compat.v1.layers.max_pooling3d(conv3d, pool_size=[3, 3, 3], strides=[3, 3, 3], padding="SAME")
            conv3d2 = tf.compat.v1.layers.conv3d(pool3d, filters=32, kernel_size=[3, 3, 3], padding="SAME", activation=tf.nn.elu, kernel_initializer=tf.random_uniform_initializer(-0.01, 0.01), name="conv2")
            pool3d2 = tf.compat.v1.layers.max_pooling3d(conv3d2, pool_size=[3, 3, 3], strides=[3, 3, 3], padding="SAME")
            mun = tf.compat.v1.layers.flatten(pool3d2)
            print('mun conv3d =', conv3d)
            print('mun conv3d2 =', conv3d2)
        with tf.compat.v1.variable_scope('cnn_aggregation'):
            conv2d = tf.compat.v1.layers.conv2d(Mrn, filters=32, kernel_size=[3, 3], padding="SAME", activation=tf.nn.elu, kernel_initializer=tf.random_uniform_initializer(-0.01, 0.01), name="conv2d")
            pool2d = tf.compat.v1.layers.max_pooling2d(conv2d, pool_size=[3, 3], strides=[3, 3], padding="SAME")
            conv2d2 = tf.compat.v1.layers.conv2d(pool2d, filters=32, kernel_size=[3, 3], padding="SAME", activation=tf.nn.elu, kernel_initializer=tf.random_uniform_initializer(-0.01, 0.01), name="conv2d2")
            pool2d2 = tf.compat.v1.layers.max_pooling2d(conv2d2, pool_size=[3, 3], strides=[3, 3], padding="SAME")
            print('mrn conv2d =', conv2d)
            print('mrn conv2d2 =', conv2d2)
            mrn = tf.compat.v1.layers.flatten(pool2d2)

        # ---- Final score: one logit per example, squashed with a sigmoid ----
        all_vector = tf.concat([mur, mun, mrn], axis=-1)
        logits = tf.reshape(tf.compat.v1.layers.dense(all_vector, 1, kernel_initializer=tf.compat.v1.orthogonal_initializer()), [-1])

        self.y_pred = tf.sigmoid(logits)
        # The string literal below is the disabled loss / optimizer section (see the
        # NOTE(review) in the docstring). It is an expression statement with no effect.
        """
        KL_loss = 0.0
        for i in range(self.num_blocks + 1):
            KL_loss += tf.reduce_mean(tf.keras.losses.kullback_leibler_divergence(p1[i], p2[i]))
        KL_loss /= (self.num_blocks + 1)
        
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph, beta1=0.9, beta2=0.98, epsilon=1e-8)
        RS_loss = tf.reduce_mean(tf.clip_by_value(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(self.y_true_ph, tf.float32), logits=logits), -10, 10))
        self.loss = self.eta * RS_loss + (1 - self.eta) * KL_loss
        self.all_variables = tf.compat.v1.global_variables()
        self.grads_and_vars = optimizer.compute_gradients(self.loss)

        for grad, var in self.grads_and_vars:
            if grad is None:
                print(var)

        self.capped_gvs = [(tf.clip_by_value(grad, -5, 5), var) for grad, var in self.grads_and_vars]
        self.train_op = optimizer.apply_gradients(self.capped_gvs, global_step=self.global_step)
        self.saver = tf.compat.v1.train.Saver(max_to_keep=10)
        self.alpha_1 = alpha_1
        """


In [3]:
# Profile the model: print float-op counts and the number of trainable parameters.
config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
# Build inside a dedicated graph instead of the global default graph. Otherwise
# re-running this cell (or building the model again in a later cell) fails because
# variables such as 'gamma' already exist in the default graph.
with tf.Graph().as_default() as graph, tf.compat.v1.Session(config=config, graph=graph) as sess:
    eta = 0.7
    model = ScriptWriter_cpre(eta)
    model.build()

    # The session is bound to `graph`, so sess.graph contains the model just built.
    # First pass: float operations (the report is printed to stdout by the profiler).
    tf.compat.v1.profiler.profile(
        sess.graph,
        options=tf.compat.v1.profiler.ProfileOptionBuilder.float_operation())
    # Second pass: trainable parameters; the returned proto carries the total.
    parameters = tf.compat.v1.profiler.profile(sess.graph,
                                     options=tf.compat.v1.profiler.ProfileOptionBuilder
                                     .trainable_variables_parameter())
    print ('total parameters: {}'.format(parameters.total_parameters))

2022-09-22 14:47:08.669608: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-22 14:47:09.227995: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 17608 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:65:00.0, compute capability: 8.6


current eta:  0.7
utterance_ph.shape = (None, 11, 50)
len(all_utterances) = 11
len(all_utterances[0].shape = (None, 50)
response_ph.shape = (None, 50)
response_embeddings.shape = (None, 50, 200)
0 response_embeddings.shape = (None, 50, 200)
0 response_embeddings.shape = (None, 50, 200)
1 response_embeddings.shape = (None, 50, 200)
1 response_embeddings.shape = (None, 50, 200)
2 response_embeddings.shape = (None, 50, 200)
2 response_embeddings.shape = (None, 50, 200)
gt_response_embeddings.shape = (None, 50, 200)
narrative_embeddings.shape = (None, 50, 200)
utterance_embeddings.shape = (None, 50, 200)
final u_a_r.shape = (None, 50, 200, 8)
sim_ur.shape = (None, 50, 50, 8)
Hn_stack_tensor.shape = (None, 50, 200, 4)
utterance_embeddings.shape = (None, 50, 200)
final u_a_r.shape = (None, 50, 200, 8)
sim_ur.shape = (None, 50, 50, 8)
Hn_stack_tensor.shape = (None, 50, 200, 4)
utterance_embeddings.shape = (None, 50, 200)
final u_a_r.shape = (None, 50, 200, 8)
sim_ur.shape = (None, 50, 50, 8)


7604 ops no flops stats due to incomplete shapes.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.



-max_depth                  10000
-min_bytes                  0
-min_peak_bytes             0
-min_residual_bytes         0
-min_output_bytes           0
-min_micros                 0
-min_accelerator_micros     0
-min_cpu_micros             0
-min_params                 0
-min_float_ops              1
-min_occurrence             0
-step                       -1
-order_by                   float_ops
-account_type_regexes       .*
-start_name_regexes         .*
-trim_name_regexes          
-show_name_regexes          .*
-hide_name_regexes          
-account_displayed_op_only  true
-select                     float_ops
-output                     stdout:


Doc:
scope: The nodes in the model graph are organized by their names, which is hierarchical like filesystem.
flops: Number of float operations. Note: Please read the implementation for the math behind it.

Profile:
node name | # float_ops
_TFProfRoot (--/6.28m flops)
  word_embeddings_v/Initializer/random_uniform/mul (2.38m/2.38m flo

7604 ops no flops stats due to incomplete shapes.



-max_depth                  10000
-min_bytes                  0
-min_peak_bytes             0
-min_residual_bytes         0
-min_output_bytes           0
-min_micros                 0
-min_accelerator_micros     0
-min_cpu_micros             0
-min_params                 0
-min_float_ops              0
-min_occurrence             0
-step                       -1
-order_by                   name
-account_type_regexes       _trainable_variables
-start_name_regexes         .*
-trim_name_regexes          
-show_name_regexes          .*
-hide_name_regexes          
-account_displayed_op_only  true
-select                     params
-output                     stdout:


Doc:
scope: The nodes in the model graph are organized by their names, which is hierarchical like filesystem.
param: Number of parameters (in the Variable).

Profile:
node name | # parameters
_TFProfRoototal parameters: 3926472


Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.
Incomplete shape.


t (--/3.93m params)
  cnn_aggregation (--/46.21k params)
    cnn_aggregation/conv1 (--/6.94k params)
      cnn_aggregation/conv1/bias (32, 32/32 params)
      cnn_aggregation/conv1/kernel (3x3x3x8x32, 6.91k/6.91k params)
    cnn_aggregation/conv2 (--/27.68k params)
      cnn_aggregation/conv2/bias (32, 32/32 params)
      cnn_aggregation/conv2/kernel (3x3x3x32x32, 27.65k/27.65k params)
    cnn_aggregation/conv2d (--/2.34k params)
      cnn_aggregation/conv2d/bias (32, 32/32 params)
      cnn_aggregation/conv2d/kernel (3x3x8x32, 2.30k/2.30k params)
    cnn_aggregation/conv2d2 (--/9.25k params)
      cnn_aggregation/conv2d2/bias (32, 32/32 params)
      cnn_aggregation/conv2d2/kernel (3x3x32x32, 9.22k/9.22k params)
  dense (--/40.20k params)
    dense/bias (200, 200/200 params)
    dense/kernel (200x200, 40.00k/40.00k params)
  dense_1 (--/51 params)
    dense_1/bias (1, 1/1 params)
    dense_1/kernel (50x1, 50/50 params)
  dense_2 (--/51 params)
    dense_2/bias (1, 1/1 params)
    dens

        response_attention_utterance_3/multihead_attention/dense/bias (200, 200/200 params)
        response_attention_utterance_3/multihead_attention/dense/kernel (200x200, 40.00k/40.00k params)
      response_attention_utterance_3/multihead_attention/dense_1 (--/40.20k params)
        response_attention_utterance_3/multihead_attention/dense_1/bias (200, 200/200 params)
        response_attention_utterance_3/multihead_attention/dense_1/kernel (200x200, 40.00k/40.00k params)
      response_attention_utterance_3/multihead_attention/dense_2 (--/40.20k params)
        response_attention_utterance_3/multihead_attention/dense_2/bias (200, 200/200 params)
        response_attention_utterance_3/multihead_attention/dense_2/kernel (200x200, 40.00k/40.00k params)
      response_attention_utterance_3/multihead_attention/multihead_attention (--/400 params)
        response_attention_utterance_3/multihead_attention/multihead_attention/beta (200, 200/200 params)
        response_attention_utterance_

In [23]:
# Experiment: how the `axis` argument of tf.nn.l2_normalize changes the pairwise
# product computed with the same einsum pattern as the model's similarity step.
np.random.seed(42)
n_word, n_embedded, n_layer = 3, 200, 4
a = tf.constant(np.random.rand(n_word, n_embedded, n_layer))
b = tf.constant(np.random.rand(n_word, n_embedded, n_layer))

print(a.shape)

# Case 1 - no axis: the whole tensor is scaled by one global norm, so the products
# below are NOT cosine similarities.
a_norm, b_norm = tf.nn.l2_normalize(a), tf.nn.l2_normalize(b)
sim = tf.einsum('iks,jks->ijs', a_norm, b_norm)
print(sim.shape)
print(sim)

# Case 2 - axis=-2 (the embedding axis): every embedding vector has unit length, so
# the products are cosine similarities.
a_norm, b_norm = (tf.nn.l2_normalize(t, axis=-2) for t in (a, b))
sim = tf.einsum('iks,jks->ijs', a_norm, b_norm)
print(sim)


(3, 200, 4)
(3, 3, 4)
tf.Tensor(
[[[0.06010767 0.06378155 0.05901452 0.06585608]
  [0.06380126 0.06155514 0.05672619 0.06431459]
  [0.05930824 0.06996205 0.05641049 0.06446883]]

 [[0.06605667 0.06170718 0.06172836 0.06438432]
  [0.06272271 0.05952186 0.0580241  0.0619105 ]
  [0.06559977 0.06484434 0.06050366 0.0632573 ]]

 [[0.06482191 0.06460671 0.0640206  0.06081453]
  [0.06481846 0.06164986 0.06160087 0.06007434]
  [0.06385889 0.06763946 0.05821728 0.05710631]]], shape=(3, 3, 4), dtype=float64)
tf.Tensor(
[[[0.7240082  0.74178509 0.74320613 0.74550174]
  [0.77379084 0.7383667  0.71035131 0.75591322]
  [0.73729438 0.78331064 0.73551692 0.76322202]]

 [[0.74706631 0.73788542 0.7534658  0.76420142]
  [0.71424635 0.73409873 0.70424917 0.76295987]
  [0.76569717 0.74647238 0.76461505 0.7852115 ]]

 [[0.75946848 0.76109677 0.78030435 0.72576216]
  [0.76465834 0.74906433 0.74656952 0.7443645 ]
  [0.77218547 0.76709798 0.73464672 0.71272073]]], shape=(3, 3, 4), dtype=float64)


In [27]:
# Small worked examples of tf.nn.l2_normalize with and without an explicit axis.
# Each case: (input values, axis or None for the default, print the shape first?)
l2_cases = [
    ([1.1, 1.1], -1, False),
    ([[1.0, 1.0], [1.0, 1.0]], None, False),
    ([[1.1, 1.1], [1.1, 1.1], [2.0, 1.0]], -1, False),
    ([[1.1, 1.1], [1.1, 1.1], [2.0, 1.0]], None, False),
    ([[[1.0, 1.0], [2.0, 2.0]], [[2.0, 1.0], [4.0, 2.0]]], -1, True),
]
for values, axis, show_shape in l2_cases:
    a = tf.constant(values)
    if show_shape:
        print(a.shape)
    # axis=None is the function's default, i.e. the same as omitting the argument.
    a_norm = tf.nn.l2_normalize(a, axis=axis)
    print(a_norm.numpy())

[0.7071068 0.7071068]
[[0.5 0.5]
 [0.5 0.5]]
[[0.7071068  0.7071068 ]
 [0.7071068  0.7071068 ]
 [0.8944271  0.44721356]]
[[0.35066718 0.35066718]
 [0.35066718 0.35066718]
 [0.6375767  0.31878835]]
(2, 2, 2)
[[[0.7071067  0.7071067 ]
  [0.7071067  0.7071067 ]]

 [[0.8944271  0.44721356]
  [0.8944271  0.44721356]]]


In [33]:
# tf.reduce_mean over different axes of the same 2x2 tensor:
# all elements (axis=None, the default), per column, per row, and both axes explicitly.
x = tf.constant([[1., 1.], [2., 2.]])
for axis in (None, 0, 1, (0, 1)):
    print(tf.reduce_mean(x, axis=axis).numpy())

1.5
[1.5 1.5]
[1. 2.]
1.5


In [39]:
# KL divergence between two softmax distributions, averaged over the batch and
# then over the layers (same pattern as the KL term in the model's loss section).
np.random.seed(42)
n_batch, n_word, n_embedded, n_layer = 2, 3, 200, 4
q = tf.constant(np.random.rand(n_batch, n_word, n_layer))
p = tf.constant(np.random.rand(n_batch, n_word, n_layer))

# Softmax over the word axis, so each (batch, layer) slice is a distribution.
q_tensor = tf.nn.softmax(q, axis=1)
p_tensor = tf.nn.softmax(p, axis=1)
print(q_tensor)

# Split into one [batch, word] tensor per layer.
q = tf.unstack(q_tensor, n_layer, axis=-1)
p = tf.unstack(p_tensor, n_layer, axis=-1)
print(q)

KL_loss = 0.0
for p_layer, q_layer in zip(p, q):
    kl_per_example = tf.keras.losses.kullback_leibler_divergence(p_layer, q_layer)
    KL_loss += tf.reduce_mean(kl_per_example)
    print(kl_per_example)
KL_loss /= (n_layer)

tf.Tensor(
[[[0.32701082 0.44717513 0.49983424 0.26622196]
  [0.26282063 0.20199249 0.25477128 0.34787669]
  [0.41016855 0.35083238 0.24539449 0.38590135]]

 [[0.41811075 0.30335483 0.2940472  0.30169411]
  [0.24654535 0.41460262 0.37760941 0.33604271]
  [0.33534391 0.28204255 0.32834339 0.36226318]]], shape=(2, 3, 4), dtype=float64)
[<tf.Tensor: shape=(2, 3), dtype=float64, numpy=
array([[0.32701082, 0.26282063, 0.41016855],
       [0.41811075, 0.24654535, 0.33534391]])>, <tf.Tensor: shape=(2, 3), dtype=float64, numpy=
array([[0.44717513, 0.20199249, 0.35083238],
       [0.30335483, 0.41460262, 0.28204255]])>, <tf.Tensor: shape=(2, 3), dtype=float64, numpy=
array([[0.49983424, 0.25477128, 0.24539449],
       [0.2940472 , 0.37760941, 0.32834339]])>, <tf.Tensor: shape=(2, 3), dtype=float64, numpy=
array([[0.26622196, 0.34787669, 0.38590135],
       [0.30169411, 0.33604271, 0.36226318]])>]
tf.Tensor([0.07625552 0.00970438], shape=(2,), dtype=float64)
tf.Tensor([0.01844393 0.04050933], sh

In [2]:
# reuse=True test: two conv2d layers created under the same variable scope share
# one kernel/bias pair when the second scope is opened with reuse=True.
# The graph is built inside its own tf.Graph, so the cell can be re-run without
# restarting the kernel (building in the global default graph made the second run
# fail because the variables already existed).
import tensorflow as tf

with tf.Graph().as_default() as graph, tf.compat.v1.Session(graph=graph) as sess:

    inputs_1 = tf.compat.v1.placeholder(tf.float32, (None, 512, 512, 3), name='inputs_1')
    inputs_2 = tf.compat.v1.placeholder(tf.float32, (None, 512, 512, 3), name='inputs_2')

    # First use creates conv/conv_1/kernel and conv/conv_1/bias.
    with tf.compat.v1.variable_scope('conv'):
        out_1 = tf.compat.v1.layers.conv2d(inputs_1, 32, [3, 3], name='conv_1')

    # Second use with reuse=True picks up the existing variables instead of creating new ones.
    with tf.compat.v1.variable_scope('conv', reuse=True):
        out_2 = tf.compat.v1.layers.conv2d(inputs_2, 32, [3, 3], name='conv_1')

    init = tf.compat.v1.global_variables_initializer()

    sess.run(init)
    # Expect exactly one kernel and one bias in the list.
    print(tf.compat.v1.trainable_variables())

2022-08-23 10:29:37.487947: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[<tf.Variable 'conv/conv_1/kernel:0' shape=(3, 3, 3, 32) dtype=float32>, <tf.Variable 'conv/conv_1/bias:0' shape=(32,) dtype=float32>]


2022-08-23 10:29:38.304030: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2022-08-23 10:29:38.304091: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1303 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:65:00.0, compute capability: 8.6


In [7]:
# Launch training with eta = 0.7, logging to ./model/cpre/all_log.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
eta = 0.7
save_path = "./model/cpre/"
result_path = "./output/cpre/"
log_path = "./model/cpre/all_log"

# Create the output directories BEFORE configuring logging: the log file lives inside
# save_path, so opening it fails if the directory does not exist yet. makedirs also
# creates missing parent directories, which os.mkdir does not.
for directory in (save_path, result_path):
    os.makedirs(directory, exist_ok=True)

logging.basicConfig(filename=log_path, level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("Current Eta: %.2f", eta)
# `train` is not defined in this part of the notebook; it must already be in scope.
train(eta=eta, logger=logger)

2022-08-12 16:44:52.974466: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-12 16:44:53.520605: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21469 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:65:00.0, compute capability: 8.6


Instructions for updating:
This is a deprecated API that should only be used in TF 1 graph mode and legacy TF 2 graph mode available through `tf.compat.v1`. In all other situations -- namely, eager mode and inside `tf.function` -- you can consume dataset elements using `for elem in dataset: ...` or by explicitly creating iterator via `iterator = iter(dataset)` and fetching its elements via `values = next(iterator)`. Furthermore, this API is not available in TF 2. During the transition from TF 1 to TF 2 you can use `tf.compat.v1.data.make_initializable_iterator(dataset)` to create a TF 1 graph mode style iterator for a dataset created through TF 2 APIs. Note that this should be a transient state of your code base as there are in general no guarantees about the interoperability of TF 1 and TF 2 code.
current eta:  0.7





Epoch  1 / 4


  0%|                                                                    | 0/246578 [00:00<?, ?it/s]2022-08-12 16:47:54.177301: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-08-12 16:47:54.648735: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8200
 10%|██▏                   | 25000/246578 [13:59<2:04:02, 29.77it/s, learning_rate=0.001, loss=0.27]


IndexError: list index out of range

In [9]:
def evaluate(model_path, eval_file, output_path, eta):
    """Score every example of ``eval_file`` with a restored model and compute ranking metrics.

    Args:
        model_path: Checkpoint location handed to ``ScriptWriter_cpre.load``.
        eval_file: Pickle file holding the 5-tuple
            ``(utterance, response, narrative, gt_response, y_true)``.
        output_path: Directory that receives ``test.result.micro_session.txt``
            (one candidate score per line).
        eta: Forwarded to the ``ScriptWriter_cpre`` constructor.

    Returns:
        A 3-tuple ``(metrics, mean_loss, scores)`` where ``metrics`` is whatever
        ``Evaluate.evaluate_all`` returns, ``mean_loss`` is the loss averaged over
        batches and ``scores`` is the list of all candidate scores.

    Raises:
        ValueError: If the evaluation data yields no batch at all.
    """
    # NOTE(security): pickle.load can execute arbitrary code -- only use trusted files.
    with open(eval_file, 'rb') as f:
        utterance, response, narrative, gt_response, y_true = pickle.load(f)
    with open(embedding_file, 'rb') as f:
        embeddings = pickle.load(f)

    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True

    # Fed to learning_rate_ph below; only y_pred and loss are fetched, no training op is run.
    current_lr = 1e-3
    all_candidate_scores = []
    test_loss = 0.0
    step = 0

    try:
        with tf.compat.v1.Session(config=config):
            dataset = tf.compat.v1.data.Dataset.from_tensor_slices(
                (utterance, narrative, response, gt_response, y_true)
            ).batch(eval_batch_size)
            # Non-deprecated spelling of dataset.make_initializable_iterator().
            iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
            next_batch = iterator.get_next()

            model = ScriptWriter_cpre(eta)
            model.build()
            # model.load returns the session used for all subsequent run() calls.
            # It gets its own name and is closed explicitly so it cannot leak.
            restored_sess = model.load(model_path)
            try:
                restored_sess.run(iterator.initializer)
                restored_sess.run(model.embedding_init, feed_dict={model.embedding_ph: embeddings})

                with tqdm(total=len(y_true), ncols=100) as pbar:
                    while True:
                        try:
                            bu, bn, br, bgtr, by = restored_sess.run(next_batch)
                        except tf.errors.OutOfRangeError:
                            # Iterator exhausted: every example has been scored.
                            break
                        candidate_scores, loss = restored_sess.run([model.y_pred, model.loss], feed_dict={
                            model.utterance_ph: bu,
                            model.narrative_ph: bn,
                            model.response_ph: br,
                            model.gt_response_ph: bgtr,
                            model.y_true_ph: by,
                            model.learning_rate_ph: current_lr
                        })
                        all_candidate_scores.append(candidate_scores)
                        test_loss += loss
                        # Advance by the real batch length; the final batch may be
                        # shorter than eval_batch_size.
                        pbar.update(len(by))
                        step += 1
            finally:
                restored_sess.close()
    finally:
        # Always drop the graph (also on failure) so the next call in the same kernel
        # can rebuild the model; done only after the sessions above are closed.
        tf.compat.v1.reset_default_graph()

    if step == 0:
        raise ValueError("no evaluation batches were produced from %r" % (eval_file,))

    all_candidate_scores = np.concatenate(all_candidate_scores, axis=0)
    score_rows = all_candidate_scores.tolist()
    with open(os.path.join(output_path, "test.result.micro_session.txt"), "w") as fw:
        for sc in score_rows:
            fw.write(str(sc) + "\n")
    return Evaluate.evaluate_all(all_candidate_scores, y_true), test_loss / step, score_rows


In [11]:
# Evaluate the saved checkpoint on the test split and report loss plus ranking metrics.
test_metrics, eva_loss, _ = evaluate(save_path, evaluate_file, output_path=result_path, eta=eta)
# acc is part of the returned metrics but is not included in the report line.
acc, r2_1, r10_1, r10_2, r10_5, mrr = test_metrics
report_template = "Loss on test set: %f, R2@1: %f, R10@1: %f, R10@2: %f, R10@5: %f, MRR: %f"
print(report_template % (eva_loss, r2_1, r10_1, r10_2, r10_5, mrr))

2022-08-08 17:35:07.917683: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21469 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:65:00.0, compute capability: 8.6


current eta:  0.7


2022-08-08 17:36:05.857107: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21469 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:65:00.0, compute capability: 8.6


INFO:tensorflow:Restoring parameters from ./model/cpre/model


38400it [02:09, 297.55it/s]                                                                         


Loss on test set: 0.458008, R2@1: 0.736691, R10@1: 0.376566, R10@2: 0.539144, R10@5: 0.817067, MRR: 0.494602


In [3]:
# Setup for a manual, single-batch debugging run of the evaluation pipeline.

# Scalars consumed by the next cell.
eta = 0.7
current_lr = 1e-3
all_candidate_scores = []

# Test split: a pickled 5-tuple of parallel sequences.
with open(evaluate_file, 'rb') as f:
    utterance, response, narrative, gt_response, y_true = pickle.load(f)

# Allow ops without a GPU kernel to fall back, and let GPU memory grow on demand.
config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True

In [4]:
# Single-batch smoke test of the graph: builds the model, initialises its variables from
# scratch (no checkpoint is restored here) and pushes one evaluation batch through it.
# NOTE(review): the recorded run ended in ResourceExhaustedError; its log reports the GPU
# device was created with only 1377 MB -- presumably the GPU was busy elsewhere; verify before re-running.
with open(embedding_file, 'rb') as f:
    embeddings = pickle.load(f)

with tf.compat.v1.Session(config=config) as sess:
    eval_slices = (utterance, narrative, response, gt_response, y_true)
    dataset = tf.compat.v1.data.Dataset.from_tensor_slices(eval_slices).batch(eval_batch_size)
    iterator = dataset.make_initializable_iterator()
    data_iterator = iterator.get_next()

    model = ScriptWriter_cpre(eta)
    model.build()

    # Initialise variables, rewind the iterator, then load the pretrained embedding table.
    sess.run(tf.compat.v1.global_variables_initializer())
    sess.run(iterator.initializer)
    sess.run(model.embedding_init, feed_dict={model.embedding_ph: embeddings})

    test_loss, step = 0.0, 0

    # Materialise exactly one batch and score it.
    bu, bn, br, bgtr, by = sess.run(list(data_iterator))
    batch_feed = {
        model.utterance_ph: bu,
        model.narrative_ph: bn,
        model.response_ph: br,
        model.gt_response_ph: bgtr,
        model.y_true_ph: by,
        model.learning_rate_ph: current_lr,
    }
    candidate_scores, loss = sess.run([model.y_pred, model.loss], feed_dict=batch_feed)

    all_candidate_scores.append(candidate_scores)
    test_loss += loss
    step += 1

2022-08-23 17:46:39.106173: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-23 17:46:40.021151: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1377 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:65:00.0, compute capability: 8.6


Instructions for updating:
This is a deprecated API that should only be used in TF 1 graph mode and legacy TF 2 graph mode available through `tf.compat.v1`. In all other situations -- namely, eager mode and inside `tf.function` -- you can consume dataset elements using `for elem in dataset: ...` or by explicitly creating iterator via `iterator = iter(dataset)` and fetching its elements via `values = next(iterator)`. Furthermore, this API is not available in TF 2. During the transition from TF 1 to TF 2 you can use `tf.compat.v1.data.make_initializable_iterator(dataset)` to create a TF 1 graph mode style iterator for a dataset created through TF 2 APIs. Note that this should be a transient state of your code base as there are in general no guarantees about the interoperability of TF 1 and TF 2 code.
current eta:  0.7
utterance_ph.shape = (None, 11, 50)
len(all_utterances) = 11
len(all_utterances[0].shape = (None, 50)
response_ph.shape = (None, 50)
response_embeddings.shape = (None, 50, 



1 response_embeddings.shape = (None, 50, 200)
1 response_embeddings.shape = (None, 50, 200)
2 response_embeddings.shape = (None, 50, 200)
2 response_embeddings.shape = (None, 50, 200)
gt_response_embeddings.shape = (None, 50, 200)
narrative_embeddings.shape = (None, 50, 200)
utterance_embeddings.shape = (None, 50, 200)
final u_a_r.shape = (None, 50, 200, 8)
Hn_stack_tensor.shape = (None, 50, 200, 4)
utterance_embeddings.shape = (None, 50, 200)
final u_a_r.shape = (None, 50, 200, 8)
Hn_stack_tensor.shape = (None, 50, 200, 4)
utterance_embeddings.shape = (None, 50, 200)
final u_a_r.shape = (None, 50, 200, 8)
Hn_stack_tensor.shape = (None, 50, 200, 4)
utterance_embeddings.shape = (None, 50, 200)
final u_a_r.shape = (None, 50, 200, 8)
Hn_stack_tensor.shape = (None, 50, 200, 4)
utterance_embeddings.shape = (None, 50, 200)
final u_a_r.shape = (None, 50, 200, 8)
Hn_stack_tensor.shape = (None, 50, 200, 4)
utterance_embeddings.shape = (None, 50, 200)
final u_a_r.shape = (None, 50, 200, 8)
Hn_st



mur conv3d = Tensor("cnn_aggregation/conv1/Elu:0", shape=(None, 11, 50, 50, 32), dtype=float32)
mur conv3d2 = Tensor("cnn_aggregation/conv2/Elu:0", shape=(None, 4, 17, 17, 32), dtype=float32)
mun conv3d = Tensor("cnn_aggregation_1/conv1/Elu:0", shape=(None, 11, 50, 50, 32), dtype=float32)
mun conv3d2 = Tensor("cnn_aggregation_1/conv2/Elu:0", shape=(None, 4, 17, 17, 32), dtype=float32)
mrn conv2d = Tensor("cnn_aggregation_2/conv2d/Elu:0", shape=(None, 50, 50, 32), dtype=float32)
mrn conv2d2 = Tensor("cnn_aggregation_2/conv2d2/Elu:0", shape=(None, 17, 17, 32), dtype=float32)


2022-08-23 17:48:05.902012: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-08-23 17:48:06.820826: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8200
2022-08-23 17:48:08.026920: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.90GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-08-23 17:48:08.084116: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.90GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-08-23 17:48:08.129985: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of 

2022-08-23 17:48:28.560898: W tensorflow/core/common_runtime/bfc_allocator.cc:457] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.81MiB (rounded to 4000000)requested by op narrative_attention_response_3_5/multihead_attention/dense_1/Tensordot/MatMul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-08-23 17:48:28.560987: I tensorflow/core/common_runtime/bfc_allocator.cc:1004] BFCAllocator dump for GPU_0_bfc
2022-08-23 17:48:28.561013: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (256): 	Total Chunks: 134, Chunks in use: 132. 33.5KiB allocated for chunks. 33.0KiB in use in bin. 1.4KiB client-requested in use in bin.
2022-08-23 17:48:28.561029: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (512): 	Total Chunks: 3, Chunks in use: 1. 1.5KiB allocated for chunks. 512B in use in bin. 400B cl

2022-08-23 17:48:38.604198: W tensorflow/core/common_runtime/bfc_allocator.cc:457] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.81MiB (rounded to 4000000)requested by op narrative_attention_response_3_5/multihead_attention/dense_2/Tensordot/MatMul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-08-23 17:48:38.604298: I tensorflow/core/common_runtime/bfc_allocator.cc:1004] BFCAllocator dump for GPU_0_bfc
2022-08-23 17:48:38.604322: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (256): 	Total Chunks: 134, Chunks in use: 132. 33.5KiB allocated for chunks. 33.0KiB in use in bin. 1.4KiB client-requested in use in bin.
2022-08-23 17:48:38.604336: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (512): 	Total Chunks: 3, Chunks in use: 1. 1.5KiB allocated for chunks. 512B in use in bin. 400B cl

2022-08-23 17:48:48.656217: W tensorflow/core/common_runtime/bfc_allocator.cc:457] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.81MiB (rounded to 4000000)requested by op narrative_attention_response_3_1/multihead_attention/dense_2/Tensordot/MatMul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-08-23 17:48:48.656323: I tensorflow/core/common_runtime/bfc_allocator.cc:1004] BFCAllocator dump for GPU_0_bfc
2022-08-23 17:48:48.656386: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (256): 	Total Chunks: 134, Chunks in use: 132. 33.5KiB allocated for chunks. 33.0KiB in use in bin. 1.4KiB client-requested in use in bin.
2022-08-23 17:48:48.656406: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (512): 	Total Chunks: 3, Chunks in use: 1. 1.5KiB allocated for chunks. 512B in use in bin. 400B cl

ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted: OOM when allocating tensor with shape[5000,200] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node response_attention_narrative_3_5/multihead_attention/dense/Tensordot/MatMul (defined at home/kotech/workspace/ScriptWriter/modules.py:56) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

	 [[Sigmoid/_871]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

  (1) Resource exhausted: OOM when allocating tensor with shape[5000,200] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node response_attention_narrative_3_5/multihead_attention/dense/Tensordot/MatMul (defined at home/kotech/workspace/ScriptWriter/modules.py:56) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

0 successful operations.
0 derived errors ignored.

Original stack trace for 'response_attention_narrative_3_5/multihead_attention/dense/Tensordot/MatMul':
  File "home/kotech/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "home/kotech/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 677, in start
    self.io_loop.start()
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "home/kotech/.pyenv/versions/3.8.9/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
    self._run_once()
  File "home/kotech/.pyenv/versions/3.8.9/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
    handle._run()
  File "home/kotech/.pyenv/versions/3.8.9/lib/python3.8/asyncio/events.py", line 81, in _run
    self._context.run(self._callback, *self._args)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 457, in dispatch_queue
    await self.process_one()
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 446, in process_one
    await dispatch(*args)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 353, in dispatch_shell
    await result
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 648, in execute_request
    reply_content = await reply_content
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 345, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 532, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2898, in run_cell
    result = self._run_cell(
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2944, in _run_cell
    return runner(coro)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3169, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3361, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "tmp/ipykernel_32009/2798287037.py", line 10, in <module>
    model.build()
  File "tmp/ipykernel_32009/3570724548.py", line 127, in build
    u_a_n, alpha_1 = multihead_attention(queries=Hu_stack[i], keys=Hn_stack[i], num_units=self.hidden_units, num_heads=self.num_heads, is_training=self.is_training, causality=False, dropout_rate=self.dropout_rate)
  File "home/kotech/workspace/ScriptWriter/modules.py", line 56, in multihead_attention
    Q = tf.compat.v1.layers.dense(queries, num_units, activation=tf.compat.v1.nn.relu)  # (N, T_q, C)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/keras/legacy_tf_layers/core.py", line 253, in dense
    return layer.apply(inputs)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/keras/engine/base_layer_v1.py", line 1679, in apply
    return self.__call__(inputs, *args, **kwargs)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/keras/legacy_tf_layers/base.py", line 567, in __call__
    outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/keras/engine/base_layer_v1.py", line 765, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 692, in wrapper
    return converted_call(f, args, kwargs, options=options)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 336, in converted_call
    return _call_unconverted(f, args, kwargs, options, False)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 463, in _call_unconverted
    return f(*args, **kwargs)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/keras/layers/core.py", line 1232, in call
    outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]])
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py", line 206, in wrapper
    return target(*args, **kwargs)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/ops/math_ops.py", line 5042, in tensordot
    ab_matmul = matmul(a_reshape, b_reshape)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py", line 206, in wrapper
    return target(*args, **kwargs)
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/ops/math_ops.py", line 3654, in matmul
    return gen_math_ops.mat_mul(
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/ops/gen_math_ops.py", line 5712, in mat_mul
    _, _, _op, _outputs = _op_def_library._apply_op_helper(
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/framework/op_def_library.py", line 748, in _apply_op_helper
    op = g._create_op_internal(op_type_name, inputs, dtypes=None,
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 3561, in _create_op_internal
    ret = Operation(
  File "home/kotech/venv-tensor2/lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 2045, in __init__
    self._traceback = tf_stack.extract_stack_for_node(self._c_op)


## variable scope 확인

In [1]:
#https://www.tensorflow.org/api_docs/python/tf/compat/v1/variable_scope
import tensorflow as tf
import tensorflow

# Experiment: how tf.compat.v1.variable_scope(reuse=...) treats plain Variable objects
# compared with the variables owned by tf.compat.v1.layers.dense.
tf.compat.v1.disable_eager_execution()

with tf.compat.v1.variable_scope('some_scope1'):
    with tf.compat.v1.variable_scope('sub_scope', reuse=None):
        a = tf.compat.v1.Variable(1, name='a')
        b = tf.compat.v1.Variable(2, name='b')
        c = tf.compat.v1.Variable(3, name='c')
        x = tf.keras.layers.Input((5, 3))
        y = tf.compat.v1.layers.dense(x, units=3)

# Re-enter the same scope name, this time with reuse=True.
with tf.compat.v1.variable_scope('some_scope1', reuse=True):
    with tf.compat.v1.variable_scope('sub_scope', reuse=None):
        # Plain Variables are NOT reused: they are created again under a
        # uniquified scope name (some_scope1_1/...).
        a2 = tf.compat.v1.Variable(1, name='a')
        b2 = tf.compat.v1.Variable(2, name='b')
        c2 = tf.compat.v1.Variable(3, name='c')
        x2 = tf.keras.layers.Input((5, 3))
        # The dense layer's kernel/bias ARE reused: no second dense/* pair is created.
        # Apply the layer to the new input x2 (x2 was previously created but left unused).
        y2 = tf.compat.v1.layers.dense(x2, units=3)

# tf.Variable (without compat.v1) also picks up the enclosing scope name.
with tf.compat.v1.variable_scope('some_scope2'):
    d = tf.Variable(4, name='d')
    e = tf.Variable(5, name='e')
    f = tf.Variable(6, name='f')

h = tf.Variable(8, name='h')

# Variables whose name matches the 'some_scope1' prefix
# (swap in GraphKeys.TRAINABLE_VARIABLES to list only the trainable ones).
for scoped_var in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='some_scope1'):
    print(scoped_var.name)
print()
# Every global variable in the graph.
for v in tf.compat.v1.global_variables():
    print(v.name)

some_scope1/sub_scope/a:0
some_scope1/sub_scope/b:0
some_scope1/sub_scope/c:0
some_scope1/sub_scope/dense/kernel:0
some_scope1/sub_scope/dense/bias:0
some_scope1_1/sub_scope/a:0
some_scope1_1/sub_scope/b:0
some_scope1_1/sub_scope/c:0

some_scope1/sub_scope/a:0
some_scope1/sub_scope/b:0
some_scope1/sub_scope/c:0
some_scope1/sub_scope/dense/kernel:0
some_scope1/sub_scope/dense/bias:0
some_scope1_1/sub_scope/a:0
some_scope1_1/sub_scope/b:0
some_scope1_1/sub_scope/c:0
some_scope2/d:0
some_scope2/e:0
some_scope2/f:0
h:0


